├── .github
│   └── workflows
│       ├── build_and_test.yml
│       ├── check_jupyterbook.yml
│       ├── codecov.yml
│       ├── deploy_jupyterbook.yml
│       └── linting.yml
├── .gitignore
├── LICENSE
├── PSL_catalog.json
├── README.md
├── ROADMAP.md
├── codecov.yml
├── docs
│   ├── _config.yml
│   ├── _toc.yml
│   ├── agg.ipynb
│   ├── charts.ipynb
│   ├── custom_taxes.ipynb
│   ├── demo.ipynb
│   ├── examples.md
│   ├── gini.ipynb
│   ├── home.md
│   ├── income_measures.ipynb
│   ├── microdf_logo.png
│   └── weighting.ipynb
├── environment.yml
├── microdf
│   ├── __init__.py
│   ├── _optional.py
│   ├── agg.py
│   ├── chart_utils.py
│   ├── charts.py
│   ├── concat.py
│   ├── constants.py
│   ├── custom_taxes.py
│   ├── generic.py
│   ├── income_measures.py
│   ├── inequality.py
│   ├── io.py
│   ├── poverty.py
│   ├── style.py
│   ├── tax.py
│   ├── taxcalc.py
│   ├── tests
│   │   ├── __pycache__
│   │   │   └── .vscode
│   │   │       └── settings.json
│   │   ├── conftest.py
│   │   ├── test_compare.py
│   │   ├── test_generic.py
│   │   ├── test_inequality.py
│   │   ├── test_io.py
│   │   ├── test_optional_dependency.py
│   │   ├── test_percentile_actual.csv
│   │   ├── test_percentile_expected.csv
│   │   ├── test_poverty.py
│   │   ├── test_quantile_chg.py
│   │   ├── test_tax.py
│   │   ├── test_taxcalc.py
│   │   ├── test_utils.py
│   │   └── test_weighted.py
│   ├── ubi.py
│   ├── utils.py
│   └── weighted.py
└── setup.py
/.github/workflows/build_and_test.yml:
--------------------------------------------------------------------------------
1 | name: Build and test [Python 3.9, 3.10, 3.11]
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-latest
8 | strategy:
9 | matrix:
10 | python-version: ["3.9", "3.10", "3.11"]
11 |
12 | steps:
13 | - name: Checkout
14 | uses: actions/checkout@v2
15 | with:
16 | persist-credentials: false
17 |
18 | - name: Setup Miniconda using Python ${{ matrix.python-version }}
19 | uses: conda-incubator/setup-miniconda@v2
20 | with:
21 | activate-environment: microdf
22 | environment-file: environment.yml
23 | python-version: ${{ matrix.python-version }}
24 | auto-activate-base: false
25 |
26 | - name: Build
27 | shell: bash -l {0}
28 | run: pip install -e .
29 |
30 | - name: Test
31 | shell: bash -l {0}
32 | run: pytest
33 |
--------------------------------------------------------------------------------
/.github/workflows/check_jupyterbook.yml:
--------------------------------------------------------------------------------
1 | name: Test that Jupyter-Book builds
2 | on: [push, pull_request]
3 | jobs:
4 | build:
5 | if: github.repository == 'PSLmodels/microdf'
6 | runs-on: ubuntu-latest
7 | steps:
8 | - name: Checkout
9 | uses: actions/checkout@v2
10 | with:
11 | persist-credentials: false
12 |
13 | - name: Setup Miniconda
14 | uses: conda-incubator/setup-miniconda@v2
15 | with:
16 | activate-environment: microdf
17 | environment-file: environment.yml
18 | python-version: 3.9
19 | auto-activate-base: false
20 |
21 | - name: Build # Build Jupyter Book
22 | shell: bash -l {0}
23 | run: |
24 | pip install -e .
25 | jb build docs/.
26 |
--------------------------------------------------------------------------------
/.github/workflows/codecov.yml:
--------------------------------------------------------------------------------
1 | name: CodeCov
2 | on: [push, pull_request]
3 | jobs:
4 | run:
5 | runs-on: ubuntu-latest
6 | env:
7 | OS: ubuntu-latest
8 | PYTHON: '3.9'
9 | steps:
10 | - uses: actions/checkout@v2
11 | with:
12 | fetch-depth: '2'
13 |
14 | - name: Setup Python
15 | uses: actions/setup-python@master
16 | with:
17 | python-version: 3.9
18 | - name: Generate Report
19 | run: |
20 | pip install coverage
21 | coverage run -m pytest
22 | - name: Upload Coverage to Codecov
23 | uses: codecov/codecov-action@v1
24 |
--------------------------------------------------------------------------------
/.github/workflows/deploy_jupyterbook.yml:
--------------------------------------------------------------------------------
1 | name: Build and Deploy Jupyter Book
2 | on:
3 | push:
4 | branches:
5 | - master
6 | jobs:
7 | build-and-deploy:
8 | if: github.repository == 'PSLmodels/microdf'
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Checkout
12 | uses: actions/checkout@v2
13 | with:
14 | persist-credentials: false
15 |
16 | - name: Setup Miniconda
17 | uses: conda-incubator/setup-miniconda@v2
18 | with:
19 | activate-environment: microdf
20 | environment-file: environment.yml
21 | python-version: 3.9
22 | auto-activate-base: false
23 |
24 | - name: Build
25 | shell: bash -l {0}
26 | run: |
27 | pip install -e .
28 | jb build docs/.
29 |
30 | - name: Deploy
31 | uses: JamesIves/github-pages-deploy-action@releases/v3
32 | with:
33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
34 | BRANCH: gh-pages # The branch the action should deploy to.
35 | FOLDER: docs/_build/html # The folder the action should deploy.
36 |
--------------------------------------------------------------------------------
/.github/workflows/linting.yml:
--------------------------------------------------------------------------------
1 | name: Lint
2 |
3 | on:
4 | push:
5 | paths:
6 | - '**.py'
7 | pull_request:
8 | paths:
9 | - '**.py'
10 |
11 | jobs:
12 | build:
13 | runs-on: ubuntu-latest
14 | strategy:
15 | matrix:
16 | python-version: [3.9]
17 |
18 | steps:
19 | - name: Checkout
20 | uses: actions/checkout@v2
21 | with:
22 | persist-credentials: false
23 |
24 | - name: Setup Miniconda using Python ${{ matrix.python-version }}
25 | uses: conda-incubator/setup-miniconda@v2
26 | with:
27 | activate-environment: microdf
28 | environment-file: environment.yml
29 | python-version: ${{ matrix.python-version }}
30 | auto-activate-base: false
31 |
32 | - name: Lint
33 | shell: bash -l {0}
34 | run: flake8
35 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled python modules.
2 | *.pyc
3 |
4 | # Setuptools distribution folder.
5 | /dist/
6 |
7 | # Python egg metadata, regenerated from source files by setuptools.
8 | /*.egg-info
9 |
10 | .ipynb_checkpoints
11 |
12 | # Built Jupyter-Book documentation.
13 | docs/_build
14 |
15 | .vscode/settings.json
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Max Ghenis
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/PSL_catalog.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "microdf",
3 | "img": "https://github.com/PSLmodels/microdf/blob/master/docs/microdf_logo.png?raw=true",
4 | "banner_title": "microdf",
5 | "banner_subtitle": "Analysis tools for working with survey microdata as DataFrames",
6 | "detailed_description": "microdf is a Python package for analyzing economic microdata as pandas DataFrames, with special functions for handling sampling weights.",
7 | "policy_area": "Survey data, data analysis",
8 | "geography": "Not specific",
9 | "language": "Python",
10 | "maintainers": [
11 | {
12 | "name": "Max Ghenis",
13 | "image": "https://policyengine.org/static/media/max-ghenis.536762d4b2439bf591f5.png",
14 | "link": "mailto:max@policyengine.org"
15 | }
16 | ],
17 | "links": {
18 | "code_repository": "https://github.com/PSLmodels/microdf",
19 | "user_documentation": "http://pslmodels.github.io/microdf/",
20 | "contributor_documentation": "",
21 | "webapp": "",
22 | "recent_changes": "https://github.com/PSLmodels/microdf/releases"
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build and test](https://github.com/PSLmodels/microdf/actions/workflows/build_and_test.yml/badge.svg)](https://github.com/PSLmodels/microdf/actions?query=workflow%3A%22Build+and+test+%5BPython+3.9%2C+3.10%2C+3.11%5D%22)
2 | [![Codecov](https://codecov.io/gh/PSLmodels/microdf/branch/master/graph/badge.svg)](https://codecov.io/gh/PSLmodels/microdf)
3 |
4 | # microdf
5 | Analysis tools for working with survey microdata as DataFrames.
6 |
7 | *Disclaimer: `MicroSeries` and `MicroDataFrame` are experimental features and may not consider weights after performing some operations. See open issues.*
8 |
9 | ## Installation
10 | Install with:
11 |
12 | pip install git+https://github.com/PSLmodels/microdf.git
13 |
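## Usage
A minimal sketch of the weighted-statistics functions demonstrated in the docs notebooks (`weighted_sum`, `weighted_mean`, and `gini`); the column names `x` and `w` are illustrative:

    import pandas as pd
    import microdf as mdf

    # Toy microdata: x is a value column, w is its sampling weight.
    df = pd.DataFrame({'x': [-10, -1, 0, 5, 100], 'w': [1, 2, 3, 4, 5]})

    mdf.weighted_sum(df, 'x', 'w')   # Weighted total of x.
    mdf.weighted_mean(df, 'x', 'w')  # Weighted mean of x.
    mdf.gini(df, 'x', 'w')           # Gini index of x, weighted by w.
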
14 | ## Questions
15 | Contact the maintainer, Max Ghenis (mghenis@gmail.com).
16 |
17 | ## Citation
18 | You may cite the source of your analysis as "microdf release #.#.#, author's calculations."
19 |
--------------------------------------------------------------------------------
/ROADMAP.md:
--------------------------------------------------------------------------------
1 | # `microdf` roadmap
2 |
3 | `microdf` currently provides capabilities for analyzing weighted microdata, including statistics, distributional tables, graphs, and special functions for working with PSL Tax-Calculator. In the future, it will provide more functionality, including:
4 | * Charts showing distributional changes between a baseline and reform policy
5 | * Extending these charts to more than one reform
6 | * Presets for working with common datasets, e.g. suggesting the appropriate weight for SCF and CPS
7 | * Standard error calculations for surveys with replicate weight files
8 |
9 | See the [issues page](https://github.com/PSLmodels/microdf/issues) to view and suggest other items.
10 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PolicyEngine/microdf/ccf2e54e559ce7563ca9c19b144ab8d41986e1fb/codecov.yml
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | # Book settings
2 | title: microdf documentation
3 | author: Max Ghenis
4 | logo: microdf_logo.png
5 |
6 | launch_buttons:
7 | colab_url: "https://colab.research.google.com"
8 |
9 | repository:
10 | url: https://github.com/PSLmodels/microdf
11 | branch: master
12 | path_to_book: docs
13 |
14 | html:
15 | use_edit_page_button : true
16 | use_repository_button : true
17 | use_issues_button : true
18 |
--------------------------------------------------------------------------------
/docs/_toc.yml:
--------------------------------------------------------------------------------
1 | format: jb-article
2 | root: home
3 | sections:
4 | - file: examples
5 | sections:
6 | - file: agg
7 | - file: charts
8 | - file: custom_taxes
9 | - file: demo
10 | - file: gini
11 | - file: income_measures
12 | - file: weighting
13 |
--------------------------------------------------------------------------------
/docs/agg.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# The `agg` function\n",
8 | "\n",
9 | "Use `agg` to see the effect of a $10,000 UBI by marital status.\n",
10 | "\n",
11 | "## Setup"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import numpy as np\n",
21 | "import pandas as pd\n",
22 | "\n",
23 | "import taxcalc as tc\n",
24 | "import microdf as mdf"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [
32 | {
33 | "data": {
34 | "text/plain": [
35 | "'2.3.0'"
36 | ]
37 | },
38 | "execution_count": 2,
39 | "metadata": {},
40 | "output_type": "execute_result"
41 | }
42 | ],
43 | "source": [
44 | "tc.__version__"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "## Load data\n",
52 | "\n",
53 | "Start with a standard `DataFrame`, then add a UBI manually in a reform copy."
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 3,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "base = mdf.calc_df(group_vars=['expanded_income', 'MARS', 'XTOT'],\n",
63 | " metric_vars='aftertax_income')"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 4,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "reform = base.copy(deep=True)\n",
73 | "UBI_PP = 10000\n",
74 | "reform['ubi'] = reform.XTOT * UBI_PP\n",
75 | "reform['aftertax_income'] = reform.aftertax_income + reform.ubi\n",
76 | "mdf.add_weighted_metrics(reform, 'aftertax_income')"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "## `agg`\n",
84 | "\n",
85 | "### Change in aftertax income by marital status."
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 5,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/html": [
96 | "
\n",
97 | "\n",
110 | "
\n",
111 | " \n",
112 | " \n",
113 | " | \n",
114 | " aftertax_income_m_base | \n",
115 | " aftertax_income_m_reform | \n",
116 | " aftertax_income_pctchg | \n",
117 | "
\n",
118 | " \n",
119 | " MARS | \n",
120 | " | \n",
121 | " | \n",
122 | " | \n",
123 | "
\n",
124 | " \n",
125 | " \n",
126 | " \n",
127 | " 1.0 | \n",
128 | " 3.916351e+06 | \n",
129 | " 4.939093e+06 | \n",
130 | " 0.261147 | \n",
131 | "
\n",
132 | " \n",
133 | " 2.0 | \n",
134 | " 7.692072e+06 | \n",
135 | " 9.577865e+06 | \n",
136 | " 0.245161 | \n",
137 | "
\n",
138 | " \n",
139 | " 4.0 | \n",
140 | " 8.531427e+05 | \n",
141 | " 1.275820e+06 | \n",
142 | " 0.495436 | \n",
143 | "
\n",
144 | " \n",
145 | "
\n",
146 | "
"
147 | ],
148 | "text/plain": [
149 | " aftertax_income_m_base aftertax_income_m_reform aftertax_income_pctchg\n",
150 | "MARS \n",
151 | "1.0 3.916351e+06 4.939093e+06 0.261147\n",
152 | "2.0 7.692072e+06 9.577865e+06 0.245161\n",
153 | "4.0 8.531427e+05 1.275820e+06 0.495436"
154 | ]
155 | },
156 | "execution_count": 5,
157 | "metadata": {},
158 | "output_type": "execute_result"
159 | }
160 | ],
161 | "source": [
162 | "mdf.agg(base, reform, 'MARS', 'aftertax_income')"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "### Also sum baseline `expanded_income`"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 6,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "data": {
179 | "text/html": [
180 | "\n",
181 | "\n",
194 | "
\n",
195 | " \n",
196 | " \n",
197 | " | \n",
198 | " aftertax_income_m_base | \n",
199 | " expanded_income | \n",
200 | " aftertax_income_m_reform | \n",
201 | " aftertax_income_pctchg | \n",
202 | "
\n",
203 | " \n",
204 | " MARS | \n",
205 | " | \n",
206 | " | \n",
207 | " | \n",
208 | " | \n",
209 | "
\n",
210 | " \n",
211 | " \n",
212 | " \n",
213 | " 1.0 | \n",
214 | " 3.916351e+06 | \n",
215 | " 1.593936e+10 | \n",
216 | " 4.939093e+06 | \n",
217 | " 0.261147 | \n",
218 | "
\n",
219 | " \n",
220 | " 2.0 | \n",
221 | " 7.692072e+06 | \n",
222 | " 6.242669e+10 | \n",
223 | " 9.577865e+06 | \n",
224 | " 0.245161 | \n",
225 | "
\n",
226 | " \n",
227 | " 4.0 | \n",
228 | " 8.531427e+05 | \n",
229 | " 2.210208e+09 | \n",
230 | " 1.275820e+06 | \n",
231 | " 0.495436 | \n",
232 | "
\n",
233 | " \n",
234 | "
\n",
235 | "
"
236 | ],
237 | "text/plain": [
238 | " aftertax_income_m_base expanded_income aftertax_income_m_reform \\\n",
239 | "MARS \n",
240 | "1.0 3.916351e+06 1.593936e+10 4.939093e+06 \n",
241 | "2.0 7.692072e+06 6.242669e+10 9.577865e+06 \n",
242 | "4.0 8.531427e+05 2.210208e+09 1.275820e+06 \n",
243 | "\n",
244 | " aftertax_income_pctchg \n",
245 | "MARS \n",
246 | "1.0 0.261147 \n",
247 | "2.0 0.245161 \n",
248 | "4.0 0.495436 "
249 | ]
250 | },
251 | "execution_count": 6,
252 | "metadata": {},
253 | "output_type": "execute_result"
254 | }
255 | ],
256 | "source": [
257 | "mdf.agg(base, reform, 'MARS', 'aftertax_income', 'expanded_income')"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {},
263 | "source": [
264 | "### Also sum UBI amount"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 7,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "data": {
274 | "text/html": [
275 | "\n",
276 | "\n",
289 | "
\n",
290 | " \n",
291 | " \n",
292 | " | \n",
293 | " aftertax_income_m_base | \n",
294 | " aftertax_income_m_reform | \n",
295 | " ubi_m | \n",
296 | " aftertax_income_pctchg | \n",
297 | "
\n",
298 | " \n",
299 | " MARS | \n",
300 | " | \n",
301 | " | \n",
302 | " | \n",
303 | " | \n",
304 | "
\n",
305 | " \n",
306 | " \n",
307 | " \n",
308 | " 1.0 | \n",
309 | " 3.916351e+06 | \n",
310 | " 4.939093e+06 | \n",
311 | " 1.022742e+06 | \n",
312 | " 0.261147 | \n",
313 | "
\n",
314 | " \n",
315 | " 2.0 | \n",
316 | " 7.692072e+06 | \n",
317 | " 9.577865e+06 | \n",
318 | " 1.885793e+06 | \n",
319 | " 0.245161 | \n",
320 | "
\n",
321 | " \n",
322 | " 4.0 | \n",
323 | " 8.531427e+05 | \n",
324 | " 1.275820e+06 | \n",
325 | " 4.226775e+05 | \n",
326 | " 0.495436 | \n",
327 | "
\n",
328 | " \n",
329 | "
\n",
330 | "
"
331 | ],
332 | "text/plain": [
333 | " aftertax_income_m_base aftertax_income_m_reform ubi_m \\\n",
334 | "MARS \n",
335 | "1.0 3.916351e+06 4.939093e+06 1.022742e+06 \n",
336 | "2.0 7.692072e+06 9.577865e+06 1.885793e+06 \n",
337 | "4.0 8.531427e+05 1.275820e+06 4.226775e+05 \n",
338 | "\n",
339 | " aftertax_income_pctchg \n",
340 | "MARS \n",
341 | "1.0 0.261147 \n",
342 | "2.0 0.245161 \n",
343 | "4.0 0.495436 "
344 | ]
345 | },
346 | "execution_count": 7,
347 | "metadata": {},
348 | "output_type": "execute_result"
349 | }
350 | ],
351 | "source": [
352 | "mdf.add_weighted_metrics(reform, 'ubi') # Creates ubi_m = ubi * s006 / 1e6.\n",
353 | "\n",
354 | "mdf.agg(base, reform, 'MARS', 'aftertax_income', reform_metrics='ubi_m')"
355 | ]
356 | }
357 | ],
358 | "metadata": {
359 | "kernelspec": {
360 | "display_name": "Python 3",
361 | "language": "python",
362 | "name": "python3"
363 | },
364 | "language_info": {
365 | "codemirror_mode": {
366 | "name": "ipython",
367 | "version": 3
368 | },
369 | "file_extension": ".py",
370 | "mimetype": "text/x-python",
371 | "name": "python",
372 | "nbconvert_exporter": "python",
373 | "pygments_lexer": "ipython3",
374 | "version": "3.7.3"
375 | },
376 | "toc": {
377 | "base_numbering": 1,
378 | "nav_menu": {},
379 | "number_sections": true,
380 | "sideBar": true,
381 | "skip_h1_title": false,
382 | "title_cell": "Table of Contents",
383 | "title_sidebar": "Contents",
384 | "toc_cell": false,
385 | "toc_position": {},
386 | "toc_section_display": true,
387 | "toc_window_display": false
388 | }
389 | },
390 | "nbformat": 4,
391 | "nbformat_minor": 2
392 | }
393 |
--------------------------------------------------------------------------------
/docs/custom_taxes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Custom taxes\n",
8 | "\n",
9 | "## Setup"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "ename": "ModuleNotFoundError",
19 | "evalue": "No module named 'taxcalc'",
20 | "output_type": "error",
21 | "traceback": [
22 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
23 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
24 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtaxcalc\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmicrodf\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mmdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
25 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'taxcalc'"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "import numpy as np\n",
31 | "import pandas as pd\n",
32 | "\n",
33 | "import taxcalc as tc\n",
34 | "import microdf as mdf"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "tc.__version__"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "## Load data\n",
51 | "\n",
52 | "Start with a `DataFrame` with `aftertax_income` and necessary ingredients of `tpc_eci`. "
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "df = mdf.calc_df(group_vars=['expanded_income', 'aftertax_income'] +\n",
62 | " mdf.ECI_REMOVE_COLS,\n",
63 | " metric_vars=['XTOT'])\n",
64 | "df.columns"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "Calculate Tax Policy Center's Expanded Cash Income measure, used for the analysis."
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "df['tpc_eci'] = mdf.tpc_eci(df)"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "Incidence of a VAT per Tax Policy Center."
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "mdf.add_vat(df)\n",
97 | "df.columns"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "df.head() # Note these are zero because we block negative tax liability."
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "df.sample(5)"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "Add carbon tax and financial transaction tax."
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "mdf.add_carbon_tax(df)\n",
132 | "mdf.add_ftt(df)\n",
133 | "df.columns"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "df.sample(5)"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "VAT with a custom amount generated."
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "mdf.add_vat(df, total=500e9, name='vat2')\n",
159 | "df.columns"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "mdf.weighted_sum(df, 'vat', 's006') / 1e9"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": [
177 | "mdf.weighted_sum(df, 'vat2', 's006') / 1e9"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "Calculate by hand using `add_custom_tax`."
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "mdf.add_custom_tax(df, 'tpc_eci', 'XTOT_m', 'aftertax_income', \n",
194 | " mdf.VAT_INCIDENCE, 'vat3', 1e12)"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "mdf.weighted_sum(df, 'vat3', 's006') / 1e9"
204 | ]
205 | }
206 | ],
207 | "metadata": {
208 | "kernelspec": {
209 | "display_name": "Python 3",
210 | "language": "python",
211 | "name": "python3"
212 | },
213 | "language_info": {
214 | "codemirror_mode": {
215 | "name": "ipython",
216 | "version": 3
217 | },
218 | "file_extension": ".py",
219 | "mimetype": "text/x-python",
220 | "name": "python",
221 | "nbconvert_exporter": "python",
222 | "pygments_lexer": "ipython3",
223 | "version": "3.7.9"
224 | },
225 | "toc": {
226 | "base_numbering": 1,
227 | "nav_menu": {},
228 | "number_sections": true,
229 | "sideBar": true,
230 | "skip_h1_title": false,
231 | "title_cell": "Table of Contents",
232 | "title_sidebar": "Contents",
233 | "toc_cell": false,
234 | "toc_position": {},
235 | "toc_section_display": true,
236 | "toc_window_display": false
237 | }
238 | },
239 | "nbformat": 4,
240 | "nbformat_minor": 4
241 | }
242 |
--------------------------------------------------------------------------------
/docs/demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# `microdf` demo"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Setup"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd\n",
25 | "\n",
26 | "import taxcalc as tc\n",
27 | "import microdf as mdf\n",
28 | "\n",
29 | "import matplotlib as mpl\n",
30 | "import matplotlib.pyplot as plt\n",
31 | "import seaborn as sns"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "Chart options."
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 2,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "name": "stderr",
48 | "output_type": "stream",
49 | "text": [
50 | "/home/mghenis/anaconda3/lib/python3.7/site-packages/microdf/style.py:24: MatplotlibDeprecationWarning: \n",
51 | "The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.\n",
52 | " fm.fontManager.ttflist += fm.createFontList([\"Roboto-Regular.ttf\"])\n"
53 | ]
54 | }
55 | ],
56 | "source": [
57 | "mdf.set_plot_style()"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "## Generate data"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "base = mdf.calc_df(group_vars=['expanded_income', 'MARS'],\n",
74 | " metric_vars=['aftertax_income', 'XTOT'])"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 4,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/plain": [
85 | "Index(['e02400', 'mcare_ben', 'aftertax_income', 'ssi_ben', 'expanded_income',\n",
86 | " 'snap_ben', 'vet_ben', 'housing_ben', 's006', 'other_ben', 'e02300',\n",
87 | " 'mcaid_ben', 'XTOT', 'tanf_ben', 'MARS', 'wic_ben', 'market_income',\n",
88 | " 'bens', 'tax', 's006_m', 'aftertax_income_m', 'XTOT_m'],\n",
89 | " dtype='object')"
90 | ]
91 | },
92 | "execution_count": 4,
93 | "metadata": {},
94 | "output_type": "execute_result"
95 | }
96 | ],
97 | "source": [
98 | "base.columns"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "Define a reform that treats capital gains as ordinary income and sets the top marginal rate to 70%."
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 5,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "CG_REFORM = {\n",
115 | " 'CG_nodiff': {2019: True},\n",
116 | " 'II_rt7': {2019: 0.7}\n",
117 | "}"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 6,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "reform = mdf.calc_df(reform=CG_REFORM, group_vars=['MARS'], group_n65=True, \n",
127 | " metric_vars=['aftertax_income', 'XTOT'])"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 7,
133 | "metadata": {},
134 | "outputs": [
135 | {
136 | "data": {
137 | "text/plain": [
138 | "Index(['vet_ben', 's006', 'e02300', 'MARS', 'e02400', 'mcare_ben', 'ssi_ben',\n",
139 | " 'snap_ben', 'housing_ben', 'other_ben', 'aftertax_income',\n",
140 | " 'expanded_income', 'mcaid_ben', 'XTOT', 'tanf_ben', 'wic_ben',\n",
141 | " 'market_income', 'bens', 'tax', 'n65', 's006_m', 'aftertax_income_m',\n",
142 | " 'XTOT_m'],\n",
143 | " dtype='object')"
144 | ]
145 | },
146 | "execution_count": 7,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "reform.columns"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "### Calculate senior UBI.\n",
160 | "\n",
161 | "Start with total revenue ($ billions)."
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 8,
167 | "metadata": {},
168 | "outputs": [
169 | {
170 | "data": {
171 | "text/plain": [
172 | "326.110945495585"
173 | ]
174 | },
175 | "execution_count": 8,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "new_rev_m = base.aftertax_income_m.sum() - reform.aftertax_income_m.sum()\n",
182 | "new_rev_m / 1e3"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "How many seniors are there?"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 9,
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "data": {
199 | "text/plain": [
200 | "59.21619976999999"
201 | ]
202 | },
203 | "execution_count": 9,
204 | "metadata": {},
205 | "output_type": "execute_result"
206 | }
207 | ],
208 | "source": [
209 | "mdf.add_weighted_metrics(reform, 'n65')\n",
210 | "\n",
211 | "n65_total_m = reform.n65_m.sum()\n",
212 | "n65_total_m"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "Divide."
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 10,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "data": {
229 | "text/plain": [
230 | "5507.123840473106"
231 | ]
232 | },
233 | "execution_count": 10,
234 | "metadata": {},
235 | "output_type": "execute_result"
236 | }
237 | ],
238 | "source": [
239 | "senior_ubi = new_rev_m / reform.n65_m.sum()\n",
240 | "senior_ubi"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "### Add senior UBI to `aftertax_income` and recalculate"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 11,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "reform['ubi'] = senior_ubi * reform.n65\n",
257 | "reform['aftertax_income'] = reform.aftertax_income + reform.ubi\n",
258 | "mdf.add_weighted_metrics(reform, 'aftertax_income')"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 12,
264 | "metadata": {},
265 | "outputs": [
266 | {
267 | "data": {
268 | "text/plain": [
269 | "True"
270 | ]
271 | },
272 | "execution_count": 12,
273 | "metadata": {},
274 | "output_type": "execute_result"
275 | }
276 | ],
277 | "source": [
278 | "np.allclose(base.aftertax_income_m.sum(), reform.aftertax_income_m.sum())"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "## Analyze\n",
286 | "\n",
287 | "Gini, FPL, distributional impact chart\n",
288 | "\n",
289 | "### Change to Gini index"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 13,
295 | "metadata": {},
296 | "outputs": [
297 | {
298 | "data": {
299 | "text/plain": [
300 | "0.5032911973267852"
301 | ]
302 | },
303 | "execution_count": 13,
304 | "metadata": {},
305 | "output_type": "execute_result"
306 | }
307 | ],
308 | "source": [
309 | "mdf.gini(base, 'aftertax_income', 's006')"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 14,
315 | "metadata": {},
316 | "outputs": [
317 | {
318 | "data": {
319 | "text/plain": [
320 | "0.48752755152259336"
321 | ]
322 | },
323 | "execution_count": 14,
324 | "metadata": {},
325 | "output_type": "execute_result"
326 | }
327 | ],
328 | "source": [
329 | "mdf.gini(reform, 'aftertax_income', 's006')"
330 | ]
331 | },
332 | {
333 | "cell_type": "markdown",
334 | "metadata": {},
335 | "source": [
336 | "### Change to poverty rate\n",
337 | "\n",
338 | "Add federal poverty line with `mdf.fpl`."
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 15,
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "base['fpl'] = mdf.fpl(base.XTOT)\n",
348 | "reform['fpl'] = mdf.fpl(reform.XTOT)\n",
349 | "\n",
350 | "base['fpl_XTOT_m'] = np.where(base.aftertax_income < base.fpl,\n",
351 | " base.XTOT_m, 0)\n",
352 | "reform['fpl_XTOT_m'] = np.where(reform.aftertax_income < reform.fpl,\n",
353 | " reform.XTOT_m, 0)"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 16,
359 | "metadata": {},
360 | "outputs": [
361 | {
362 | "data": {
363 | "text/plain": [
364 | "-0.022307196800575246"
365 | ]
366 | },
367 | "execution_count": 16,
368 | "metadata": {},
369 | "output_type": "execute_result"
370 | }
371 | ],
372 | "source": [
373 | "reform.fpl_XTOT_m.sum() / base.fpl_XTOT_m.sum() - 1"
374 | ]
375 | }
376 | ],
377 | "metadata": {
378 | "kernelspec": {
379 | "display_name": "Python 3",
380 | "language": "python",
381 | "name": "python3"
382 | },
383 | "language_info": {
384 | "codemirror_mode": {
385 | "name": "ipython",
386 | "version": 3
387 | },
388 | "file_extension": ".py",
389 | "mimetype": "text/x-python",
390 | "name": "python",
391 | "nbconvert_exporter": "python",
392 | "pygments_lexer": "ipython3",
393 | "version": "3.7.9"
394 | },
395 | "toc": {
396 | "base_numbering": 1,
397 | "nav_menu": {},
398 | "number_sections": true,
399 | "sideBar": true,
400 | "skip_h1_title": false,
401 | "title_cell": "Table of Contents",
402 | "title_sidebar": "Contents",
403 | "toc_cell": false,
404 | "toc_position": {},
405 | "toc_section_display": true,
406 | "toc_window_display": false
407 | }
408 | },
409 | "nbformat": 4,
410 | "nbformat_minor": 2
411 | }
--------------------------------------------------------------------------------
/docs/examples.md:
--------------------------------------------------------------------------------
1 | Examples
2 | ========
3 |
4 | See these rendered Jupyter notebooks for examples of `microdf` usage.
--------------------------------------------------------------------------------
/docs/gini.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# `gini` example"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import microdf as mdf\n",
17 | "\n",
18 | "import pandas as pd"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "x = [-10, -1, 0, 5, 100]\n",
28 | "w = [1, 2, 3, 4, 5]\n",
29 | "df = pd.DataFrame({'x': x, 'w': w})"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "## Simple behavior"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 3,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/plain": [
47 | "0.9617021276595745"
48 | ]
49 | },
50 | "execution_count": 3,
51 | "metadata": {},
52 | "output_type": "execute_result"
53 | }
54 | ],
55 | "source": [
56 | "mdf.gini(df, 'x')"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "## Dealing with negatives"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "This will be equivalent to `mdf.gini(pd.DataFrame({'x': [0, 0, 0, 5, 100]}))`."
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "data": {
80 | "text/plain": [
81 | "0.780952380952381"
82 | ]
83 | },
84 | "execution_count": 4,
85 | "metadata": {},
86 | "output_type": "execute_result"
87 | }
88 | ],
89 | "source": [
90 | "mdf.gini(df, 'x', negatives='zero')"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 5,
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "data": {
100 | "text/plain": [
101 | "0.780952380952381"
102 | ]
103 | },
104 | "execution_count": 5,
105 | "metadata": {},
106 | "output_type": "execute_result"
107 | }
108 | ],
109 | "source": [
110 | "mdf.gini(pd.DataFrame({'x': [0, 0, 0, 5, 100]}), 'x')"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "This will be equivalent to `mdf.gini(pd.DataFrame({'x': [0, 9, 10, 15, 110]}))`."
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 6,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/plain": [
128 | "0.6277777777777778"
129 | ]
130 | },
131 | "execution_count": 6,
132 | "metadata": {},
133 | "output_type": "execute_result"
134 | }
135 | ],
136 | "source": [
137 | "mdf.gini(df, 'x', negatives='shift')"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 7,
143 | "metadata": {},
144 | "outputs": [
145 | {
146 | "data": {
147 | "text/plain": [
148 | "0.6277777777777778"
149 | ]
150 | },
151 | "execution_count": 7,
152 | "metadata": {},
153 | "output_type": "execute_result"
154 | }
155 | ],
156 | "source": [
157 | "mdf.gini(pd.DataFrame({'x': [0, 9, 10, 15, 110]}), 'x')"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "## Dealing with weights"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 8,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "data": {
174 | "text/plain": [
175 | "0.6800524934383202"
176 | ]
177 | },
178 | "execution_count": 8,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | }
182 | ],
183 | "source": [
184 | "mdf.gini(df, 'x', 'w')"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 9,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "data": {
194 | "text/plain": [
195 | "0.6800524934383202"
196 | ]
197 | },
198 | "execution_count": 9,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "mdf.gini(pd.DataFrame({'x': [-10,\n",
205 | " -1, -1,\n",
206 | " 0, 0, 0,\n",
207 | " 5, 5, 5, 5,\n",
208 | " 100, 100, 100, 100, 100]}),\n",
209 | " 'x')"
210 | ]
211 | }
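 ,
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "The weight and `negatives` arguments can also be combined. A sketch, assuming `gini` accepts both at once (signature inferred from the calls above):"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "mdf.gini(df, 'x', 'w', negatives='zero')"
  ]
 }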
212 | ],
213 | "metadata": {
214 | "kernelspec": {
215 | "display_name": "Python 3",
216 | "language": "python",
217 | "name": "python3"
218 | },
219 | "language_info": {
220 | "codemirror_mode": {
221 | "name": "ipython",
222 | "version": 3
223 | },
224 | "file_extension": ".py",
225 | "mimetype": "text/x-python",
226 | "name": "python",
227 | "nbconvert_exporter": "python",
228 | "pygments_lexer": "ipython3",
229 | "version": "3.7.9"
230 | },
231 | "toc": {
232 | "base_numbering": 1,
233 | "nav_menu": {},
234 | "number_sections": true,
235 | "sideBar": true,
236 | "skip_h1_title": false,
237 | "title_cell": "Table of Contents",
238 | "title_sidebar": "Contents",
239 | "toc_cell": false,
240 | "toc_position": {},
241 | "toc_section_display": true,
242 | "toc_window_display": false
243 | }
244 | },
245 | "nbformat": 4,
246 | "nbformat_minor": 2
247 | }
248 |
--------------------------------------------------------------------------------
/docs/home.md:
--------------------------------------------------------------------------------
1 | `microdf` documentation
2 | =======================
3 |
4 | This site includes example notebooks; in the future, it will also include function documentation.
--------------------------------------------------------------------------------
/docs/income_measures.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Income measures\n",
8 | "\n",
9 | "## Setup"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 6,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import pandas as pd\n",
20 | "\n",
21 | "import taxcalc as tc\n",
22 | "import microdf as mdf"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 7,
28 | "metadata": {},
29 | "outputs": [
30 | {
31 | "data": {
32 | "text/plain": [
33 | "'2.3.0'"
34 | ]
35 | },
36 | "execution_count": 7,
37 | "metadata": {},
38 | "output_type": "execute_result"
39 | }
40 | ],
41 | "source": [
42 | "tc.__version__"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "## Load data\n",
50 | "\n",
51 | "Start with a `DataFrame` with `expanded_income` and the variables in `expanded_income` excluded from `tpc_eci`."
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 8,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "df = mdf.calc_df(group_vars=['expanded_income', 'wic_ben', 'housing_ben', \n",
61 | " 'vet_ben', 'mcare_ben', 'mcaid_ben'],\n",
62 | " metric_vars=['XTOT'])"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "Calculate `tpc_eci`."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 9,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "df['tpc_eci'] = mdf.tpc_eci(df)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 10,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "data": {
88 | "text/html": [
89 | "\n",
90 | "\n",
103 | "
\n",
104 | " \n",
105 | " \n",
106 | " | \n",
107 | " snap_ben | \n",
108 | " vet_ben | \n",
109 | " mcaid_ben | \n",
110 | " mcare_ben | \n",
111 | " aftertax_income | \n",
112 | " e02300 | \n",
113 | " ssi_ben | \n",
114 | " wic_ben | \n",
115 | " s006 | \n",
116 | " expanded_income | \n",
117 | " ... | \n",
118 | " tanf_ben | \n",
119 | " other_ben | \n",
120 | " e02400 | \n",
121 | " XTOT | \n",
122 | " market_income | \n",
123 | " bens | \n",
124 | " tax | \n",
125 | " s006_m | \n",
126 | " XTOT_m | \n",
127 | " tpc_eci | \n",
128 | "
\n",
129 | " \n",
130 | " RECID | \n",
131 | " | \n",
132 | " | \n",
133 | " | \n",
134 | " | \n",
135 | " | \n",
136 | " | \n",
137 | " | \n",
138 | " | \n",
139 | " | \n",
140 | " | \n",
141 | " | \n",
142 | " | \n",
143 | " | \n",
144 | " | \n",
145 | " | \n",
146 | " | \n",
147 | " | \n",
148 | " | \n",
149 | " | \n",
150 | " | \n",
151 | " | \n",
152 | "
\n",
153 | " \n",
154 | " \n",
155 | " \n",
156 | " 1 | \n",
157 | " 0.00000 | \n",
158 | " 0.0 | \n",
159 | " 0.000000 | \n",
160 | " 0.000000 | \n",
161 | " 43371.012504 | \n",
162 | " 0.0 | \n",
163 | " 0.00000 | \n",
164 | " 0.0 | \n",
165 | " 250.14 | \n",
166 | " 53636.919015 | \n",
167 | " ... | \n",
168 | " 0.0 | \n",
169 | " 0.000000 | \n",
170 | " 0.000000 | \n",
171 | " 2.0 | \n",
172 | " 53636.919015 | \n",
173 | " 0.000000 | \n",
174 | " 10265.906511 | \n",
175 | " 0.000250 | \n",
176 | " 0.000500 | \n",
177 | " 53636.919015 | \n",
178 | "
\n",
179 | " \n",
180 | " 2 | \n",
181 | " 0.00000 | \n",
182 | " 0.0 | \n",
183 | " 0.000000 | \n",
184 | " 0.000000 | \n",
185 | " 20937.886511 | \n",
186 | " 0.0 | \n",
187 | " 0.00000 | \n",
188 | " 0.0 | \n",
189 | " 211.63 | \n",
190 | " 18650.034959 | \n",
191 | " ... | \n",
192 | " 0.0 | \n",
193 | " 0.000000 | \n",
194 | " 0.000000 | \n",
195 | " 3.0 | \n",
196 | " 18650.034959 | \n",
197 | " 0.000000 | \n",
198 | " -2287.851553 | \n",
199 | " 0.000212 | \n",
200 | " 0.000635 | \n",
201 | " 18650.034959 | \n",
202 | "
\n",
203 | " \n",
204 | " 3 | \n",
205 | " 1734.12939 | \n",
206 | " 0.0 | \n",
207 | " 8211.593627 | \n",
208 | " 13640.390612 | \n",
209 | " 52516.165397 | \n",
210 | " 0.0 | \n",
211 | " 3374.52239 | \n",
212 | " 0.0 | \n",
213 | " 323.50 | \n",
214 | " 52516.165397 | \n",
215 | " ... | \n",
216 | " 0.0 | \n",
217 | " 6663.701623 | \n",
218 | " 13227.079816 | \n",
219 | " 1.0 | \n",
220 | " 0.000000 | \n",
221 | " 52516.165397 | \n",
222 | " 0.000000 | \n",
223 | " 0.000324 | \n",
224 | " 0.000324 | \n",
225 | " 24999.433219 | \n",
226 | "
\n",
227 | " \n",
228 | " 4 | \n",
229 | " 0.00000 | \n",
230 | " 0.0 | \n",
231 | " 8211.593627 | \n",
232 | " 0.000000 | \n",
233 | " 36857.709188 | \n",
234 | " 0.0 | \n",
235 | " 0.00000 | \n",
236 | " 0.0 | \n",
237 | " 186.32 | \n",
238 | " 37764.286717 | \n",
239 | " ... | \n",
240 | " 0.0 | \n",
241 | " 3906.542368 | \n",
242 | " 0.000000 | \n",
243 | " 2.0 | \n",
244 | " 25646.150723 | \n",
245 | " 12118.135995 | \n",
246 | " 906.577529 | \n",
247 | " 0.000186 | \n",
248 | " 0.000373 | \n",
249 | " 29552.693091 | \n",
250 | "
\n",
251 | " \n",
252 | " 5 | \n",
253 | " 0.00000 | \n",
254 | " 0.0 | \n",
255 | " 0.000000 | \n",
256 | " 27280.781223 | \n",
257 | " 63941.158283 | \n",
258 | " 0.0 | \n",
259 | " 0.00000 | \n",
260 | " 0.0 | \n",
261 | " 343.08 | \n",
262 | " 63941.158283 | \n",
263 | " ... | \n",
264 | " 0.0 | \n",
265 | " 0.000000 | \n",
266 | " 35560.553286 | \n",
267 | " 2.0 | \n",
268 | " 1099.823774 | \n",
269 | " 62841.334509 | \n",
270 | " 0.000000 | \n",
271 | " 0.000343 | \n",
272 | " 0.000686 | \n",
273 | " 36660.377060 | \n",
274 | "
\n",
275 | " \n",
276 | "
\n",
277 | "
5 rows × 21 columns
\n",
278 | "
"
279 | ],
280 | "text/plain": [
281 | " snap_ben vet_ben mcaid_ben mcare_ben aftertax_income \\\n",
282 | "RECID \n",
283 | "1 0.00000 0.0 0.000000 0.000000 43371.012504 \n",
284 | "2 0.00000 0.0 0.000000 0.000000 20937.886511 \n",
285 | "3 1734.12939 0.0 8211.593627 13640.390612 52516.165397 \n",
286 | "4 0.00000 0.0 8211.593627 0.000000 36857.709188 \n",
287 | "5 0.00000 0.0 0.000000 27280.781223 63941.158283 \n",
288 | "\n",
289 | " e02300 ssi_ben wic_ben s006 expanded_income ... tanf_ben \\\n",
290 | "RECID ... \n",
291 | "1 0.0 0.00000 0.0 250.14 53636.919015 ... 0.0 \n",
292 | "2 0.0 0.00000 0.0 211.63 18650.034959 ... 0.0 \n",
293 | "3 0.0 3374.52239 0.0 323.50 52516.165397 ... 0.0 \n",
294 | "4 0.0 0.00000 0.0 186.32 37764.286717 ... 0.0 \n",
295 | "5 0.0 0.00000 0.0 343.08 63941.158283 ... 0.0 \n",
296 | "\n",
297 | " other_ben e02400 XTOT market_income bens \\\n",
298 | "RECID \n",
299 | "1 0.000000 0.000000 2.0 53636.919015 0.000000 \n",
300 | "2 0.000000 0.000000 3.0 18650.034959 0.000000 \n",
301 | "3 6663.701623 13227.079816 1.0 0.000000 52516.165397 \n",
302 | "4 3906.542368 0.000000 2.0 25646.150723 12118.135995 \n",
303 | "5 0.000000 35560.553286 2.0 1099.823774 62841.334509 \n",
304 | "\n",
305 | " tax s006_m XTOT_m tpc_eci \n",
306 | "RECID \n",
307 | "1 10265.906511 0.000250 0.000500 53636.919015 \n",
308 | "2 -2287.851553 0.000212 0.000635 18650.034959 \n",
309 | "3 0.000000 0.000324 0.000324 24999.433219 \n",
310 | "4 906.577529 0.000186 0.000373 29552.693091 \n",
311 | "5 0.000000 0.000343 0.000686 36660.377060 \n",
312 | "\n",
313 | "[5 rows x 21 columns]"
314 | ]
315 | },
316 | "execution_count": 10,
317 | "metadata": {},
318 | "output_type": "execute_result"
319 | }
320 | ],
321 | "source": [
322 | "df.head()"
323 | ]
324 | }
325 | ],
326 | "metadata": {
327 | "kernelspec": {
328 | "display_name": "Python 3",
329 | "language": "python",
330 | "name": "python3"
331 | },
332 | "language_info": {
333 | "codemirror_mode": {
334 | "name": "ipython",
335 | "version": 3
336 | },
337 | "file_extension": ".py",
338 | "mimetype": "text/x-python",
339 | "name": "python",
340 | "nbconvert_exporter": "python",
341 | "pygments_lexer": "ipython3",
342 | "version": "3.7.3"
343 | },
344 | "toc": {
345 | "base_numbering": 1,
346 | "nav_menu": {},
347 | "number_sections": true,
348 | "sideBar": true,
349 | "skip_h1_title": false,
350 | "title_cell": "Table of Contents",
351 | "title_sidebar": "Contents",
352 | "toc_cell": false,
353 | "toc_position": {},
354 | "toc_section_display": true,
355 | "toc_window_display": false
356 | }
357 | },
358 | "nbformat": 4,
359 | "nbformat_minor": 2
360 | }
361 |
--------------------------------------------------------------------------------
/docs/microdf_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PolicyEngine/microdf/ccf2e54e559ce7563ca9c19b144ab8d41986e1fb/docs/microdf_logo.png
--------------------------------------------------------------------------------
/docs/weighting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Weighting in taxcalc_helpers\n",
8 | "\n",
9 | "## Setup"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import pandas as pd\n",
20 | "\n",
21 | "import taxcalc as tc\n",
22 | "import microdf as mdf"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "metadata": {},
29 | "outputs": [
30 | {
31 | "data": {
32 | "text/plain": [
33 | "'3.0.0'"
34 | ]
35 | },
36 | "execution_count": 2,
37 | "metadata": {},
38 | "output_type": "execute_result"
39 | }
40 | ],
41 | "source": [
42 | "tc.__version__"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "## Load data\n",
50 | "\n",
51 | "Start with a `DataFrame` with `nu18` and `XTOT`, and also calculate `XTOT_m`."
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/plain": [
62 | "Index(['s006', 'other_ben', 'snap_ben', 'aftertax_income', 'mcaid_ben',\n",
63 | " 'mcare_ben', 'ssi_ben', 'e02300', 'nu18', 'expanded_income',\n",
64 | " 'housing_ben', 'vet_ben', 'wic_ben', 'e02400', 'tanf_ben', 'XTOT',\n",
65 | " 'market_income', 'bens', 'tax', 's006_m', 'XTOT_m'],\n",
66 | " dtype='object')"
67 | ]
68 | },
69 | "execution_count": 3,
70 | "metadata": {},
71 | "output_type": "execute_result"
72 | }
73 | ],
74 | "source": [
75 | "df = mdf.calc_df(group_vars=['nu18'], metric_vars=['XTOT'])\n",
76 | "df.columns"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "From this we can calculate the number of people and tax units by the tax unit's number of children."
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "metadata": {},
90 | "outputs": [
91 | {
92 | "data": {
93 | "text/html": [
94 | "\n",
95 | "\n",
108 | "
\n",
109 | " \n",
110 | " \n",
111 | " | \n",
112 | " s006_m | \n",
113 | " XTOT_m | \n",
114 | "
\n",
115 | " \n",
116 | " nu18 | \n",
117 | " | \n",
118 | " | \n",
119 | "
\n",
120 | " \n",
121 | " \n",
122 | " \n",
123 | " 0.0 | \n",
124 | " 152.988772 | \n",
125 | " 209.816367 | \n",
126 | "
\n",
127 | " \n",
128 | " 1.0 | \n",
129 | " 22.688253 | \n",
130 | " 54.115850 | \n",
131 | "
\n",
132 | " \n",
133 | " 2.0 | \n",
134 | " 18.859945 | \n",
135 | " 68.880292 | \n",
136 | "
\n",
137 | " \n",
138 | " 3.0 | \n",
139 | " 7.438481 | \n",
140 | " 34.795527 | \n",
141 | "
\n",
142 | " \n",
143 | " 4.0 | \n",
144 | " 2.371111 | \n",
145 | " 13.539261 | \n",
146 | "
\n",
147 | " \n",
148 | " 5.0 | \n",
149 | " 0.744276 | \n",
150 | " 5.015182 | \n",
151 | "
\n",
152 | " \n",
153 | " 6.0 | \n",
154 | " 0.216158 | \n",
155 | " 1.688063 | \n",
156 | "
\n",
157 | " \n",
158 | " 7.0 | \n",
159 | " 0.090332 | \n",
160 | " 0.790239 | \n",
161 | "
\n",
162 | " \n",
163 | " 8.0 | \n",
164 | " 0.026501 | \n",
165 | " 0.258552 | \n",
166 | "
\n",
167 | " \n",
168 | " 9.0 | \n",
169 | " 0.012238 | \n",
170 | " 0.134320 | \n",
171 | "
\n",
172 | " \n",
173 | " 10.0 | \n",
174 | " 0.007196 | \n",
175 | " 0.084201 | \n",
176 | "
\n",
177 | " \n",
178 | " 12.0 | \n",
179 | " 0.000265 | \n",
180 | " 0.003715 | \n",
181 | "
\n",
182 | " \n",
183 | "
\n",
184 | "
"
185 | ],
186 | "text/plain": [
187 | " s006_m XTOT_m\n",
188 | "nu18 \n",
189 | "0.0 152.988772 209.816367\n",
190 | "1.0 22.688253 54.115850\n",
191 | "2.0 18.859945 68.880292\n",
192 | "3.0 7.438481 34.795527\n",
193 | "4.0 2.371111 13.539261\n",
194 | "5.0 0.744276 5.015182\n",
195 | "6.0 0.216158 1.688063\n",
196 | "7.0 0.090332 0.790239\n",
197 | "8.0 0.026501 0.258552\n",
198 | "9.0 0.012238 0.134320\n",
199 | "10.0 0.007196 0.084201\n",
200 | "12.0 0.000265 0.003715"
201 | ]
202 | },
203 | "execution_count": 4,
204 | "metadata": {},
205 | "output_type": "execute_result"
206 | }
207 | ],
208 | "source": [
209 | "df.groupby('nu18')[['s006_m', 'XTOT_m']].sum()"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "What if we also want to calculate the total number of *children* by the tax unit's number of children?\n",
217 | "\n",
218 | "For this we can use `add_weighted_metrics`, the function called within `calc_df`."
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 5,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "mdf.add_weighted_metrics(df, ['nu18'])"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "Now we can do the same thing as before, with the new `nu18_m` column."
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 6,
240 | "metadata": {},
241 | "outputs": [
242 | {
243 | "data": {
244 | "text/html": [
245 | "\n",
246 | "\n",
259 | "
\n",
260 | " \n",
261 | " \n",
262 | " | \n",
263 | " nu18_m | \n",
264 | "
\n",
265 | " \n",
266 | " nu18 | \n",
267 | " | \n",
268 | "
\n",
269 | " \n",
270 | " \n",
271 | " \n",
272 | " 0.0 | \n",
273 | " 0.000000 | \n",
274 | "
\n",
275 | " \n",
276 | " 1.0 | \n",
277 | " 22.688253 | \n",
278 | "
\n",
279 | " \n",
280 | " 2.0 | \n",
281 | " 37.719889 | \n",
282 | "
\n",
283 | " \n",
284 | " 3.0 | \n",
285 | " 22.315444 | \n",
286 | "
\n",
287 | " \n",
288 | " 4.0 | \n",
289 | " 9.484444 | \n",
290 | "
\n",
291 | " \n",
292 | " 5.0 | \n",
293 | " 3.721381 | \n",
294 | "
\n",
295 | " \n",
296 | " 6.0 | \n",
297 | " 1.296949 | \n",
298 | "
\n",
299 | " \n",
300 | " 7.0 | \n",
301 | " 0.632325 | \n",
302 | "
\n",
303 | " \n",
304 | " 8.0 | \n",
305 | " 0.212008 | \n",
306 | "
\n",
307 | " \n",
308 | " 9.0 | \n",
309 | " 0.110139 | \n",
310 | "
\n",
311 | " \n",
312 | " 10.0 | \n",
313 | " 0.071958 | \n",
314 | "
\n",
315 | " \n",
316 | " 12.0 | \n",
317 | " 0.003184 | \n",
318 | "
\n",
319 | " \n",
320 | "
\n",
321 | "
"
322 | ],
323 | "text/plain": [
324 | " nu18_m\n",
325 | "nu18 \n",
326 | "0.0 0.000000\n",
327 | "1.0 22.688253\n",
328 | "2.0 37.719889\n",
329 | "3.0 22.315444\n",
330 | "4.0 9.484444\n",
331 | "5.0 3.721381\n",
332 | "6.0 1.296949\n",
333 | "7.0 0.632325\n",
334 | "8.0 0.212008\n",
335 | "9.0 0.110139\n",
336 | "10.0 0.071958\n",
337 | "12.0 0.003184"
338 | ]
339 | },
340 | "execution_count": 6,
341 | "metadata": {},
342 | "output_type": "execute_result"
343 | }
344 | ],
345 | "source": [
346 | "df.groupby('nu18')[['nu18_m']].sum()"
347 | ]
348 | },
349 | {
350 | "cell_type": "markdown",
351 | "metadata": {},
352 | "source": [
353 | "We can also calculate weighted sums without adding the weighted metric."
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 7,
359 | "metadata": {},
360 | "outputs": [
361 | {
362 | "data": {
363 | "text/plain": [
364 | "'Total children: 98M.'"
365 | ]
366 | },
367 | "execution_count": 7,
368 | "metadata": {},
369 | "output_type": "execute_result"
370 | }
371 | ],
372 | "source": [
373 | "total_children = mdf.weighted_sum(df, 'nu18', 's006')\n",
374 | "# Fix this decimal.\n",
375 | "'Total children: ' + str(round(total_children / 1e6)) + 'M.'"
376 | ]
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "metadata": {},
381 | "source": [
382 | "We can also calculate the weighted mean and median."
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 8,
388 | "metadata": {},
389 | "outputs": [
390 | {
391 | "data": {
392 | "text/plain": [
393 | "0.4782626894263673"
394 | ]
395 | },
396 | "execution_count": 8,
397 | "metadata": {},
398 | "output_type": "execute_result"
399 | }
400 | ],
401 | "source": [
402 | "mdf.weighted_mean(df, 'nu18', 's006')"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": 9,
408 | "metadata": {},
409 | "outputs": [
410 | {
411 | "ename": "TypeError",
412 | "evalue": "weighted_quantile() missing 1 required positional argument: 'quantiles'",
413 | "output_type": "error",
414 | "traceback": [
415 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
416 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
417 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweighted_median\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nu18'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m's006'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
418 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/microdf/weighted.py\u001b[0m in \u001b[0;36mweighted_median\u001b[0;34m(df, col, w)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \"\"\"\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mweighted_quantile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0.5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
419 | "\u001b[0;31mTypeError\u001b[0m: weighted_quantile() missing 1 required positional argument: 'quantiles'"
420 | ]
421 | }
422 | ],
423 | "source": [
424 | "mdf.weighted_median(df, 'nu18', 's006')"
425 | ]
426 | },
427 | {
428 | "cell_type": "markdown",
429 | "metadata": {},
430 | "source": [
431 | "We can also look at more quantiles.\n",
432 | "\n",
433 | "*Note that weighted quantiles have a different interface.*"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": null,
439 | "metadata": {},
440 | "outputs": [],
441 | "source": [
442 | "decile_bounds = np.arange(0, 1.1, 0.1)\n",
443 | "deciles = mdf.weighted_quantile(df, 'nu18', 's006', decile_bounds)\n",
444 | "pd.DataFrame(deciles, index=decile_bounds)"
445 | ]
446 | }
447 | ],
448 | "metadata": {
449 | "kernelspec": {
450 | "display_name": "Python 3",
451 | "language": "python",
452 | "name": "python3"
453 | },
454 | "language_info": {
455 | "codemirror_mode": {
456 | "name": "ipython",
457 | "version": 3
458 | },
459 | "file_extension": ".py",
460 | "mimetype": "text/x-python",
461 | "name": "python",
462 | "nbconvert_exporter": "python",
463 | "pygments_lexer": "ipython3",
464 | "version": "3.7.9"
465 | },
466 | "toc": {
467 | "base_numbering": 1,
468 | "nav_menu": {},
469 | "number_sections": true,
470 | "sideBar": true,
471 | "skip_h1_title": false,
472 | "title_cell": "Table of Contents",
473 | "title_sidebar": "Contents",
474 | "toc_cell": false,
475 | "toc_position": {},
476 | "toc_section_display": true,
477 | "toc_window_display": false
478 | }
479 | },
480 | "nbformat": 4,
481 | "nbformat_minor": 2
482 | }
483 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: microdf
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - codecov
6 | - flake8
7 | - matplotlib
8 | - numpy
9 | - pandas
10 | - pip
11 | - pytest
12 | - seaborn
13 | - setuptools
14 | - pip:
15 | - jupyter-book
16 |
--------------------------------------------------------------------------------
/microdf/__init__.py:
--------------------------------------------------------------------------------
1 | from .agg import agg, combine_base_reform, pctchg_base_reform
2 | from .chart_utils import dollar_format, currency_format
3 | from .charts import quantile_pct_chg_plot
4 | from .concat import concat
5 | from .constants import (
6 | BENS,
7 | ECI_REMOVE_COLS,
8 | HOUSING_CASH_SHARE,
9 | MCAID_CASH_SHARE,
10 | MCARE_CASH_SHARE,
11 | MED_BENS,
12 | OTHER_CASH_SHARE,
13 | SNAP_CASH_SHARE,
14 | SSI_CASH_SHARE,
15 | TANF_CASH_SHARE,
16 | VET_CASH_SHARE,
17 | WIC_CASH_SHARE,
18 | )
19 | from .custom_taxes import (
20 | CARBON_TAX_INCIDENCE,
21 | FTT_INCIDENCE,
22 | VAT_INCIDENCE,
23 | add_carbon_tax,
24 | add_custom_tax,
25 | add_ftt,
26 | add_vat,
27 | )
28 | from .income_measures import cash_income, market_income, tpc_eci
29 | from .inequality import (
30 | bottom_50_pct_share,
31 | bottom_x_pct_share,
32 | gini,
33 | t10_b50,
34 | top_0_1_pct_share,
35 | top_10_pct_share,
36 | top_1_pct_share,
37 | top_50_pct_share,
38 | top_x_pct_share,
39 | )
40 | from .io import read_stata_zip
41 | from .poverty import (
42 | fpl,
43 | poverty_rate,
44 | deep_poverty_rate,
45 | poverty_gap,
46 | squared_poverty_gap,
47 | deep_poverty_gap,
48 | )
49 | from .style import AXIS_COLOR, DPI, GRID_COLOR, TITLE_COLOR, set_plot_style
50 | from .tax import mtr, tax_from_mtrs
51 | from .taxcalc import (
52 | add_weighted_metrics,
53 | calc_df,
54 | n65,
55 | recalculate,
56 | static_baseline_calc,
57 | )
58 | from .ubi import ubi_or_bens
59 | from .utils import (
60 | cartesian_product,
61 | dedup_list,
62 | flatten,
63 | listify,
64 | ordinal_label,
65 | )
66 | from .weighted import (
67 | add_weighted_quantiles,
68 | quantile_chg,
69 | weight,
70 | weighted_mean,
71 | weighted_median,
72 | weighted_quantile,
73 | weighted_sum,
74 | )
75 | from .generic import MicroDataFrame, MicroSeries
76 |
77 | name = "microdf"
78 | __version__ = "0.1.0"
79 |
80 | __all__ = [
81 | # agg.py
82 | "combine_base_reform",
83 | "pctchg_base_reform",
84 | "agg",
85 | # chart_utils.py
86 | "dollar_format",
87 | "currency_format",
88 | # charts.py
89 | "quantile_pct_chg_plot",
90 | # concat.py
91 | "concat",
92 | # constants.py
93 | "BENS",
94 | "ECI_REMOVE_COLS",
95 | "HOUSING_CASH_SHARE",
96 | "MCAID_CASH_SHARE",
97 | "MCARE_CASH_SHARE",
98 | "MED_BENS",
99 | "OTHER_CASH_SHARE",
100 | "SNAP_CASH_SHARE",
101 | "SSI_CASH_SHARE",
102 | "TANF_CASH_SHARE",
103 | "VET_CASH_SHARE",
104 | "WIC_CASH_SHARE",
105 | # custom_taxes.py
106 | "CARBON_TAX_INCIDENCE",
107 | "FTT_INCIDENCE",
108 | "VAT_INCIDENCE",
109 | "add_custom_tax",
110 | "add_vat",
111 | "add_carbon_tax",
112 | "add_ftt",
113 | # income_measures.py
114 | "cash_income",
115 | "tpc_eci",
116 | "market_income",
117 | # inequality.py
118 | "gini",
119 | "top_x_pct_share",
120 | "bottom_x_pct_share",
121 | "bottom_50_pct_share",
122 | "top_10_pct_share",
123 | "top_1_pct_share",
124 | "top_0_1_pct_share",
125 | "top_50_pct_share",
126 | "t10_b50",
127 | # io.py
128 | "read_stata_zip",
129 | # poverty.py
130 | "fpl",
131 | "poverty_rate",
132 | "deep_poverty_rate",
133 | "poverty_gap",
134 | "squared_poverty_gap",
135 | "deep_poverty_gap",
136 | # style.py
137 | "AXIS_COLOR",
138 | "DPI",
139 | "GRID_COLOR",
140 | "TITLE_COLOR",
141 | "set_plot_style",
142 | # tax.py
143 | "mtr",
144 | "tax_from_mtrs",
145 | # taxcalc.py
146 | "static_baseline_calc",
147 | "add_weighted_metrics",
148 | "n65",
149 | "calc_df",
150 | "recalculate",
151 | # ubi.py
152 | "ubi_or_bens",
153 | # utils.py
154 | "ordinal_label",
155 | "dedup_list",
156 | "listify",
157 | "flatten",
158 | "cartesian_product",
159 | # weighted.py
160 | "weight",
161 | "weighted_sum",
162 | "weighted_mean",
163 | "weighted_quantile",
164 | "weighted_median",
165 | "add_weighted_quantiles",
166 | "quantile_chg",
167 | # generic.py
168 | "MicroSeries",
169 | "MicroDataFrame",
170 | ]
171 |
--------------------------------------------------------------------------------
/microdf/_optional.py:
--------------------------------------------------------------------------------
1 | import distutils.version
2 | import importlib
3 | import types
4 | import warnings
5 |
6 |
7 | # Adapted from:
8 | # https://github.com/pandas-dev/pandas/blob/master/pandas/compat/_optional.py
9 |
10 | VERSIONS = {
11 | "taxcalc": "2.0.0",
12 | }
13 |
14 |
15 | def _get_version(module: types.ModuleType) -> str:
16 |     """Get the version string of a module.
17 | 
18 |     :param module: Module to get the version of.
19 |     :type module: types.ModuleType
20 |     :returns: Version string.
21 |     """
22 | version = getattr(module, "__version__", None)
23 | if version is None:
24 | # xlrd uses a capitalized attribute name
25 | version = getattr(module, "__VERSION__", None)
26 |
27 | if version is None:
28 | raise ImportError(f"Can't determine version for {module.__name__}")
29 | return version
30 |
31 |
32 | def import_optional_dependency(
33 | name: str,
34 | extra: str = "",
35 | raise_on_missing: bool = True,
36 | on_version: str = "raise",
37 | ):
38 | """Import an optional dependency.
39 | By default, if a dependency is missing an ImportError with a nice
40 | message will be raised. If a dependency is present, but too old,
41 | we raise.
42 |
43 | :param name: The module name. This should be top-level only, so that the
44 | version may be checked.
45 | :type name: str
46 | :param extra: Additional text to include in the ImportError message.
47 | :type extra: str
48 | :param raise_on_missing: Whether to raise if the optional dependency is
49 | not found. When False and the module is not present, None is returned.
50 | :type raise_on_missing: bool, default True
51 | :param on_version: What to do when a dependency's version is too old.
52 | * raise : Raise an ImportError
53 | * warn : Warn that the version is too old. Returns None
54 | * ignore: Return the module, even if the version is too old.
55 |           It's expected that users validate the version when using 'ignore'.
56 |     :type on_version: str {'raise', 'warn', 'ignore'}
57 | """
58 | msg = (
59 | f"Missing optional dependency '{name}'. {extra} "
60 | f"Use pip or conda to install {name}."
61 | )
62 | try:
63 | module = importlib.import_module(name)
64 | except ImportError:
65 | if raise_on_missing:
66 | raise ImportError(msg) from None
67 | else:
68 | return None
69 |
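70 |     # Enforce minimum versions for dependencies listed in VERSIONS.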
70 | minimum_version = VERSIONS.get(name)
71 | if minimum_version:
72 | version = _get_version(module)
73 | if distutils.version.LooseVersion(version) < minimum_version:
74 | assert on_version in {"warn", "raise", "ignore"}
75 | msg = (
76 | f"microdf requires version '{minimum_version}' or newer of "
77 | f"'{name}' "
78 | f"(version '{version}' currently installed)."
79 | )
80 | if on_version == "warn":
81 | warnings.warn(msg, UserWarning)
82 | return None
83 | elif on_version == "raise":
84 | raise ImportError(msg)
85 |
86 | return module
87 |
--------------------------------------------------------------------------------
/microdf/agg.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from typing import Optional
3 |
4 | import microdf as mdf
5 |
6 |
7 | def combine_base_reform(
8 | base: pd.DataFrame,
9 | reform: pd.DataFrame,
10 | base_cols: Optional[list],
11 | cols: Optional[list],
12 | reform_cols: Optional[list],
13 | ) -> pd.DataFrame:
14 | """Combine base and reform with certain columns.
15 |
16 | :param base: Base DataFrame. Index must match reform.
17 | :type base: pd.DataFrame
18 | :param reform: Reform DataFrame. Index must match base.
19 | :type reform: pd.DataFrame
20 | :param base_cols: Columns in base to keep.
21 | :type base_cols: list, optional
22 | :param cols: Columns to keep from both base and reform.
23 | :type cols: list, optional
24 | :param reform_cols: Columns in reform to keep.
25 | :type reform_cols: list, optional
26 | :returns: DataFrame with columns for base ("_base") and reform ("_reform").
27 | :rtype: pd.DataFrame
28 |
29 | """
30 | all_base_cols = mdf.listify([base_cols] + [cols])
31 | all_reform_cols = mdf.listify([reform_cols] + [cols])
32 | return base[all_base_cols].join(
33 | reform[all_reform_cols], lsuffix="_base", rsuffix="_reform"
34 | )
35 |
36 |
37 | def pctchg_base_reform(combined: pd.DataFrame, metric: str) -> pd.Series:
38 | """Calculates the percentage change in a metric for a combined
39 | dataset.
40 |
41 | :param combined: Combined DataFrame with _base and _reform columns.
42 | :type combined: pd.DataFrame
43 | :param metric: String of the column to calculate the difference.
44 | Must exist as metric_m_base and metric_m_reform in combined.
45 | :type metric: str
46 | :returns: Series with percentage change.
47 | :rtype: pd.Series
48 |
49 | """
50 | return combined[metric + "_m_reform"] / combined[metric + "_m_base"] - 1
51 |
52 |
53 | def agg(
54 | base: pd.DataFrame,
55 | reform: pd.DataFrame,
56 | groupby: str,
57 | metrics: list,
58 | base_metrics: Optional[list],
59 | reform_metrics: Optional[list],
60 | ) -> pd.DataFrame:
61 | """Aggregates differences between base and reform.
62 |
63 | :param base: Base DataFrame. Index must match reform.
64 | :type base: pd.DataFrame
65 | :param reform: Reform DataFrame. Index must match base.
66 | :type reform: pd.DataFrame
67 | :param groupby: Variable in base to group on.
68 | :type groupby: str
69 | :param metrics: List of variables to agg and calculate the % change of.
70 | These should have associated weighted columns ending in _m in base
71 | and reform.
72 | :type metrics: list
73 | :param base_metrics: List of variables from base to sum.
74 | :type base_metrics: Optional[list]
75 | :param reform_metrics: List of variables from reform to sum.
76 | :type reform_metrics: Optional[list]
77 | :returns: DataFrame with groupby and metrics, and _pctchg metrics.
78 | :rtype: pd.DataFrame
79 |
80 | """
81 | metrics = mdf.listify(metrics)
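82 |     # Weighted counterparts of each metric are named with an "_m" suffix.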
82 | metrics_m = [i + "_m" for i in metrics]
83 | combined = combine_base_reform(
84 | base,
85 | reform,
86 | base_cols=mdf.listify([groupby, base_metrics]),
87 | cols=mdf.listify(metrics_m),
88 | reform_cols=mdf.listify(reform_metrics),
89 | )
90 | grouped = combined.groupby(groupby).sum()
91 | for metric in metrics:
92 | grouped[metric + "_pctchg"] = pctchg_base_reform(grouped, metric)
93 | return grouped
94 |
--------------------------------------------------------------------------------
/microdf/chart_utils.py:
--------------------------------------------------------------------------------
1 | def dollar_format(suffix=""):
2 | """Dollar formatter for matplotlib.
3 |
4 | :param suffix: Suffix to append, e.g. 'B'. Defaults to ''.
5 | :returns: FuncFormatter.
6 |
7 | """
8 | return currency_format(currency="USD", suffix=suffix)
9 |
10 |
11 | def currency_format(currency="USD", suffix=""):
12 | """Currency formatter for matplotlib.
13 |
14 | :param currency: Name of the currency, e.g. 'USD', 'GBP'.
15 | :param suffix: Suffix to append, e.g. 'B'. Defaults to ''.
16 | :returns: FuncFormatter.
17 |
18 | """
19 | try:
20 | import matplotlib as mpl
21 | except ImportError:
22 | raise ImportError(
23 | "The function you've called requires extra dependencies. " +
24 | "Please install microdf with the 'charts' extra by running " +
25 | "'pip install microdf[charts]'"
26 | )
27 |
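28 |     # Map the currency code to its symbol; unmapped codes raise KeyError.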
28 | prefix = {"USD": "$", "GBP": "£"}[currency]
29 |
30 | return mpl.ticker.FuncFormatter(
31 | lambda x, _: prefix + format(int(x), ",") + suffix
32 | )
33 |
--------------------------------------------------------------------------------
/microdf/charts.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import microdf as mdf
4 |
5 |
6 | def quantile_pct_chg_plot(df1, df2, col1, col2, w1=None, w2=None, q=None):
7 | """Create stem plot with percent change in decile boundaries.
8 |
9 | :param df1: DataFrame with first set of values.
10 | :param df2: DataFrame with second set of values.
11 | :param col1: Name of columns with values in df1.
12 | :param col2: Name of columns with values in df2.
13 | :param w1: Name of weight column in df1.
14 | :param w2: Name of weight column in df2.
15 | :param q: Quantiles. Defaults to decile boundaries.
16 | :returns: Axis.
17 |
18 | """
19 | try:
20 | import seaborn as sns
21 | import matplotlib as mpl
22 | import matplotlib.pyplot as plt
23 | except ImportError:
24 | raise ImportError(
25 | "The function you've called requires extra dependencies. " +
26 | "Please install microdf with the 'charts' extra by running " +
27 | "'pip install microdf[charts]'"
28 | )
29 |
30 | if q is None:
31 | q = np.arange(0.1, 1, 0.1)
32 | # Calculate weighted quantiles.
33 | df = mdf.quantile_chg(df1, df2, col1, col2, w1, w2, q).transpose()
34 | # Prepare dataset for plotting.
35 | df.columns = ["base", "reform"]
36 | df["pct_chg"] = df.reform / df.base - 1
37 | # Multiply by 100 pending github.com/matplotlib/matplotlib/issues/17113
38 | df.pct_chg *= 100
39 | df["index_newline"] = np.where(
40 | df.index == "50th (median)", "50th\n(median)", df.index
41 | )
42 | # Plot.
43 | fig, ax = plt.subplots()
44 | markerline, stemlines, baseline = ax.stem(
45 | df.index_newline, df.pct_chg
46 | )
47 | plt.setp(baseline, color="gray", linewidth=0)
48 | ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))
49 | ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(xmax=100))
50 | plt.title("Change to percentiles", loc="left")
51 | plt.ylabel("Change at the percentile boundary")
52 | plt.xlabel("Percentile")
53 | sns.despine(left=True, bottom=True)
54 | ax.grid(color=mdf.GRID_COLOR, axis="y")
55 | plt.xticks(rotation=0)
56 | return ax
57 |
--------------------------------------------------------------------------------
/microdf/concat.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import inspect
3 | import microdf as mdf
4 |
5 |
6 | def concat(*args, **kwargs):
7 | """Concatenates MicroDataFrame objects, preserving weights.
8 | If concatenating horizontally, the first set of weights are used.
9 | All args and kwargs are passed to pd.concat.
10 |
11 | :return: MicroDataFrame with concatenated weights.
12 | :rtype: mdf.MicroDataFrame
13 | """
14 | # Extract args with respect to pd.concat.
15 | pd_args = inspect.getcallargs(pd.concat, *args, **kwargs)
16 | objs = pd_args["objs"]
17 | axis = pd_args["axis"]
18 | # Create result, starting with pd.concat.
19 | res = mdf.MicroDataFrame(pd.concat(*args, **kwargs))
20 | # Assign weights depending on axis.
21 | if axis == 0:
22 | res.weights = pd.concat([obj.weights for obj in objs])
23 | else:
24 | # If concatenating horizontally, use the first set of weights.
25 | res.weights = objs[0].weights
26 | return res
27 |
--------------------------------------------------------------------------------
/microdf/constants.py:
--------------------------------------------------------------------------------
1 | # Constants for share of each benefit that is cash.
2 | HOUSING_CASH_SHARE = 0.0
3 | MCAID_CASH_SHARE = 0.0
4 | MCARE_CASH_SHARE = 0.0
5 | # https://github.com/open-source-economics/taxdata/issues/148
6 | # https://docs.google.com/spreadsheets/d/1g_YdFd5idgLL764G0pZBiBnIlnCBGyxBmapXCOZ1OV4
7 | OTHER_CASH_SHARE = 0.35
8 | SNAP_CASH_SHARE = 0.0
9 | SSI_CASH_SHARE = 1.0
10 | TANF_CASH_SHARE = 0.25
11 | # https://github.com/open-source-economics/C-TAM/issues/62.
12 | VET_CASH_SHARE = 0.48
13 | WIC_CASH_SHARE = 0.0
14 |
15 | # Columns to remove from expanded_income to approximate TPC's Expanded Cash
16 | # Income.
17 | ECI_REMOVE_COLS = [
18 | "wic_ben",
19 | "housing_ben",
20 | "vet_ben",
21 | "mcare_ben",
22 | "mcaid_ben",
23 | ]
24 |
25 | # Benefits.
26 | BENS = [
27 | "housing_ben",
28 | "mcaid_ben",
29 | "mcare_ben",
30 | "vet_ben",
31 | "other_ben",
32 | "snap_ben",
33 | "ssi_ben",
34 | "tanf_ben",
35 | "wic_ben",
36 | "e02400", # Social Security (OASDI).
37 | "e02300", # Unemployment insurance.
38 | ]
39 |
40 | MED_BENS = ["mcaid_ben", "mcare_ben", "vet_ben"]
41 |
--------------------------------------------------------------------------------
/microdf/custom_taxes.py:
--------------------------------------------------------------------------------
1 | """
2 | Functions and data for estimating taxes outside the income tax system.
3 | Examples include value added tax, financial transaction tax, and carbon tax.
4 | """
5 |
6 | import microdf as mdf
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 |
12 | # Source:
13 | # https://www.taxpolicycenter.org/briefing-book/who-would-bear-burden-vat
14 | VAT_INCIDENCE = pd.Series(
15 | index=[-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99, 99.9],
16 | data=[3.9, 3.9, 3.6, 3.6, 3.6, 3.6, 3.6, 3.4, 3.4, 3.2, 2.8, 2.5, 2.5],
17 | )
18 | VAT_INCIDENCE /= 100
19 |
20 | # Source: Table 5 in
21 | # https://www.treasury.gov/resource-center/tax-policy/tax-analysis/Documents/WP-115.pdf
22 | CARBON_TAX_INCIDENCE = pd.Series(
23 | index=[-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99, 99.9],
24 | data=[0.8, 1.2, 1.4, 1.5, 1.6, 1.7, 1.8, 1.8, 1.8, 1.8, 1.6, 1.4, 0.7],
25 | )
26 | CARBON_TAX_INCIDENCE /= 100
27 |
28 | # Source: Figure 1 in
29 | # https://www.taxpolicycenter.org/sites/default/files/alfresco/publication-pdfs/2000587-financial-transaction-taxes.pdf
30 | FTT_INCIDENCE = pd.Series(
31 | index=[-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99, 99.9],
32 | data=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.8, 1.0],
33 | )
34 | FTT_INCIDENCE /= 100
35 |
36 |
37 | def add_custom_tax(
38 | df,
39 | segment_income,
40 | w,
41 | base_income,
42 | incidence,
43 | name,
44 | total=None,
45 | ratio=None,
46 | verbose=True,
47 | ):
48 | """Add a custom tax based on incidence analysis driven by percentiles.
49 |
50 | :param df: DataFrame.
51 | :param segment_income: Income measure used to segment tax units into
52 | quantiles.
53 | :param w: Weight used to segment into quantiles (either s006 or XTOT_m).
54 | :param base_income: Income measure by which incidence is multiplied to
55 | estimate liability.
56 | :param incidence: pandas Series indexed on the floor of an income
57 | percentile, with values for the tax rate.
58 | :param name: Name of the column to add.
59 | :param total: Total amount the tax should generate. If not provided,
60 | liabilities are calculated only based on the incidence schedule.
61 | (Default value = None)
62 | :param ratio: Ratio to adjust the tax by, compared to the original tax.
63 | This acts as a multiplier for the incidence argument.
64 | (Default value = None)
65 | :param verbose: Whether to print the tax adjustment factor if needed.
66 | Defaults to True.
67 | :returns: Nothing. Adds the column name to df representing the tax
68 | liability. df is also sorted by segment_income.
69 |
70 | """
71 |     if ratio is not None:
72 |         assert total is None, "ratio and total cannot both be provided."
73 |         incidence = incidence * ratio
74 | df.sort_values(segment_income, inplace=True)
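75 |     # Each unit's percentile is its cumulative share of weight, scaled to 0-100.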
75 | income_percentile = 100 * df[w].cumsum() / df[w].sum()
76 | tu_incidence = incidence.iloc[
77 | pd.cut(
78 | income_percentile,
79 | # Add a right endpoint. Should be 100 but sometimes a decimal
80 | # gets added.
81 | bins=incidence.index.tolist() + [101],
82 | labels=False,
83 | )
84 | ].values
85 | df[name] = np.maximum(0, tu_incidence * df[base_income])
86 | if total is not None:
87 | initial_total = mdf.weighted_sum(df, name, "s006")
88 | if verbose:
89 | print(
90 | "Multiplying tax by "
91 | + str(round(total / initial_total, 2))
92 | + "."
93 | )
94 | df[name] *= total / initial_total
95 |
96 |
97 | def add_vat(
98 | df,
99 | segment_income="tpc_eci",
100 | w="XTOT_m",
101 | base_income="aftertax_income",
102 | incidence=VAT_INCIDENCE,
103 | name="vat",
104 | **kwargs
105 | ):
106 | """Add value added tax based on incidence estimate from Tax Policy Center.
107 |
108 | :param df: DataFrame with columns for tpc_eci, XTOT_m, and aftertax_income.
109 |     :param segment_income: (Default value = "tpc_eci")
110 |     :param w: (Default value = "XTOT_m")
111 |     :param base_income: (Default value = "aftertax_income")
112 |     :param incidence: (Default value = VAT_INCIDENCE)
113 |     :param name: (Default value = "vat")
115 | :param **kwargs: Other arguments passed to add_custom_tax().
116 | :returns: Nothing. Adds vat to df.
117 | df is also sorted by tpc_eci.
118 |
119 | """
120 | add_custom_tax(
121 | df, segment_income, w, base_income, incidence, name, **kwargs
122 | )
123 |
124 |
125 | def add_carbon_tax(
126 | df,
127 | segment_income="tpc_eci",
128 | w="XTOT_m",
129 | base_income="aftertax_income",
130 | incidence=CARBON_TAX_INCIDENCE,
131 | name="carbon_tax",
132 | **kwargs
133 | ):
134 | """Add carbon tax based on incidence estimate from the US Treasury
135 | Department.
136 |
137 | :param df: DataFrame with columns for tpc_eci, XTOT_m, and aftertax_income.
138 |     :param segment_income: (Default value = "tpc_eci")
139 |     :param w: (Default value = "XTOT_m")
140 |     :param base_income: (Default value = "aftertax_income")
141 |     :param incidence: (Default value = CARBON_TAX_INCIDENCE)
142 |     :param name: (Default value = "carbon_tax")
144 | :param **kwargs: Other arguments passed to add_custom_tax().
145 | :returns: Nothing. Adds carbon_tax to df.
146 | df is also sorted by tpc_eci.
147 |
148 | """
149 | add_custom_tax(
150 | df, segment_income, w, base_income, incidence, name, **kwargs
151 | )
152 |
153 |
154 | def add_ftt(
155 | df,
156 | segment_income="tpc_eci",
157 | w="XTOT_m",
158 | base_income="aftertax_income",
159 | incidence=FTT_INCIDENCE,
160 | name="ftt",
161 | **kwargs
162 | ):
163 | """Add financial transaction tax based on incidence estimate from Tax
164 | Policy Center.
165 |
166 | :param df: DataFrame with columns for tpc_eci, XTOT_m, and aftertax_income.
167 |     :param segment_income: (Default value = "tpc_eci")
168 |     :param w: (Default value = "XTOT_m")
169 |     :param base_income: (Default value = "aftertax_income")
170 |     :param incidence: (Default value = FTT_INCIDENCE)
171 |     :param name: (Default value = "ftt")
173 | :param **kwargs: Other arguments passed to add_custom_tax().
174 | :returns: Nothing. Adds ftt to df.
175 | df is also sorted by tpc_eci.
176 |
177 | """
178 | add_custom_tax(
179 | df, segment_income, w, base_income, incidence, name, **kwargs
180 | )
181 |
--------------------------------------------------------------------------------
/microdf/generic.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, Union
2 | from functools import wraps
3 | import warnings
4 | import copy
5 | import numpy as np
6 | import pandas as pd
7 |
8 |
9 | class MicroSeries(pd.Series):
10 | def __init__(self, *args, weights: np.array = None, **kwargs):
11 | """A Series-inheriting class for weighted microdata.
12 | Weights can be provided at initialisation, or using set_weights.
13 |
14 | :param weights: Array of weights.
15 | :type weights: np.array
16 | """
17 | super().__init__(*args, **kwargs)
18 | self.set_weights(weights)
19 |
20 | def weighted_function(fn: Callable) -> Callable:
21 | @wraps(fn)
22 | def safe_fn(*args, **kwargs):
23 | try:
24 | return fn(*args, **kwargs)
25 | except ZeroDivisionError:
26 | return np.NaN
27 |
28 | return safe_fn
29 |
30 |     def scalar_function(fn: Callable) -> Callable:
31 |         # Wrap in the zero-division guard, then tag the return type.
32 |         fn = weighted_function(fn)
33 |         fn._rtype = float
34 |         return fn
35 | 
36 |     def vector_function(fn: Callable) -> Callable:
37 |         fn = weighted_function(fn)
38 |         fn._rtype = pd.Series
39 |         return fn
39 |
40 | def set_weights(self, weights: np.array) -> None:
41 | """Sets the weight values.
42 |
43 | :param weights: Array of weights.
44 | :type weights: np.array.
45 | """
46 | if weights is None:
47 | self.weights = pd.Series(np.ones_like(self.values), dtype=float)
48 | else:
49 | self.weights = pd.Series(weights, dtype=float)
50 |
51 | @vector_function
52 | def weight(self) -> pd.Series:
53 | """Calculates the weighted value of the MicroSeries.
54 |
55 | :returns: A Series multiplying the MicroSeries by its weight.
56 | :rtype: pd.Series
57 | """
58 | return self.multiply(self.weights)
59 |
60 | @scalar_function
61 | def sum(self) -> float:
62 | """Calculates the weighted sum of the MicroSeries.
63 |
64 | :returns: The weighted sum.
65 | :rtype: float
66 | """
67 | return self.multiply(self.weights).sum()
68 |
69 | @scalar_function
70 | def count(self) -> float:
71 | """Calculates the weighted count of the MicroSeries.
72 |
73 | :returns: The weighted count.
74 | """
75 | return self.weights.sum()
76 |
77 | @scalar_function
78 | def mean(self) -> float:
79 | """Calculates the weighted mean of the MicroSeries
80 |
81 | :returns: The weighted mean.
82 | :rtype: float
83 | """
84 | return np.average(self.values, weights=self.weights)
85 |
86 | def quantile(self, q: np.array) -> pd.Series:
87 | """Calculates weighted quantiles of the MicroSeries.
88 |
89 | Doesn't exactly match unweighted quantiles of stacked values.
90 | See stackoverflow.com/q/21844024#comment102342137_29677616.
91 |
92 | :param q: Array of quantiles to calculate.
93 | :type q: np.array
94 |
95 | :return: Array of weighted quantiles.
96 | :rtype: pd.Series
97 | """
98 | values = np.array(self.values)
99 | quantiles = np.array(q)
100 | sample_weight = np.array(self.weights)
101 | assert np.all(quantiles >= 0) and np.all(
102 | quantiles <= 1
103 | ), "quantiles should be in [0, 1]"
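104 |         # Sort values with their weights, place each value at the midpoint
105 |         # of its weight within the cumulative distribution, and interpolate.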
104 | sorter = np.argsort(values)
105 | values = values[sorter]
106 | sample_weight = sample_weight[sorter]
107 | weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
108 | weighted_quantiles /= np.sum(sample_weight)
109 | result = np.interp(quantiles, weighted_quantiles, values)
110 | if quantiles.shape == ():
111 | return result
112 | return pd.Series(result, index=quantiles)
113 |
114 | @scalar_function
115 | def median(self) -> float:
116 | """Calculates the weighted median of the MicroSeries.
117 |
118 |         :returns: The weighted median of the MicroSeries.
119 | :rtype: float
120 | """
121 | return self.quantile(0.5)
122 |
123 | @scalar_function
124 | def gini(self, negatives: str = None) -> float:
125 | """Calculates Gini index.
126 |
127 | :param negatives: An optional string indicating how to treat negative
128 | values of x:
129 | 'zero' replaces negative values with zeroes.
130 | 'shift' subtracts the minimum value from all values of x,
131 | when this minimum is negative. That is, it adds the absolute
132 | minimum value.
133 | Defaults to None, which leaves negative values as they are.
134 |         :type negatives: str
135 | :returns: Gini index.
136 | :rtype: float
137 | """
138 | x = np.array(self).astype("float")
139 | if negatives == "zero":
140 | x[x < 0] = 0
141 | if negatives == "shift" and np.amin(x) < 0:
142 | x -= np.amin(x)
143 | if (self.weights != np.ones(len(self))).any(): # Varying weights.
144 | sorted_indices = np.argsort(self)
145 | sorted_x = np.array(self[sorted_indices])
146 | sorted_w = np.array(self.weights[sorted_indices])
147 | cumw = np.cumsum(sorted_w)
148 | cumxw = np.cumsum(sorted_x * sorted_w)
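149 |             # Weighted Gini from cross-differences of adjacent cumulative
150 |             # weight and cumulative weighted-value terms.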
149 | return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / (
150 | cumxw[-1] * cumw[-1]
151 | )
152 | else:
153 | sorted_x = np.sort(self)
154 | n = len(x)
155 | cumxw = np.cumsum(sorted_x)
156 | # The above formula, with all weights equal to 1 simplifies to:
157 | return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n
158 |
159 | @scalar_function
160 | def top_x_pct_share(self, top_x_pct: float) -> float:
161 | """Calculates top x% share.
162 |
163 | :param top_x_pct: Decimal between 0 and 1 of the top %, e.g. 0.1,
164 | 0.001.
165 | :type top_x_pct: float
166 | :returns: The weighted share held by the top x%.
167 | :rtype: float
168 | """
169 | threshold = self.quantile(1 - top_x_pct)
170 | top_x_pct_sum = self[self >= threshold].sum()
171 | total_sum = self.sum()
172 | return top_x_pct_sum / total_sum
173 |
174 | @scalar_function
175 | def bottom_x_pct_share(self, bottom_x_pct) -> float:
176 | """Calculates bottom x% share.
177 |
178 |         :param bottom_x_pct: Decimal between 0 and 1 of the bottom %, e.g. 0.1,
179 | 0.001.
180 | :type bottom_x_pct: float
181 | :returns: The weighted share held by the bottom x%.
182 | :rtype: float
183 | """
184 | return 1 - self.top_x_pct_share(1 - bottom_x_pct)
185 |
186 | @scalar_function
187 | def bottom_50_pct_share(self) -> float:
188 | """Calculates bottom 50% share.
189 |
190 | :returns: The weighted share held by the bottom 50%.
191 | :rtype: float
192 | """
193 | return self.bottom_x_pct_share(0.5)
194 |
195 | @scalar_function
196 | def top_50_pct_share(self) -> float:
197 | """Calculates top 50% share.
198 |
199 | :returns: The weighted share held by the top 50%.
200 | :rtype: float
201 | """
202 | return self.top_x_pct_share(0.5)
203 |
204 | @scalar_function
205 | def top_10_pct_share(self) -> float:
206 | """Calculates top 10% share.
207 |
208 | :returns: The weighted share held by the top 10%.
209 | :rtype: float
210 | """
211 | return self.top_x_pct_share(0.1)
212 |
213 | @scalar_function
214 | def top_1_pct_share(self) -> float:
215 | """Calculates top 1% share.
216 |
217 |         :returns: The weighted share held by the top 1%.
218 | :rtype: float
219 | """
220 | return self.top_x_pct_share(0.01)
221 |
222 | @scalar_function
223 | def top_0_1_pct_share(self) -> float:
224 | """Calculates top 0.1% share.
225 |
226 | :returns: The weighted share held by the top 0.1%.
227 | :rtype: float
228 | """
229 | return self.top_x_pct_share(0.001)
230 |
231 | @scalar_function
232 | def t10_b50(self) -> float:
233 | """Calculates ratio between the top 10% and bottom 50% shares.
234 |
235 | :returns: The weighted share held by the top 10% divided by
236 | the weighted share held by the bottom 50%.
237 |
238 | """
239 | t10 = self.top_10_pct_share()
240 | b50 = self.bottom_50_pct_share()
241 | return t10 / b50
242 |
243 | @vector_function
244 | def cumsum(self) -> pd.Series:
245 | return pd.Series(self * self.weights).cumsum()
246 |
247 | @vector_function
248 | def rank(self, pct=False) -> pd.Series:
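249 |         # Cumulative weights in value order give weighted ranks;
250 |         # inverse_order maps them back to the original row positions.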
249 | order = np.argsort(self.values)
250 | inverse_order = np.argsort(order)
251 | ranks = np.array(self.weights.values)[order].cumsum()[inverse_order]
252 | if pct:
253 | ranks /= self.weights.values.sum()
254 |             ranks = np.where(ranks > 1.0, 1.0, ranks)
255 | return pd.Series(ranks, index=self.index)
256 |
257 | @vector_function
258 | def decile_rank(self):
259 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 10), 10))
260 |
261 | @vector_function
262 | def quintile_rank(self):
263 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 5), 5))
264 |
265 | @vector_function
266 | def quartile_rank(self):
267 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 4), 4))
268 |
269 | @vector_function
270 | def percentile_rank(self):
271 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 100), 100))
272 |
273 | def groupby(self, *args, **kwargs):
274 | gb = super().groupby(*args, **kwargs)
275 | gb.__class__ = MicroSeriesGroupBy
276 | gb._init()
277 | gb.weights = pd.Series(self.weights).groupby(*args, **kwargs)
278 | return gb
279 |
280 | def copy(self, deep=True):
281 | res = super().copy(deep)
282 | res = MicroSeries(res, weights=self.weights.copy(deep))
283 | return res
284 |
285 | def equals(self, other) -> bool:
286 | equal_values = super().equals(other)
287 | equal_weights = self.weights.equals(other.weights)
288 | return equal_values and equal_weights
289 |
290 | def __getitem__(self, key):
291 | result = super().__getitem__(key)
292 | if isinstance(result, pd.Series):
293 | weights = self.weights.__getitem__(key)
294 | return MicroSeries(result, weights=weights)
295 | return result
296 |
297 | def __getattr__(self, name):
298 | return MicroSeries(super().__getattr__(name), weights=self.weights)
299 |
300 | # operators
301 |
302 | def __add__(self, other):
303 | return MicroSeries(super().__add__(other), weights=self.weights)
304 |
305 | def __sub__(self, other):
306 | return MicroSeries(super().__sub__(other), weights=self.weights)
307 |
308 | def __mul__(self, other):
309 | return MicroSeries(super().__mul__(other), weights=self.weights)
310 |
311 | def __floordiv__(self, other):
312 | return MicroSeries(super().__floordiv__(other), weights=self.weights)
313 |
314 | def __truediv__(self, other):
315 | return MicroSeries(super().__truediv__(other), weights=self.weights)
316 |
317 | def __mod__(self, other):
318 | return MicroSeries(super().__mod__(other), weights=self.weights)
319 |
320 | def __pow__(self, other):
321 | return MicroSeries(super().__pow__(other), weights=self.weights)
322 |
323 | # comparators
324 |
325 | def __lt__(self, other):
326 | return MicroSeries(super().__lt__(other), weights=self.weights)
327 |
328 | def __le__(self, other):
329 | return MicroSeries(super().__le__(other), weights=self.weights)
330 |
331 | def __eq__(self, other):
332 | return MicroSeries(super().__eq__(other), weights=self.weights)
333 |
334 | def __ne__(self, other):
335 | return MicroSeries(super().__ne__(other), weights=self.weights)
336 |
337 | def __ge__(self, other):
338 | return MicroSeries(super().__ge__(other), weights=self.weights)
339 |
340 | def __gt__(self, other):
341 | return MicroSeries(super().__gt__(other), weights=self.weights)
342 |
343 | # assignment operators
344 |
345 | def __iadd__(self, other):
346 | return MicroSeries(super().__iadd__(other), weights=self.weights)
347 |
348 | def __isub__(self, other):
349 | return MicroSeries(super().__isub__(other), weights=self.weights)
350 |
351 | def __imul__(self, other):
352 | return MicroSeries(super().__imul__(other), weights=self.weights)
353 |
354 | def __ifloordiv__(self, other):
355 | return MicroSeries(super().__ifloordiv__(other), weights=self.weights)
356 |
357 | def __idiv__(self, other):
358 | return MicroSeries(super().__idiv__(other), weights=self.weights)
359 |
360 | def __itruediv__(self, other):
361 | return MicroSeries(super().__itruediv__(other), weights=self.weights)
362 |
363 | def __imod__(self, other):
364 | return MicroSeries(super().__imod__(other), weights=self.weights)
365 |
366 | def __ipow__(self, other):
367 | return MicroSeries(super().__ipow__(other), weights=self.weights)
368 |
369 | # other
370 |
371 |     def __neg__(self):
372 |         return MicroSeries(super().__neg__(), weights=self.weights)
373 | 
374 |     def __pos__(self):
375 |         return MicroSeries(super().__pos__(), weights=self.weights)
376 |
377 | def __repr__(self):
378 | return pd.DataFrame(
379 | dict(value=self.values, weight=self.weights.values)
380 | ).__repr__()
381 |
382 |
383 | MicroSeries.SCALAR_FUNCTIONS = [
384 | fn
385 | for fn in dir(MicroSeries)
386 | if "_rtype" in dir(getattr(MicroSeries, fn))
387 | and getattr(getattr(MicroSeries, fn), "_rtype") == float
388 | ]
389 | MicroSeries.VECTOR_FUNCTIONS = [
390 | fn
391 | for fn in dir(MicroSeries)
392 | if "_rtype" in dir(getattr(MicroSeries, fn))
393 | and getattr(getattr(MicroSeries, fn), "_rtype") == pd.Series
394 | ]
395 | MicroSeries.AGNOSTIC_FUNCTIONS = ["quantile"]
396 | MicroSeries.FUNCTIONS = sum(
397 | [
398 | MicroSeries.SCALAR_FUNCTIONS,
399 | MicroSeries.VECTOR_FUNCTIONS,
400 | MicroSeries.AGNOSTIC_FUNCTIONS,
401 | ],
402 | [],
403 | )
404 |
405 |
406 | class MicroSeriesGroupBy(pd.core.groupby.generic.SeriesGroupBy):
407 | def _init(self):
408 | def _weighted_agg(name) -> Callable:
409 | def via_micro_series(row, *args, **kwargs):
410 | return getattr(MicroSeries(row.a, weights=row.w), name)(
411 | *args, **kwargs
412 | )
413 |
414 | fn = getattr(MicroSeries, name)
415 |
416 | @wraps(fn)
417 | def _weighted_agg_fn(*args, **kwargs):
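418 |                 # Rebuild each group as a MicroSeries of values and weights,
419 |                 # then dispatch the named weighted function to it.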
418 | arrays = self.apply(np.array)
419 | weights = self.weights.apply(np.array)
420 | df = pd.DataFrame(dict(a=arrays, w=weights))
421 | is_array = len(args) > 0 and hasattr(args[0], "__len__")
422 | if (
423 | name in MicroSeries.SCALAR_FUNCTIONS
424 | or name in MicroSeries.AGNOSTIC_FUNCTIONS
425 | and not is_array
426 | ):
427 | result = df.agg(
428 | lambda row: via_micro_series(row, *args, **kwargs),
429 | axis=1,
430 | )
431 | elif (
432 | name in MicroSeries.VECTOR_FUNCTIONS
433 | or name in MicroSeries.AGNOSTIC_FUNCTIONS
434 | and is_array
435 | ):
436 | result = df.apply(
437 | lambda row: via_micro_series(row, *args, **kwargs),
438 | axis=1,
439 | )
440 | return result.stack()
441 | return result
442 |
443 | return _weighted_agg_fn
444 |
445 | for fn_name in MicroSeries.FUNCTIONS:
446 | setattr(self, fn_name, _weighted_agg(fn_name))
447 |
448 |
449 | class MicroDataFrameGroupBy(pd.core.groupby.generic.DataFrameGroupBy):
450 | def _init(self, by: Union[str, list]):
451 | self.columns = list(self.obj.columns)
452 | if isinstance(by, list):
453 | for column in by:
454 | self.columns.remove(column)
455 | elif isinstance(by, str):
456 | self.columns.remove(by)
457 | self.columns.remove("__tmp_weights")
458 |         for fn_name in (
459 |             MicroSeries.SCALAR_FUNCTIONS + MicroSeries.VECTOR_FUNCTIONS
460 |         ):
461 | 
462 |             def get_fn(name):
463 |                 def fn(*args, **kwargs):
464 |                     return MicroDataFrame(
465 |                         {
466 |                             col: getattr(getattr(self, col), name)(
467 |                                 *args, **kwargs
468 |                             )
469 |                             for col in self.columns
470 |                         }
471 |                     )
472 | 
473 |                 return fn
474 | 
475 |             setattr(self, fn_name, get_fn(fn_name))
490 |
491 |
492 | class MicroDataFrame(pd.DataFrame):
493 | def __init__(self, *args, weights=None, **kwargs):
494 | """A DataFrame-inheriting class for weighted microdata.
495 | Weights can be provided at initialisation, or using set_weights or
496 | set_weight_col.
497 |
498 | :param weights: Array of weights.
499 | :type weights: np.array
500 | """
501 | super().__init__(*args, **kwargs)
502 | self.weights = None
503 | self.set_weights(weights)
504 | self._link_all_weights()
505 | self.override_df_functions()
506 |
507 | def override_df_functions(self):
508 | for name in MicroSeries.FUNCTIONS:
509 |
510 | def get_fn(name):
511 | def fn(*args, **kwargs):
512 | is_array = len(args) > 0 and hasattr(args[0], "__len__")
513 | if (
514 | name in MicroSeries.SCALAR_FUNCTIONS
515 | or name in MicroSeries.AGNOSTIC_FUNCTIONS
516 | and not is_array
517 | ):
518 | results = pd.Series(
519 | [
520 | getattr(self[col], name)(*args, **kwargs)
521 | for col in self.columns
522 | ]
523 | )
524 | results.index = self.columns
525 | return results
526 | elif (
527 | name in MicroSeries.VECTOR_FUNCTIONS
528 | or name in MicroSeries.AGNOSTIC_FUNCTIONS
529 | and is_array
530 | ):
531 | results = pd.DataFrame(
532 | [
533 | getattr(self[col], name)(*args, **kwargs)
534 | for col in self.columns
535 | ]
536 | )
537 | results.index = self.columns
538 | return results
539 |
540 | return fn
541 |
542 | setattr(self, name, get_fn(name))
543 |
544 | def get_args_as_micro_series(*kwarg_names: tuple) -> Callable:
545 | """Decorator for auto-parsing column names into MicroSeries objects.
546 | If given, kwarg_names limits arguments checked to keyword arguments
547 | specified.
548 |
549 |         :param kwarg_names: Keyword argument names to restrict to.
550 |         :type kwarg_names: tuple
551 | """
552 |
553 | def arg_series_decorator(fn):
554 | @wraps(fn)
555 | def series_function(self, *args, **kwargs):
556 | new_args = []
557 | new_kwargs = {}
558 |                 if len(kwarg_names) == 0:
559 |                     for value in args:
560 |                         if isinstance(value, str):
561 |                             if value not in self.columns:
562 |                                 raise KeyError(f"Column not found: {value}")
563 |                             new_args += [self[value]]
564 |                         else:
565 |                             new_args += [value]
566 |                 else:
567 |                     # kwarg_names restricts parsing to keyword arguments;
568 |                     # pass positional arguments through unchanged.
569 |                     new_args = list(args)
566 | for name, value in kwargs.items():
567 | if isinstance(value, str) and (
568 | len(kwarg_names) == 0 or name in kwarg_names
569 | ):
570 | if value not in self.columns:
571 |                             raise KeyError(f"Column not found: {value}")
572 | new_kwargs[name] = self[value]
573 | else:
574 | new_kwargs[name] = value
575 | return fn(self, *new_args, **new_kwargs)
576 |
577 | return series_function
578 |
579 | return arg_series_decorator
580 |
581 | def __setitem__(self, *args, **kwargs):
582 | super().__setitem__(*args, **kwargs)
583 | self._link_all_weights()
584 |
585 | def _link_weights(self, column):
586 | # self[column] = ... triggers __setitem__, which forces pd.Series
587 | # this workaround avoids that
588 | self[column].__class__ = MicroSeries
589 | self[column].set_weights(self.weights)
590 |
591 | def _link_all_weights(self):
592 | if self.weights is None:
593 | self.set_weights(np.ones((len(self))))
594 | for column in self.columns:
595 | if column != self.weights_col:
596 | self._link_weights(column)
597 |
598 | def set_weights(self, weights) -> None:
599 | """Sets the weights for the MicroDataFrame. If a string is received,
600 | it will be assumed to be the column name of the weight column.
601 |
602 | :param weights: Array of weights.
603 | :type weights: np.array
604 | """
605 | if isinstance(weights, str):
606 | self.weights_col = weights
607 | self.weights = pd.Series(self[weights], dtype=float)
608 | elif weights is not None:
609 | self.weights_col = None
610 | with warnings.catch_warnings():
611 | warnings.filterwarnings("ignore", category=UserWarning)
612 | self.weights = pd.Series(weights, dtype=float)
613 | self._link_all_weights()
614 |
615 | def set_weight_col(self, column: str) -> None:
616 | """Sets the weights for the MicroDataFrame by specifying the name of
617 | the weight column.
618 |
619 |         :param column: Name of the weight column.
620 |         :type column: str
621 |         """
622 |         self.weights = np.array(self[column])
623 |         self.weights_col = column
624 | self._link_all_weights()
625 |
626 | def __getitem__(self, key):
627 | result = super().__getitem__(key)
628 | if isinstance(result, pd.DataFrame):
629 | try:
630 | weights = self.weights[key]
631 | except Exception:
632 | weights = self.weights
633 | return MicroDataFrame(result, weights=weights)
634 | return result
635 |
636 | def catch_series_relapse(self):
637 | for col in self.columns:
638 | if self[col].__class__ == pd.Series:
639 | self._link_weights(col)
640 |
641 | def __setattr__(self, key, value):
642 | super().__setattr__(key, value)
643 | self.catch_series_relapse()
644 |
645 | def reset_index(self):
646 | res = super().reset_index()
647 | res = MicroDataFrame(res, weights=self.weights)
648 | return res
649 |
650 | def copy(self, deep=True):
651 | res = super().copy(deep)
652 | # This changes the original columns to Series. Undo it:
653 | for col in self.columns:
654 | self[col] = MicroSeries(self[col])
655 | res = MicroDataFrame(res, weights=self.weights.copy(deep))
656 | return res
657 |
658 | def equals(self, other) -> bool:
659 | equal_values = super().equals(other)
660 | equal_weights = self.weights.equals(other.weights)
661 | return equal_values and equal_weights
662 |
663 | @get_args_as_micro_series()
664 | def groupby(self, by: Union[str, list], *args, **kwargs):
665 | """
666 | Returns a GroupBy object with MicroSeriesGroupBy objects for
667 | each column
668 |
669 | :param by: column to group by
670 | :type by: Union[str, list]
671 |
672 |         :return: DataFrameGroupBy object with columns using weights
673 |         :rtype: DataFrameGroupBy
674 | """
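675 |         # Stash weights in a temporary column so groupby splits them too.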
675 | self["__tmp_weights"] = self.weights
676 | gb = super().groupby(by, *args, **kwargs)
677 | weights = copy.deepcopy(gb["__tmp_weights"])
678 | for col in self.columns: # df.groupby(...)[col]s use weights
679 | res = gb[col]
680 | res.__class__ = MicroSeriesGroupBy
681 | res._init()
682 | res.weights = weights
683 | setattr(gb, col, res)
684 | gb.__class__ = MicroDataFrameGroupBy
685 | gb._init(by)
686 | return gb
687 |
688 | @get_args_as_micro_series()
689 | def poverty_rate(self, income: str, threshold: str) -> float:
690 | """Calculate poverty rate, i.e., the population share with income
691 | below their poverty threshold.
692 |
693 | :param income: Column indicating income.
694 | :type income: str
695 | :param threshold: Column indicating threshold.
696 | :type threshold: str
697 | :return: Poverty rate between zero and one.
698 | :rtype: float
699 | """
700 | pov = income < threshold
701 | return pov.sum() / pov.count()
702 |
703 | @get_args_as_micro_series()
704 | def deep_poverty_rate(self, income: str, threshold: str) -> float:
705 | """Calculate deep poverty rate, i.e., the population share with income
706 | below half their poverty threshold.
707 |
708 | :param income: Column indicating income.
709 | :type income: str
710 | :param threshold: Column indicating threshold.
711 | :type threshold: str
712 | :return: Deep poverty rate between zero and one.
713 | :rtype: float
714 | """
715 | pov = income < (threshold / 2)
716 | return pov.sum() / pov.count()
717 |
718 | @get_args_as_micro_series()
719 | def poverty_gap(self, income: str, threshold: str) -> float:
720 | """Calculate poverty gap, i.e., the total gap between income and
721 | poverty thresholds for all people in poverty.
722 |
723 | :param income: Column indicating income.
724 | :type income: str
725 | :param threshold: Column indicating threshold.
726 | :type threshold: str
727 | :return: Poverty gap.
728 | :rtype: float
729 | """
730 | gaps = (threshold - income)[threshold > income]
731 | return gaps.sum()
732 |
733 | @get_args_as_micro_series()
734 | def deep_poverty_gap(self, income: str, threshold: str) -> float:
735 | """Calculate deep poverty gap, i.e., the total gap between income and
736 | half of poverty thresholds for all people in deep poverty.
737 |
738 | :param income: Column indicating income.
739 | :type income: str
740 | :param threshold: Column indicating threshold.
741 | :type threshold: str
742 | :return: Deep poverty gap.
743 | :rtype: float
744 | """
745 | deep_threshold = threshold / 2
746 | gaps = (deep_threshold - income)[deep_threshold > income]
747 | return gaps.sum()
748 |
749 | @get_args_as_micro_series()
750 | def squared_poverty_gap(self, income: str, threshold: str) -> float:
751 | """Calculate squared poverty gap, i.e., the total squared gap between
752 | income and poverty thresholds for all people in poverty.
753 | Also known as the poverty severity index.
754 |
755 | :param income: Column indicating income.
756 | :type income: str
757 | :param threshold: Column indicating threshold.
758 | :type threshold: str
759 | :return: Squared poverty gap.
760 | :rtype: float
761 | """
762 | gaps = (threshold - income)[threshold > income]
763 | squared_gaps = gaps ** 2
764 | return squared_gaps.sum()
765 |
766 | @get_args_as_micro_series()
767 | def poverty_count(
768 | self,
769 | income: Union[MicroSeries, str],
770 | threshold: Union[MicroSeries, str],
771 | ) -> int:
772 | """
773 | Calculates the number of entities with income below a poverty
774 | threshold.
775 |
776 | :param income: income array or column name
777 | :type income: Union[MicroSeries, str]
778 |
779 | :param threshold: threshold array or column name
780 | :type threshold: Union[MicroSeries, str]
781 |
782 |         :return: number of entities in poverty
783 |         :rtype: int
784 | """
785 | in_poverty = income < threshold
786 | return in_poverty.sum()
787 |
788 | def __repr__(self):
789 | df = pd.DataFrame(self)
790 | df["weight"] = self.weights
791 | return df[[df.columns[-1]] + list(df.columns[:-1])].__repr__()
792 |
--------------------------------------------------------------------------------
/microdf/income_measures.py:
--------------------------------------------------------------------------------
1 | import microdf as mdf
2 |
3 | # See
4 | # https://docs.google.com/spreadsheets/d/1I-Qe8uD58bLnPkimc9eaPgs4AE7x5FZYmTZwVX_WyT8
5 | # for a comparison of income measures used here.
6 |
7 |
8 | def cash_income(df):
9 | """Calculates income after taxes and cash transfers.
10 |
11 | Defined as aftertax_income minus non-cash benefits.
12 |
13 | :param df: A Tax-Calculator pandas DataFrame with columns for
14 | * aftertax_income
15 | * housing_ben
16 | * mcaid_ben
17 | * mcare_ben
18 | * other_ben
19 | * snap_ben
20 |         * ssi_ben
21 | * tanf_ben
22 | * vet_ben
23 | * wic_ben
24 | :returns: A pandas Series with the cash income for each row in df.
25 |
26 | """
27 | return (
28 | df.aftertax_income
29 | - (1 - mdf.HOUSING_CASH_SHARE) * df.housing_ben
30 | - (1 - mdf.MCAID_CASH_SHARE) * df.mcaid_ben
31 | - (1 - mdf.MCARE_CASH_SHARE) * df.mcare_ben
32 | - (1 - mdf.OTHER_CASH_SHARE) * df.other_ben
33 | - (1 - mdf.SNAP_CASH_SHARE) * df.snap_ben
34 | - (1 - mdf.SSI_CASH_SHARE) * df.ssi_ben
35 | - (1 - mdf.TANF_CASH_SHARE) * df.tanf_ben
36 | - (1 - mdf.VET_CASH_SHARE) * df.vet_ben
37 | - (1 - mdf.WIC_CASH_SHARE) * df.wic_ben
38 | )
39 |
40 |
41 | def tpc_eci(df):
42 | """Approximates Tax Policy Center's Expanded Cash Income measure.
43 |
44 | Subtracts WIC, housing assistance, veteran's benefits, Medicare, and
45 | Medicaid from expanded_income. ECI adds income measures not modeled in
46 | Tax-Calculator, so these are ignored and will create a discrepancy
47 | compared to TPC's ECI.
48 |
49 | :param df: DataFrame with columns from Tax-Calculator.
50 | :returns: pandas Series with TPC's ECI.
51 |
52 | """
53 | return df.expanded_income - df[mdf.ECI_REMOVE_COLS].sum(axis=1)
54 |
55 |
56 | def market_income(df):
57 | """Approximates CBO's market income concept, which is income
58 | before social insurance, means-tested transfers, and taxes.
59 |
60 | :param df: DataFrame with expanded_income and benefits.
61 | :returns: pandas Series of the same length as df.
62 |
63 | """
64 | return df.expanded_income - df[mdf.BENS].sum(axis=1)
65 |
--------------------------------------------------------------------------------
/microdf/inequality.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import microdf as mdf
4 |
5 |
6 | def gini(df, col, w=None, negatives=None, groupby=None):
7 | """Calculates Gini index.
8 |
9 | :param df: DataFrame.
10 | :param col: Name of column in df representing value.
11 | :param w: Column representing weight in df.
12 | :param negatives: An optional string indicating how to treat negative
13 | values of x:
14 | 'zero' replaces negative values with zeroes.
15 | 'shift' subtracts the minimum value from all values of x,
16 | when this minimum is negative. That is, it adds the absolute
17 | minimum value.
18 | Defaults to None, which leaves negative values as they are.
19 | :param groupby: Column, or list of columns, to group by.
20 | :returns: A float, the Gini index.
21 |
22 | """
23 |
24 | def _gini(df, col, w=None, negatives=None):
25 | # Requires float numpy arrays (not pandas Series or lists) to work.
26 | x = np.array(df[col]).astype("float")
27 | if negatives == "zero":
28 | x[x < 0] = 0
29 | if negatives == "shift" and np.amin(x) < 0:
30 | x -= np.amin(x)
31 | if w is not None:
32 | w = np.array(df[w]).astype("float")
33 | sorted_indices = np.argsort(x)
34 | sorted_x = x[sorted_indices]
35 | sorted_w = w[sorted_indices]
36 | cumw = np.cumsum(sorted_w)
37 | cumxw = np.cumsum(sorted_x * sorted_w)
38 | return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / (
39 | cumxw[-1] * cumw[-1]
40 | )
41 | else:
42 | sorted_x = np.sort(x)
43 | n = len(x)
44 | cumxw = np.cumsum(sorted_x)
45 | # The above formula, with all weights equal to 1 simplifies to:
46 | return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n
47 |
48 | if groupby is None:
49 | return _gini(df, col, w, negatives)
50 | return df.groupby(groupby).apply(lambda x: _gini(x, col, w, negatives))
51 |
52 |
53 | def top_x_pct_share(df, col, top_x_pct, w=None, groupby=None):
54 | """Calculates top x% share.
55 |
56 | :param df: DataFrame.
57 | :param col: Name of column in df representing value.
58 | :param top_x_pct: Decimal between 0 and 1 of the top %, e.g. 0.1, 0.001.
59 | :param w: Column representing weight in df.
60 | :param groupby: Column, or list of columns, to group by.
61 | :returns: The share of w-weighted val held by the top x%.
62 |
63 | """
64 |
65 | def _top_x_pct_share(df, col, top_x_pct, w=None):
66 | threshold = mdf.weighted_quantile(df, col, w, 1 - top_x_pct)
67 | top_x_pct_sum = mdf.weighted_sum(df[df[col] >= threshold], col, w)
68 | total_sum = mdf.weighted_sum(df, col, w)
69 | return top_x_pct_sum / total_sum
70 |
71 | if groupby is None:
72 | return _top_x_pct_share(df, col, top_x_pct, w)
73 | return df.groupby(groupby).apply(
74 | lambda x: _top_x_pct_share(x, col, top_x_pct, w)
75 | )
76 |
77 |
78 | def bottom_x_pct_share(df, col, bottom_x_pct, w=None, groupby=None):
79 | """Calculates bottom x% share.
80 |
81 | :param df: DataFrame.
82 | :param col: Name of column in df representing value.
83 | :param bottom_x_pct: Decimal between 0 and 1 of the bottom %, e.g. 0.1, 0.001.
84 | :param w: Column representing weight in df.
85 | :param groupby: Column, or list of columns, to group by.
86 | :returns: The share of w-weighted val held by the bottom x%.
87 |
88 | """
89 | return 1 - top_x_pct_share(df, col, 1 - bottom_x_pct, w, groupby)
90 |
91 |
92 | def bottom_50_pct_share(df, col, w=None, groupby=None):
93 | """Calculates bottom 50% share.
94 |
95 | :param df: DataFrame.
96 | :param col: Name of column in df representing value.
97 | :param w: Column representing weight in df.
98 | :param groupby: Column, or list of columns, to group by.
99 | :returns: The share of w-weighted val held by the bottom 50%.
100 |
101 | """
102 | return bottom_x_pct_share(df, col, 0.5, w, groupby)
103 |
104 |
105 | def top_50_pct_share(df, col, w=None, groupby=None):
106 | """Calculates top 50% share.
107 |
108 | :param df: DataFrame.
109 | :param col: Name of column in df representing value.
110 | :param w: Column representing weight in df.
111 | :param groupby: Column, or list of columns, to group by.
112 | :returns: The share of w-weighted val held by the top 50%.
113 |
114 | """
115 | return top_x_pct_share(df, col, 0.5, w, groupby)
116 |
117 |
118 | def top_10_pct_share(df, col, w=None, groupby=None):
119 | """Calculates top 10% share.
120 |
121 | :param df: DataFrame.
122 | :param col: Name of column in df representing value.
123 | :param w: Column representing weight in df.
124 | :param groupby: Column, or list of columns, to group by.
125 | :returns: The share of w-weighted val held by the top 10%.
126 |
127 | """
128 | return top_x_pct_share(df, col, 0.1, w, groupby)
129 |
130 |
131 | def top_1_pct_share(df, col, w=None, groupby=None):
132 | """Calculates top 1% share.
133 |
134 | :param df: DataFrame.
135 | :param col: Name of column in df representing value.
136 | :param w: Column representing weight in df.
137 | :param groupby: Column, or list of columns, to group by.
138 | :returns: The share of w-weighted val held by the top 1%.
139 |
140 | """
141 | return top_x_pct_share(df, col, 0.01, w, groupby)
142 |
143 |
144 | def top_0_1_pct_share(df, col, w=None, groupby=None):
145 | """Calculates top 0.1% share.
146 |
147 | :param df: DataFrame.
148 | :param col: Name of column in df representing value.
149 | :param w: Column representing weight in df.
150 | :param groupby: Column, or list of columns, to group by.
151 | :returns: The share of w-weighted val held by the top 0.1%.
152 |
153 | """
154 | return top_x_pct_share(df, col, 0.001, w, groupby)
155 |
156 |
157 | def t10_b50(df, col, w=None, groupby=None):
158 | """Calculates ratio between the top 10% and bottom 50% shares.
159 |
160 | :param df: DataFrame.
161 | :param col: Name of column in df representing value.
162 | :param w: Column representing weight in df.
163 | :param groupby: Column, or list of columns, to group by.
164 | :returns: The share of w-weighted val held by the top 10% divided by
165 | the share of w-weighted val held by the bottom 50%.
166 |
167 | """
168 | t10 = top_10_pct_share(df, col, w, groupby)
169 | b50 = bottom_50_pct_share(df, col, w, groupby)
170 | return t10 / b50
171 |
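172 | # Illustrative usage (a sketch; the toy numbers below are hypothetical):
173 | #     import pandas as pd
174 | #     toy = pd.DataFrame({"x": [1, 2, 3, 10], "w": [1, 1, 1, 1]})
175 | #     gini(toy, "x", "w")              # Weighted Gini index of x.
176 | #     top_10_pct_share(toy, "x", "w")  # Share of x held by the top 10%.
177 | #     t10_b50(toy, "x", "w")           # Top-10%/bottom-50% share ratio.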
--------------------------------------------------------------------------------
/microdf/io.py:
--------------------------------------------------------------------------------
1 | import io
2 | import zipfile
3 | import requests
4 | import pandas as pd
5 |
6 | HEADER = {
7 | "User-Agent":
8 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) " +
9 | "AppleWebKit/537.36 (KHTML, like Gecko) " +
10 | "Chrome/50.0.2661.102 Safari/537.36"
11 | }
12 |
13 |
14 | def read_stata_zip(url: str, **kwargs) -> pd.DataFrame:
15 | """Reads zipped Stata file by URL.
16 |
17 | From https://stackoverflow.com/a/59122689/1840471
18 |
19 | Pending native support in
20 | https://github.com/pandas-dev/pandas/issues/26599.
21 |
22 | :param url: URL string of .zip file containing a single
23 | .dta file.
24 | :param **kwargs: Arguments passed to pandas.read_stata().
25 | :returns: DataFrame.
26 |
27 | """
28 | r = requests.get(url, headers=HEADER)
29 | data = io.BytesIO(r.content)
30 | with zipfile.ZipFile(data) as archive:
31 | with archive.open(archive.namelist()[0]) as stata:
32 | return pd.read_stata(stata, **kwargs)
33 |
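34 | # Illustrative usage (a sketch; the URL and columns mirror test_io.py):
35 | #     df = read_stata_zip(
36 | #         "https://www.federalreserve.gov/econres/files/scfp2016s.zip",
37 | #         columns=["wgt", "networth"],
38 | #     )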
--------------------------------------------------------------------------------
/microdf/poverty.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
5 | def fpl(people: int):
6 | """Calculates the federal poverty guideline for a household of a certain
7 | size.
8 |
9 | :param people: The number of people in the household.
10 | :returns: The federal poverty guideline for the contiguous 48 states,
11 | computed from a base amount plus a per-person amount
12 | (7820 + 4320 * people).
13 |
14 | """
15 | return 7820 + 4320 * people
16 |
17 |
18 | def poverty_rate(
19 | df: pd.DataFrame, income: str, threshold: str, w: str = None
20 | ) -> float:
21 | """Calculate poverty rate, i.e., the population share with income
22 | below their poverty threshold.
23 |
24 | :param df: DataFrame with income, threshold, and possibly weight columns
25 | for each person/household.
26 | :type df: pd.DataFrame
27 | :param income: Column indicating income.
28 | :type income: str
29 | :param threshold: Column indicating threshold.
30 | :type threshold: str
31 | :param w: Column indicating weight, defaults to None (unweighted).
32 | :type w: str, optional
33 | :return: Poverty rate between zero and one.
34 | :rtype: float
35 | """
36 | pov = df[income] < df[threshold]
37 | if w is None:
38 | return pov.mean()
39 | return (pov * df[w]).sum() / df[w].sum()
40 |
41 |
42 | def deep_poverty_rate(
43 | df: pd.DataFrame, income: str, threshold: str, w: str = None
44 | ) -> float:
45 | """Calculate deep poverty rate, i.e., the population share with income
46 | below half their poverty threshold.
47 |
48 | :param df: DataFrame with income, threshold, and possibly weight columns
49 | for each person/household.
50 | :type df: pd.DataFrame
51 | :param income: Column indicating income.
52 | :type income: str
53 | :param threshold: Column indicating threshold.
54 | :type threshold: str
55 | :param w: Column indicating weight, defaults to None (unweighted).
56 | :type w: str, optional
57 | :return: Deep poverty rate between zero and one.
58 | :rtype: float
59 | """
60 | pov = df[income] < df[threshold] / 2
61 | if w is None:
62 | return pov.mean()
63 | return (pov * df[w]).sum() / df[w].sum()
64 |
65 |
66 | def poverty_gap(
67 | df: pd.DataFrame, income: str, threshold: str, w: str = None
68 | ) -> float:
69 | """Calculate poverty gap, i.e., the total gap between income and poverty
70 | thresholds for all people in poverty.
71 |
72 | :param df: DataFrame with income, threshold, and possibly weight columns
73 | for each household (data should represent households, not persons).
74 | :type df: pd.DataFrame
75 | :param income: Column indicating income.
76 | :type income: str
77 | :param threshold: Column indicating threshold.
78 | :type threshold: str
79 | :param w: Column indicating weight, defaults to None (unweighted).
80 | :type w: str, optional
81 | :return: Poverty gap.
82 | :rtype: float
83 | """
84 | gap = np.maximum(df[threshold] - df[income], 0)
85 | if w is None:
86 | return gap.sum()
87 | return (gap * df[w]).sum()
88 |
89 |
90 | def squared_poverty_gap(
91 | df: pd.DataFrame, income: str, threshold: str, w: str = None
92 | ) -> float:
93 | """Calculate squared poverty gap, i.e., the total squared gap between
94 | income and poverty thresholds for all people in poverty.
95 | Also known as poverty severity index.
96 |
97 | :param df: DataFrame with income, threshold, and possibly weight columns
98 | for each household (data should represent households, not persons).
99 | :type df: pd.DataFrame
100 | :param income: Column indicating income.
101 | :type income: str
102 | :param threshold: Column indicating threshold.
103 | :type threshold: str
104 | :param w: Column indicating weight, defaults to None (unweighted).
105 | :type w: str, optional
106 | :return: Squared poverty gap.
107 | :rtype: float
108 | """
109 | gap = np.maximum(df[threshold] - df[income], 0)
110 | sq_gap = np.power(gap, 2)
111 | if w is None:
112 | return sq_gap.sum()
113 | return (sq_gap * df[w]).sum()
114 |
115 |
116 | def deep_poverty_gap(
117 | df: pd.DataFrame, income: str, threshold: str, w: str = None
118 | ) -> float:
119 | """Calculate deep poverty gap, i.e., the total gap between income and
120 | halved poverty thresholds for all people in deep poverty.
121 |
122 | :param df: DataFrame with income, threshold, and possibly weight columns
123 | for each household (data should represent households, not persons).
124 | :type df: pd.DataFrame
125 | :param income: Column indicating income.
126 | :type income: str
127 | :param threshold: Column indicating threshold.
128 | :type threshold: str
129 | :param w: Column indicating weight, defaults to None (unweighted).
130 | :type w: str, optional
131 | :return: Deep poverty gap.
132 | :rtype: float
133 | """
134 | gap = np.maximum((df[threshold] / 2) - df[income], 0)
135 | if w is None:
136 | return gap.sum()
137 | return (gap * df[w]).sum()
138 |
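139 | # Worked example (a sketch; the numbers mirror test_poverty.py):
140 | #     d = pd.DataFrame({"income": [-10, 0, 10, 20],
141 | #                       "threshold": [15, 10, 15, 10],
142 | #                       "weight": [1, 2, 3, 4]})
143 | #     poverty_rate(d, "income", "threshold")            # 3/4 in poverty.
144 | #     poverty_rate(d, "income", "threshold", "weight")  # (1+2+3)/10 = 0.6.
145 | #     poverty_gap(d, "income", "threshold")             # 25 + 10 + 5 = 40.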
--------------------------------------------------------------------------------
/microdf/style.py:
--------------------------------------------------------------------------------
1 | TITLE_COLOR = "#212121"
2 | AXIS_COLOR = "#757575"
3 | GRID_COLOR = "#eeeeee" # Previously lighter #f5f5f5.
4 | DPI = 200
5 |
6 |
7 | def set_plot_style(dpi: int = DPI):
8 | """Set plot style.
9 |
10 | :param dpi: DPI for saving and displaying figures, defaults to microdf.DPI
11 | (200).
12 | :type dpi: int, optional
13 | """
14 | try:
15 | import seaborn as sns
16 | import matplotlib as mpl
17 | import matplotlib.font_manager as fm
18 | except ImportError:
19 | raise ImportError(
20 | "The function you've called requires extra dependencies. " +
21 | "Please install microdf with the 'charts' extra by running " +
22 | "'pip install microdf[charts]'"
23 | )
24 |
25 | sns.set_style("white")
26 |
27 | # Set up Roboto. Must be downloaded in the current directory.
28 | # See https://stackoverflow.com/a/51844978/1840471.
29 | fm.fontManager.addfont("Roboto-Regular.ttf")
30 |
31 | STYLE = {
32 | "savefig.dpi": dpi,
33 | "figure.dpi": dpi,
34 | "figure.figsize": (6.4, 4.8), # Default.
35 | "font.sans-serif": "Roboto",
36 | "font.family": "sans-serif",
37 | # Set title text color to dark gray (https://material.io/color) not
38 | # black.
39 | "text.color": TITLE_COLOR,
40 | # Axis titles and tick marks are medium gray.
41 | "axes.labelcolor": AXIS_COLOR,
42 | "xtick.color": AXIS_COLOR,
43 | "ytick.color": AXIS_COLOR,
44 | # Grid is light gray.
45 | "axes.grid": True,
46 | "grid.color": GRID_COLOR,
47 | # Equivalent to seaborn.despine(left=True, bottom=True).
48 | "axes.spines.left": False,
49 | "axes.spines.right": False,
50 | "axes.spines.top": False,
51 | "axes.spines.bottom": False,
52 | }
53 |
54 | mpl.rcParams.update(STYLE)
55 |
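56 | # Illustrative usage (a sketch; requires the optional charts dependencies
57 | # and Roboto-Regular.ttf in the working directory):
58 | #     set_plot_style(dpi=300)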
--------------------------------------------------------------------------------
/microdf/tax.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
5 | def mtr(val, brackets, rates):
6 | """Calculates the marginal tax rate applied to a value depending on a
7 | tax schedule.
8 |
9 | :param val: Value to assess tax on, e.g. wealth or income (list or Series).
10 | :param brackets: Left side of each bracket (list or Series).
11 | :param rates: Rate corresponding to each bracket.
12 | :returns: Series of the same size as val representing the marginal tax rate.
13 |
14 | """
15 | df_tax = pd.DataFrame({"brackets": brackets, "rates": rates})
16 | df_tax["base_tax"] = (
17 | df_tax.brackets.sub(df_tax.brackets.shift(fill_value=0))
18 | .mul(df_tax.rates.shift(fill_value=0))
19 | .cumsum()
20 | )
21 | rows = df_tax.brackets.searchsorted(val, side="right") - 1
22 | income_bracket_df = df_tax.loc[rows].reset_index(drop=True)
23 | return income_bracket_df.rates
24 |
25 |
26 | def tax_from_mtrs(
27 | val,
28 | brackets,
29 | rates,
30 | avoidance_rate=0,
31 | avoidance_elasticity=0,
32 | avoidance_elasticity_flat=0,
33 | ):
34 | """Calculates tax liability based on a marginal tax rate schedule.
35 |
36 | :param val: Value to assess tax on, e.g. wealth or income (list or Series).
37 | :param brackets: Left side of each bracket (list or Series).
38 | :param rates: Rate corresponding to each bracket.
39 | :param avoidance_rate: Constant avoidance/evasion rate as a decimal
40 | fraction, e.g. 0.1 for 10 percent. Defaults to zero.
41 | :param avoidance_elasticity: Avoidance/evasion elasticity.
42 | Response of log taxable value with respect
43 | to tax rate.
44 | Defaults to zero. Should be positive.
45 | :param avoidance_elasticity_flat: Response of taxable value with respect
46 | to tax rate.
47 | Use avoidance_elasticity in most cases.
48 | Defaults to zero. Should be positive.
49 | :returns: Series of tax liabilities with the same size as val.
50 |
51 | """
52 | assert (
53 | (avoidance_rate != 0)
54 | + (avoidance_elasticity != 0)
55 | + (avoidance_elasticity_flat != 0)
56 | ) <= 1, "Cannot supply multiple avoidance parameters."
57 | assert (
58 | avoidance_elasticity >= 0
59 | ), "Provide nonnegative avoidance_elasticity."
60 | df_tax = pd.DataFrame({"brackets": brackets, "rates": rates})
61 | df_tax["base_tax"] = (
62 | df_tax.brackets.sub(df_tax.brackets.shift(fill_value=0))
63 | .mul(df_tax.rates.shift(fill_value=0))
64 | .cumsum()
65 | )
66 | if avoidance_rate == 0: # Only need MTRs if elasticity is supplied.
67 | mtrs = mtr(val, brackets, rates)
68 | if avoidance_elasticity > 0:
69 | avoidance_rate = 1 - np.exp(-avoidance_elasticity * mtrs)
70 | if avoidance_elasticity_flat > 0:
71 | avoidance_rate = avoidance_elasticity_flat * mtrs
72 | taxable = pd.Series(val) * (1 - avoidance_rate)
73 | rows = df_tax.brackets.searchsorted(taxable, side="right") - 1
74 | income_bracket_df = df_tax.loc[rows].reset_index(drop=True)
75 | return (
76 | pd.Series(taxable)
77 | .sub(income_bracket_df.brackets)
78 | .mul(income_bracket_df.rates)
79 | .add(income_bracket_df.base_tax)
80 | )
81 |
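82 | # Worked example (a sketch; the schedule mirrors test_tax.py):
83 | # 0% up to 10,000, then 10% above that.
84 | #     mtr([5e3, 20e3], [0, 10e3], [0, 0.1])       # -> rates [0.0, 0.1]
85 | #     tax_from_mtrs([20e3], [0, 10e3], [0, 0.1])  # -> [1e3], since
86 | #     # 0.1 * (20e3 - 10e3) = 1,000 is owed on income above 10,000.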
--------------------------------------------------------------------------------
/microdf/taxcalc.py:
--------------------------------------------------------------------------------
1 | import microdf as mdf
2 | from microdf._optional import import_optional_dependency
3 |
4 |
5 | def static_baseline_calc(recs, year):
6 | """Creates a static Calculator object.
7 |
8 | :param recs: Records object.
9 | :param year: Year to advance to.
10 | :returns: Calculator object.
11 |
12 | """
13 | tc = import_optional_dependency("taxcalc")
14 | calc = tc.Calculator(records=recs, policy=tc.Policy())
15 | calc.advance_to_year(year)
16 | calc.calc_all()
17 | return calc
18 |
19 |
20 | def add_weighted_metrics(df, metric_vars, w="s006", divisor=1e6, suffix="_m"):
21 | """Adds weighted metrics in millions to a Tax-Calculator pandas DataFrame.
22 |
23 | Columns are renamed to *_m.
24 |
25 | :param df: A pandas DataFrame containing Tax-Calculator data.
26 | :param metric_vars: A list of column names to weight, or a single column
27 | name.
28 | :param w: Weight column. Defaults to s006.
29 | :param divisor: Number by which the product is divided. Defaults to 1e6.
30 | :param suffix: Suffix to add to each weighted total. Defaults to '_m'
31 | to match divisor default of 1e6.
32 | :returns: Nothing. Weighted columns are added in place.
33 |
34 | """
35 | df[w + suffix] = df[w] / divisor
36 | metric_vars = mdf.listify(metric_vars)
37 | for metric_var in metric_vars:
38 | df[metric_var + suffix] = df[metric_var] * df[w + suffix]
39 |
40 |
41 | def n65(age_head, age_spouse, elderly_dependents):
42 | """Calculates number of people in the tax unit age 65 or older.
43 |
44 | :param age_head: Series representing age_head from taxcalc data.
45 | :param age_spouse: Series representing age_spouse from taxcalc data.
46 | :param elderly_dependents: Series representing elderly_dependents from
47 | taxcalc data.
48 | :returns: Series representing the number of people age 65 or older.
49 |
50 | """
51 | return (
52 | (age_head >= 65).astype(int)
53 | + (age_spouse >= 65).astype(int)
54 | + elderly_dependents
55 | )
56 |
57 |
58 | def calc_df(
59 | records=None,
60 | policy=None,
61 | year=2020,
62 | reform=None,
63 | group_vars=None,
64 | metric_vars=None,
65 | group_n65=False,
66 | ):
67 | """Creates a pandas DataFrame for given Tax-Calculator data.
68 |
69 | s006 is always included, and RECID is used as an index.
70 |
71 | :param records: An optional Records object. If not provided, uses CPS
72 | records. (Default value = None)
73 | :param policy: An optional Policy object. If not provided, uses default
74 | Policy.
75 | :param year: An optional year to advance to. If not provided, defaults to
76 | 2020.
77 | :param reform: An optional reform to implement for the Policy object.
78 | (Default value = None)
79 | :param group_vars: An optional list of column names to include in the
80 | DataFrame. (Default value = None)
81 | :param metric_vars: An optional list of column names to include and
82 | calculate weighted sums of (in millions named as *_m) in the DataFrame.
83 | (Default value = None)
84 | :param group_n65: Whether to calculate and group by n65. Defaults to False.
85 | :returns: A pandas DataFrame. market_income is also always calculated.
86 |
87 | """
88 | tc = import_optional_dependency("taxcalc")
89 | # Assign defaults.
90 | if records is None:
91 | records = tc.Records.cps_constructor()
92 | if policy is None:
93 | policy = tc.Policy()
94 | if reform is not None:
95 | policy.implement_reform(reform)
96 | # Calculate.
97 | calc = tc.Calculator(records=records, policy=policy, verbose=False)
98 | calc.advance_to_year(year)
99 | calc.calc_all()
100 | # Get a deduplicated list of all columns.
101 | if group_n65:
102 | group_vars = mdf.listify(group_vars) + [
103 | "age_head",
104 | "age_spouse",
105 | "elderly_dependents",
106 | ]
107 | # Include expanded_income and benefits to produce market_income.
108 | all_cols = mdf.listify(
109 | [
110 | "RECID",
111 | "s006",
112 | "expanded_income",
113 | "aftertax_income",
114 | mdf.BENS,
115 | group_vars,
116 | metric_vars,
117 | ]
118 | )
119 | df = calc.dataframe(all_cols)
120 | # Create core elements.
121 | df["market_income"] = mdf.market_income(df)
122 | df["bens"] = df[mdf.BENS].sum(axis=1)
123 | df["tax"] = df.expanded_income - df.aftertax_income
124 | if group_n65:
125 | df["n65"] = n65(df.age_head, df.age_spouse, df.elderly_dependents)
126 | df.drop(
127 | ["age_head", "age_spouse", "elderly_dependents"],
128 | axis=1,
129 | inplace=True,
130 | )
131 | # Add calculated columns for metrics.
132 | mdf.add_weighted_metrics(df, metric_vars)
133 | # Set RECID to int and set it as index before returning.
134 | df["RECID"] = df.RECID.map(int)
135 | return df.set_index("RECID")
136 |
137 |
138 | def recalculate(df):
139 | """Recalculates fields in the DataFrame for after components have changed.
140 |
141 | :param df: DataFrame for use in microdf.
142 | :returns: Nothing. Updates the DataFrame in place.
143 |
144 | """
145 | # Recalculate TPC's Expanded Cash Income measure.
146 | cols = df.columns
147 | if "tpc_eci" in cols:
148 | df.tpc_eci = mdf.tpc_eci(df)
149 | # Recalculate weighted metrics (anything ending in _m).
150 | mcols = cols[cols.str.endswith("_m")]
151 | mdf.add_weighted_metrics(df, mcols)
152 |
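153 | # Illustrative usage of add_weighted_metrics (a sketch; the toy numbers
154 | # are hypothetical):
155 | #     import pandas as pd
156 | #     d = pd.DataFrame({"aftertax_income": [50e3], "s006": [2e6]})
157 | #     add_weighted_metrics(d, "aftertax_income")
158 | #     # Adds s006_m = 2.0 (weights in millions) and
159 | #     # aftertax_income_m = 50e3 * 2.0 = 100,000 (dollars in millions).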
--------------------------------------------------------------------------------
/microdf/tests/__pycache__/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "/home/mghenis/anaconda3/bin/python3"
3 | }
--------------------------------------------------------------------------------
/microdf/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope="session")
7 | def tests_path():
8 | """ """
9 | return os.path.abspath(os.path.dirname(__file__))
10 |
--------------------------------------------------------------------------------
/microdf/tests/test_compare.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | import microdf as mdf
7 |
8 |
9 | def differences(actual, expected, f_actual, f_expected):
10 | """Check for differences between results in afilename and efilename files.
11 |
12 | :param actual: Actual DataFrame.
13 | :param expected: Expected DataFrame.
14 | :param f_actual: Filename of the actual CSV.
15 | :param f_expected: Filename of the expected CSV.
16 | """
17 | if not np.allclose(actual, expected):
18 | msg = "COMPARE RESULTS DIFFER\n"
19 | msg += "-------------------------------------------------\n"
20 | msg += "--- NEW RESULTS IN {} FILE ---\n"
21 | msg += "--- if new OK, copy {} to ---\n"
22 | msg += "--- {} ---\n"
23 | msg += "--- and rerun test. ---\n"
24 | msg += "-------------------------------------------------\n"
25 | raise ValueError(msg.format(f_actual, f_actual, f_expected))
26 |
27 |
28 | def test_percentile_agg_compare(tests_path):
29 | """
30 | :param tests_path: Folder path to write test results.
31 | """
32 | N = 1000
33 | np.random.seed(0)
34 | df = pd.DataFrame({"val": np.random.rand(N), "w": np.random.rand(N)})
35 | mdf.add_weighted_quantiles(df, "val", "w")
36 | percentile_sum = df.groupby("val_percentile")[["val", "w"]].sum()
37 | F_ACTUAL = "test_percentile_actual.csv"
38 | F_EXPECTED = "test_percentile_expected.csv"
39 | percentile_sum.to_csv(os.path.join(tests_path, F_ACTUAL))
40 | # Re-read as CSV to remove index and ensure CSVs are equal.
41 | actual = pd.read_csv(os.path.join(tests_path, F_ACTUAL))
42 | expected = pd.read_csv(os.path.join(tests_path, F_EXPECTED))
43 | differences(actual, expected, F_ACTUAL, F_EXPECTED)
44 |
--------------------------------------------------------------------------------
/microdf/tests/test_generic.py:
--------------------------------------------------------------------------------
1 | from microdf.generic import MicroDataFrame, MicroSeries
2 | import numpy as np
3 | import microdf as mdf
4 | import pandas as pd
5 |
6 |
7 | def test_df_init():
8 | arr = np.array([0, 1, 1])
9 | w = np.array([3, 0, 9])
10 | df = mdf.MicroDataFrame({"a": arr}, weights=w)
11 | assert df.a.mean() == np.average(arr, weights=w)
12 |
13 | df = mdf.MicroDataFrame()
14 | df["a"] = arr
15 | df.set_weights(w)
16 | assert df.a.mean() == np.average(arr, weights=w)
17 |
18 | df = mdf.MicroDataFrame()
19 | df["a"] = arr
20 | df["w"] = w
21 | df.set_weight_col("w")
22 | assert df.a.mean() == np.average(arr, weights=w)
23 |
24 |
25 | def test_series_getitem():
26 | arr = np.array([0, 1, 1])
27 | w = np.array([3, 0, 9])
28 | s = mdf.MicroSeries(arr, weights=w)
29 | assert s[[1, 2]].sum() == np.sum(arr[[1, 2]] * w[[1, 2]])
30 |
31 | assert s[1:3].sum() == np.sum(arr[1:3] * w[1:3])
32 |
33 |
34 | def test_sum():
35 | arr = np.array([0, 1, 1])
36 | w = np.array([3, 0, 9])
37 | series = mdf.MicroSeries(arr, weights=w)
38 | assert series.sum() == (arr * w).sum()
39 |
40 | arr = np.linspace(-20, 100, 100)
41 | w = np.linspace(1, 3, 100)
42 | series = mdf.MicroSeries(arr)
43 | series.set_weights(w)
44 | assert series.sum() == (arr * w).sum()
45 |
46 | # Verify that an error is thrown when passing weights of different size
47 | # from the values.
48 | w = np.linspace(1, 3, 101)
49 | series = mdf.MicroSeries(arr)
50 | raised = False
51 | try:
52 | series.set_weights(w)
53 | except Exception:
54 | raised = True
55 | assert raised
56 |
57 | def test_mean():
58 | arr = np.array([3, 0, 2])
59 | w = np.array([4, 1, 1])
60 | series = mdf.MicroSeries(arr, weights=w)
61 | assert series.mean() == np.average(arr, weights=w)
62 |
63 | arr = np.linspace(-20, 100, 100)
64 | w = np.linspace(1, 3, 100)
65 | series = mdf.MicroSeries(arr)
66 | series.set_weights(w)
67 | assert series.mean() == np.average(arr, weights=w)
68 |
69 | w = np.linspace(1, 3, 101)
70 | series = mdf.MicroSeries(arr)
71 | raised = False
72 | try:
73 | series.set_weights(w)
74 | except Exception:
75 | raised = True
76 | assert raised
77 |
78 | def test_poverty_count():
79 | arr = np.array([10000, 20000, 50000])
80 | w = np.array([1123, 1144, 2211])
81 | df = MicroDataFrame(weights=w)
82 | df["income"] = arr
83 | df["threshold"] = 16000
84 | assert df.poverty_count("income", "threshold") == w[0]
85 |
86 |
87 | def test_median():
88 | # 1, 2, 3, 4, *4*, 4, 5, 5, 5
89 | arr = np.array([1, 2, 3, 4, 5])
90 | w = np.array([1, 1, 1, 3, 3])
91 | series = mdf.MicroSeries(arr, weights=w)
92 | assert series.median() == 4
93 |
94 |
95 | def test_unweighted_groupby():
96 | df = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]})
97 | assert (df.groupby("x").z.sum().values == np.array([5.0, 6.0])).all()
98 |
99 |
100 | def test_multiple_groupby():
101 | df = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]})
102 | assert (df.groupby(["x", "y"]).z.sum() == np.array([5, 6])).all()
103 |
104 |
105 | def test_concat():
106 | df1 = mdf.MicroDataFrame({"x": [1, 2]}, weights=[3, 4])
107 | df2 = mdf.MicroDataFrame({"y": [5, 6]}, weights=[7, 8])
108 | # Verify that pd.concat returns DataFrame (probably no way to fix this).
109 | pd_long = pd.concat([df1, df2])
110 | assert isinstance(pd_long, pd.DataFrame)
111 | assert not isinstance(pd_long, mdf.MicroDataFrame)
112 | # Verify that mdf.concat works.
113 | mdf_long = mdf.concat([df1, df2])
114 | assert isinstance(mdf_long, mdf.MicroDataFrame)
115 | # Weights should be preserved.
116 | assert mdf_long.weights.equals(pd.concat([df1.weights, df2.weights]))
117 | # Verify it works horizontally too (take the first set of weights).
118 | mdf_wide = mdf.concat([df1, df2], axis=1)
119 | assert isinstance(mdf_wide, mdf.MicroDataFrame)
120 | assert mdf_wide.weights.equals(df1.weights)
121 |
122 |
123 | def test_set_index():
124 | d = mdf.MicroDataFrame(dict(x=[1, 2, 3]), weights=[4, 5, 6])
125 | assert d.x.__class__ == MicroSeries
126 | d.index = [1, 2, 3]
127 | assert d.x.__class__ == MicroSeries
128 |
129 |
130 | def test_reset_index():
131 | d = mdf.MicroDataFrame(dict(x=[1, 2, 3]), weights=[4, 5, 6])
132 | assert d.reset_index().__class__ == MicroDataFrame
133 |
134 |
135 | def test_cumsum():
136 | s = mdf.MicroSeries([1, 2, 3], weights=[4, 5, 6])
137 | assert np.array_equal(s.cumsum().values, [4, 14, 32])
138 |
139 | s = mdf.MicroSeries([2, 1, 3], weights=[5, 4, 6])
140 | assert np.array_equal(s.cumsum().values, [10, 14, 32])
141 |
142 | s = mdf.MicroSeries([3, 1, 2], weights=[6, 4, 5])
143 | assert np.array_equal(s.cumsum().values, [18, 22, 32])
144 |
145 |
146 | def test_rank():
147 | s = mdf.MicroSeries([1, 2, 3], weights=[4, 5, 6])
148 | assert np.array_equal(s.rank().values, [4, 9, 15])
149 |
150 | s = mdf.MicroSeries([3, 1, 2], weights=[6, 4, 5])
151 | assert np.array_equal(s.rank().values, [15, 4, 9])
152 |
153 | s = mdf.MicroSeries([2, 1, 3], weights=[5, 4, 6])
154 | assert np.array_equal(s.rank().values, [9, 4, 15])
155 |
156 |
157 | def test_percentile_rank():
158 | s = mdf.MicroSeries([4, 2, 3, 1], weights=[20, 40, 20, 20])
159 | assert np.array_equal(s.percentile_rank().values, [100, 60, 80, 20])
160 |
161 |
162 | def test_quartile_rank():
163 | s = mdf.MicroSeries([4, 2, 3], weights=[25, 50, 25])
164 | assert np.array_equal(s.quartile_rank().values, [4, 2, 3])
165 |
166 |
167 | def test_quintile_rank():
168 | s = mdf.MicroSeries([4, 2, 3], weights=[20, 60, 20])
169 | assert np.array_equal(s.quintile_rank().values, [5, 3, 4])
170 |
171 |
172 | def test_decile_rank():
173 | s = mdf.MicroSeries(
174 | [5, 4, 3, 2, 1, 6, 7, 8, 9],
175 | weights=[10, 20, 10, 10, 10, 10, 10, 10, 10],
176 | )
177 | assert np.array_equal(s.decile_rank().values, [6, 5, 3, 2, 1, 7, 8, 9, 10])
178 |
179 |
180 | def test_copy_equals():
181 | d = mdf.MicroDataFrame(
182 | {"x": [1, 2], "y": [3, 4], "z": [5, 6]}, weights=[7, 8]
183 | )
184 | d_copy = d.copy()
185 | d_copy_diff_weights = d_copy.copy()
186 | d_copy_diff_weights.weights *= 2
187 | assert d.equals(d_copy)
188 | assert not d.equals(d_copy_diff_weights)
189 | # Same for a MicroSeries.
190 | assert d.x.equals(d_copy.x)
191 | assert not d.x.equals(d_copy_diff_weights.x)
192 |
193 |
194 | def test_subset():
195 | df = mdf.MicroDataFrame(
196 | {"x": [1, 2], "y": [3, 4], "z": [5, 6]}, weights=[7, 8]
197 | )
198 | df_no_z = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4]}, weights=[7, 8])
199 | assert df[["x", "y"]].equals(df_no_z)
200 | df_no_z_diff_weights = df_no_z.copy()
201 | df_no_z_diff_weights.weights += 1
202 | assert not df[["x", "y"]].equals(df_no_z_diff_weights)
203 |
204 |
205 | def test_value_subset():
206 | d = mdf.MicroDataFrame({"x": [1, 2, 3], "y": [1, 2, 2]}, weights=[4, 5, 6])
207 | d2 = d[d.y > 1]
208 | assert d2.y.shape == d2.weights.shape
209 |
--------------------------------------------------------------------------------
/microdf/tests/test_inequality.py:
--------------------------------------------------------------------------------
1 | import microdf as mdf
2 |
3 | import pandas as pd
4 |
5 |
6 | def test_top_pct():
7 | x = list(range(1, 11)) # 1 to 10. Sum = 10 * 11 / 2 = 55.
8 | df = pd.DataFrame({"x": x})
9 | ms = mdf.MicroSeries(x)
10 | RES = 10 / 55
11 | assert mdf.top_10_pct_share(df, "x") == RES
12 | assert ms.top_10_pct_share() == RES
13 | x = list(range(1, 4))
14 | df = pd.DataFrame({"x": x, "w": x})
15 | ms = mdf.MicroSeries(x, weights=x)
16 | # This is equivalent to [1, 2, 2, 3, 3, 3]
17 | # Sum = 14, top half is 9.
18 | RES = 9 / 14
19 | assert mdf.top_50_pct_share(df, "x", "w") == RES
20 | assert ms.top_50_pct_share() == RES
21 |
--------------------------------------------------------------------------------
/microdf/tests/test_io.py:
--------------------------------------------------------------------------------
1 | import microdf as mdf
2 |
3 |
4 | def test_read_stata_zip():
5 | """ """
6 | SCF2016 = "https://www.federalreserve.gov/econres/files/scfp2016s.zip"
7 | COLS = ["wgt", "networth"]
8 | df = mdf.read_stata_zip(SCF2016, columns=COLS)
9 | assert df.columns.tolist() == COLS
10 | assert df.shape[0] > 0
11 |
--------------------------------------------------------------------------------
/microdf/tests/test_optional_dependency.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import types
3 |
4 | import pytest
5 |
6 | from microdf._optional import VERSIONS, import_optional_dependency
7 |
8 |
9 | def test_import_optional():
10 | """ """
11 | match = "Missing .*notapackage.* pip .* conda .* notapackage"
12 | with pytest.raises(ImportError, match=match):
13 | import_optional_dependency("notapackage")
14 |
15 | result = import_optional_dependency("notapackage", raise_on_missing=False)
16 | assert result is None
17 |
18 |
19 | def test_xlrd_version_fallback():
20 | """ """
21 | pytest.importorskip("xlrd")
22 | import_optional_dependency("xlrd")
23 |
24 |
25 | def test_bad_version():
26 | """ """
27 | name = "fakemodule"
28 | module = types.ModuleType(name)
29 | module.__version__ = "0.9.0"
30 | sys.modules[name] = module
31 | VERSIONS[name] = "1.0.0"
32 |
33 | match = "microdf requires .*1.0.0.* of .fakemodule.*'0.9.0'"
34 | with pytest.raises(ImportError, match=match):
35 | import_optional_dependency("fakemodule")
36 |
37 | with pytest.warns(UserWarning):
38 | result = import_optional_dependency("fakemodule", on_version="warn")
39 | assert result is None
40 |
41 | module.__version__ = "1.0.0" # exact match is OK
42 | result = import_optional_dependency("fakemodule")
43 | assert result is module
44 |
45 |
46 | def test_no_version_raises():
47 | """ """
48 | name = "fakemodule"
49 | module = types.ModuleType(name)
50 | sys.modules[name] = module
51 | VERSIONS[name] = "1.0.0"
52 |
53 | with pytest.raises(ImportError, match="Can't determine .* fakemodule"):
54 | import_optional_dependency(name)
55 |
--------------------------------------------------------------------------------
/microdf/tests/test_percentile_actual.csv:
--------------------------------------------------------------------------------
1 | val_percentile,val,w
2 | 1,0.04936696707980226,5.0030480466019815
3 | 2,0.17830779834685495,5.114684559704431
4 | 3,0.2857988855674821,5.003202737366776
5 | 4,0.33805302460864795,4.733066370107501
6 | 5,0.5703591960673162,5.239285359053462
7 | 6,0.6096331244255117,5.429420284539545
8 | 7,0.5985039643349068,4.348564066721563
9 | 8,0.7510884464659845,5.440450569600465
10 | 9,1.0876105767407074,5.499343558417599
11 | 10,0.8899045403172029,5.1538348751709435
12 | 11,1.394683443411786,5.23502927227519
13 | 12,0.9305982112361821,4.417722946149524
14 | 13,0.9650569353812546,5.348799899633306
15 | 14,1.3214822270450117,5.3325704209630524
16 | 15,1.2715100776802908,5.061289605022857
17 | 16,1.6720424048893725,5.305386843295495
18 | 17,1.9424737512447825,5.255981017771979
19 | 18,1.0348899262653468,4.336843836714941
20 | 19,1.7975116284606523,5.1773144696335915
21 | 20,2.1006171498258643,5.589603555523628
22 | 21,1.8709513510005413,5.335531722226071
23 | 22,1.7207120623536305,4.695751363757509
24 | 23,2.00187368264572,5.054306807303892
25 | 24,2.313857097420784,4.884140680676319
26 | 25,2.896700637851371,5.5236380227234845
27 | 26,2.489820947315488,4.791953107597212
28 | 27,2.8411561635459712,5.5661465432344635
29 | 28,2.4261357418555654,4.446073644558097
30 | 29,3.357591023039818,5.291769801304535
31 | 30,2.314506073869091,5.149625279108836
32 | 31,3.551962771845865,5.402587912747316
33 | 32,4.002233897939344,5.093801654006394
34 | 33,3.527022677246467,5.10624451980372
35 | 34,2.654138701549258,5.043397568469286
36 | 35,2.7068858476732176,4.636131024815089
37 | 36,4.511798986257631,5.713008105710415
38 | 37,3.583675870814159,4.894200392814213
39 | 38,3.669836624431939,5.033490962922036
40 | 39,3.363028108896736,5.715252956264232
41 | 40,2.269128705743366,4.687099540814201
42 | 41,3.0962247363582,5.0205442523389685
43 | 42,5.586938042279173,5.2375613191718084
44 | 43,4.062902694908744,5.482945818496652
45 | 44,3.3160969156123503,4.938943405325701
46 | 45,4.241531465516574,5.242988142385608
47 | 46,5.632990791952207,4.604524871392653
48 | 47,4.003109598155357,5.449983106760843
49 | 48,3.627194217198549,4.5252648036434655
50 | 49,4.574489241426403,5.827965751590956
51 | 50,3.26114249800974,5.07472388942889
52 | 51,5.770693937000146,4.924561771632174
53 | 52,6.928803378451052,5.390871201576556
54 | 53,5.605614198095166,4.867506840251817
55 | 54,5.712794397822728,4.802840821886715
56 | 55,6.330048423625907,5.043215026134606
57 | 56,6.49917632389808,5.549363062730124
58 | 57,4.455249905003633,5.388617265562158
59 | 58,3.9747504049202647,4.804030090560459
60 | 59,4.591734071798827,4.79723029172527
61 | 60,9.365137265130915,5.674437752031215
62 | 61,7.18714959246965,4.968027795024001
63 | 62,5.514056987390571,5.21537151235826
64 | 63,6.199762618472243,5.007944951253934
65 | 64,4.396249378665468,5.013683226930631
66 | 65,7.678394207429429,5.4324805777516385
67 | 66,8.535490990338737,4.983177889646899
68 | 67,7.380492997015747,5.160276660633111
69 | 68,7.495870406959786,5.166303035743688
70 | 69,6.23459148472051,4.800326067433167
71 | 70,4.8795947527845165,5.144612976891027
72 | 71,6.331517082169763,4.923377667981699
73 | 72,10.028408222423991,4.7524585511425315
74 | 73,7.267010600317143,5.928033173228412
75 | 74,5.856718614341236,5.153535683492981
76 | 75,6.6513086587256645,4.4540875011243095
77 | 76,6.009374270984309,5.563696325737205
78 | 77,6.903278042059779,4.9610776312587195
79 | 78,9.33170057141403,5.442361429739662
80 | 79,4.741840898628048,4.385127095815581
81 | 80,10.441998036333615,5.681838547736346
82 | 81,8.15549502111278,5.3122709245136726
83 | 82,8.264309038308452,4.777889650212625
84 | 83,5.887391058178539,4.86549347601554
85 | 84,7.6746735994204345,5.1933876664286505
86 | 85,8.617091500906668,5.330823383917326
87 | 86,5.205242021670235,5.116263366906492
88 | 87,6.110525427774986,5.0877466081133615
89 | 88,10.555886286183668,4.846433766268649
90 | 89,9.79517076482145,5.551267106013298
91 | 90,8.999880920529858,4.8964913320490835
92 | 91,7.269834366073169,5.094073872052731
93 | 92,8.288552906946475,5.482297252107353
94 | 93,9.264743724824317,4.4854074855836235
95 | 94,8.42071960033443,5.3698127342621085
96 | 95,10.393061379249968,5.397446566018766
97 | 96,9.545057271504412,5.083199029497632
98 | 97,9.61390230771744,5.147118076577214
99 | 98,8.719700395822183,4.537389380232294
100 | 99,14.670850665733171,5.177534031309767
101 | 100,12.905505941170981,5.722129244713919
102 |
--------------------------------------------------------------------------------
/microdf/tests/test_percentile_expected.csv:
--------------------------------------------------------------------------------
1 | val_percentile,val,w
2 | 1,0.04936696707980226,5.0030480466019815
3 | 2,0.17830779834685495,5.114684559704431
4 | 3,0.2857988855674821,5.003202737366776
5 | 4,0.33805302460864795,4.733066370107501
6 | 5,0.5703591960673162,5.239285359053462
7 | 6,0.6096331244255117,5.429420284539545
8 | 7,0.5985039643349068,4.348564066721563
9 | 8,0.7510884464659845,5.440450569600465
10 | 9,1.0876105767407074,5.499343558417599
11 | 10,0.8899045403172029,5.1538348751709435
12 | 11,1.394683443411786,5.23502927227519
13 | 12,0.9305982112361821,4.417722946149524
14 | 13,0.9650569353812546,5.348799899633306
15 | 14,1.3214822270450117,5.3325704209630524
16 | 15,1.2715100776802908,5.061289605022857
17 | 16,1.6720424048893725,5.305386843295495
18 | 17,1.9424737512447825,5.255981017771979
19 | 18,1.0348899262653468,4.336843836714941
20 | 19,1.7975116284606523,5.1773144696335915
21 | 20,2.1006171498258643,5.589603555523628
22 | 21,1.8709513510005413,5.335531722226071
23 | 22,1.7207120623536305,4.695751363757509
24 | 23,2.00187368264572,5.054306807303892
25 | 24,2.313857097420784,4.884140680676319
26 | 25,2.896700637851371,5.5236380227234845
27 | 26,2.489820947315488,4.791953107597212
28 | 27,2.8411561635459712,5.5661465432344635
29 | 28,2.4261357418555654,4.446073644558097
30 | 29,3.357591023039818,5.291769801304535
31 | 30,2.314506073869091,5.149625279108836
32 | 31,3.551962771845865,5.402587912747316
33 | 32,4.002233897939344,5.093801654006394
34 | 33,3.527022677246467,5.10624451980372
35 | 34,2.654138701549258,5.043397568469286
36 | 35,2.7068858476732176,4.636131024815089
37 | 36,4.511798986257631,5.713008105710415
38 | 37,3.583675870814159,4.894200392814213
39 | 38,3.669836624431939,5.033490962922036
40 | 39,3.363028108896736,5.715252956264232
41 | 40,2.269128705743366,4.687099540814201
42 | 41,3.0962247363582,5.0205442523389685
43 | 42,5.586938042279173,5.2375613191718084
44 | 43,4.062902694908744,5.482945818496652
45 | 44,3.3160969156123503,4.938943405325701
46 | 45,4.241531465516574,5.242988142385608
47 | 46,5.632990791952207,4.604524871392653
48 | 47,4.003109598155357,5.449983106760843
49 | 48,3.627194217198549,4.5252648036434655
50 | 49,4.574489241426403,5.827965751590956
51 | 50,3.26114249800974,5.07472388942889
52 | 51,5.770693937000146,4.924561771632174
53 | 52,6.928803378451052,5.390871201576556
54 | 53,5.605614198095166,4.867506840251817
55 | 54,5.712794397822728,4.802840821886715
56 | 55,6.330048423625907,5.043215026134606
57 | 56,6.49917632389808,5.549363062730124
58 | 57,4.455249905003633,5.388617265562158
59 | 58,3.9747504049202647,4.804030090560459
60 | 59,4.591734071798827,4.79723029172527
61 | 60,9.365137265130915,5.674437752031215
62 | 61,7.18714959246965,4.968027795024001
63 | 62,5.514056987390571,5.21537151235826
64 | 63,6.199762618472243,5.007944951253934
65 | 64,4.396249378665468,5.013683226930631
66 | 65,7.678394207429429,5.4324805777516385
67 | 66,8.535490990338737,4.983177889646899
68 | 67,7.380492997015747,5.160276660633111
69 | 68,7.495870406959786,5.166303035743688
70 | 69,6.23459148472051,4.800326067433167
71 | 70,4.8795947527845165,5.144612976891027
72 | 71,6.331517082169763,4.923377667981699
73 | 72,10.028408222423991,4.7524585511425315
74 | 73,7.267010600317143,5.928033173228412
75 | 74,5.856718614341236,5.153535683492981
76 | 75,6.6513086587256645,4.4540875011243095
77 | 76,6.009374270984309,5.563696325737205
78 | 77,6.903278042059779,4.9610776312587195
79 | 78,9.33170057141403,5.442361429739662
80 | 79,4.741840898628048,4.385127095815581
81 | 80,10.441998036333615,5.681838547736346
82 | 81,8.15549502111278,5.3122709245136726
83 | 82,8.264309038308452,4.777889650212625
84 | 83,5.887391058178539,4.86549347601554
85 | 84,7.6746735994204345,5.1933876664286505
86 | 85,8.617091500906668,5.330823383917326
87 | 86,5.205242021670235,5.116263366906492
88 | 87,6.110525427774986,5.0877466081133615
89 | 88,10.555886286183668,4.846433766268649
90 | 89,9.79517076482145,5.551267106013298
91 | 90,8.999880920529858,4.8964913320490835
92 | 91,7.269834366073169,5.094073872052731
93 | 92,8.288552906946475,5.482297252107353
94 | 93,9.264743724824317,4.4854074855836235
95 | 94,8.42071960033443,5.3698127342621085
96 | 95,10.393061379249968,5.397446566018766
97 | 96,9.545057271504412,5.083199029497632
98 | 97,9.61390230771744,5.147118076577214
99 | 98,8.719700395822183,4.537389380232294
100 | 99,14.670850665733171,5.177534031309767
101 | 100,12.905505941170981,5.722129244713919
102 |
--------------------------------------------------------------------------------
/microdf/tests/test_poverty.py:
--------------------------------------------------------------------------------
1 | import microdf as mdf
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | df = pd.DataFrame(
7 | {
8 | "income": [-10, 0, 10, 20],
9 | "threshold": [15, 10, 15, 10],
10 | "weight": [1, 2, 3, 4],
11 | }
12 | )
13 | md = mdf.MicroDataFrame(df[["income", "threshold"]], weights=df.weight)
14 |
15 |
16 | def test_poverty_rate():
17 | # Unweighted
18 | assert np.allclose(mdf.poverty_rate(df, "income", "threshold"), 3 / 4)
19 | # Weighted
20 | assert np.allclose(
21 | mdf.poverty_rate(df, "income", "threshold", "weight"), 6 / 10
22 | )
23 | assert np.allclose(md.poverty_rate("income", "threshold"), 6 / 10)
24 |
25 |
26 | def test_deep_poverty_rate():
27 | # Unweighted
28 | assert np.allclose(mdf.deep_poverty_rate(df, "income", "threshold"), 2 / 4)
29 | # Weighted
30 | assert np.allclose(
31 | mdf.deep_poverty_rate(df, "income", "threshold", "weight"), 3 / 10
32 | )
33 | assert np.allclose(md.deep_poverty_rate("income", "threshold"), 3 / 10)
34 |
35 |
36 | def test_poverty_gap():
37 | # Unweighted
38 | assert np.allclose(mdf.poverty_gap(df, "income", "threshold"), 25 + 10 + 5)
39 | # Weighted
40 | RES = 25 * 1 + 10 * 2 + 5 * 3
41 | assert np.allclose(
42 | mdf.poverty_gap(df, "income", "threshold", "weight"), RES
43 | )
44 | assert np.allclose(md.poverty_gap("income", "threshold"), RES)
45 |
46 |
47 | def test_squared_poverty_gap():
48 | # Unweighted
49 | assert np.allclose(
50 | mdf.squared_poverty_gap(df, "income", "threshold"),
51 | 25 ** 2 + 10 ** 2 + 5 ** 2,
52 | )
53 | # Weighted
54 | RES = 1 * (25 ** 2) + 2 * (10 ** 2) + 3 * (5 ** 2)
55 | assert np.allclose(
56 | mdf.squared_poverty_gap(df, "income", "threshold", "weight"), RES,
57 | )
58 | assert np.allclose(md.squared_poverty_gap("income", "threshold"), RES)
59 |
60 |
61 | def test_deep_poverty_gap():
62 | # Unweighted
63 | assert np.allclose(
64 | mdf.deep_poverty_gap(df, "income", "threshold"), 17.5 + 5 + 0 + 0
65 | )
66 | # Weighted
67 | RES = 17.5 * 1 + 5 * 2 + 0 * 3 + 0 * 4
68 | assert np.allclose(
69 | mdf.deep_poverty_gap(df, "income", "threshold", "weight"), RES
70 | )
71 | # Same in MicroDataFrame.
72 | assert np.allclose(md.deep_poverty_gap("income", "threshold"), RES)
73 |
--------------------------------------------------------------------------------
/microdf/tests/test_quantile_chg.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | import microdf as mdf
4 |
5 |
6 | V1 = [1, 2, 3]
7 | V2 = [4, 5, 6]
8 | W1 = [7, 8, 9]
9 | W2 = [10, 11, 12]
10 | DF1 = pd.DataFrame({"v": V1, "w": W1})
11 | DF2 = pd.DataFrame({"v": V2, "w": W2})
12 |
13 |
14 | def test_quantile_chg():
15 | mdf.quantile_chg(DF1, DF2, "v", "w", "v", "w")
16 |
17 |
18 | def test_quantile_pct_chg_plot():
19 | mdf.quantile_pct_chg_plot(DF1, DF2, "v", "w", "v", "w")
20 |
--------------------------------------------------------------------------------
/microdf/tests/test_tax.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 |
5 | import microdf as mdf
6 |
7 |
8 | def test_tax():
9 | """ """
10 | # Consider a MTR schedule of 0% up to 10,000, then 10% after that.
11 | BRACKETS = [0, 10e3]
12 | RATES = [0, 0.1]
13 | INCOME = [0, 5e3, 10e3, 10e3 + 1, 20e3]
14 | EXPECTED = [0, 0, 0, 0.1, 1e3]
15 | res = mdf.tax_from_mtrs(INCOME, BRACKETS, RATES)
16 | pd.testing.assert_series_equal(res, pd.Series(EXPECTED))
17 | # Try with 10% avoidance.
18 | EXPECTED_10PCT_AVOIDANCE = [0, 0, 0, 0, 800.0]
19 | res_10pct_avoidance = mdf.tax_from_mtrs(INCOME, BRACKETS, RATES, 0.1)
20 | pd.testing.assert_series_equal(
21 | res_10pct_avoidance, pd.Series(EXPECTED_10PCT_AVOIDANCE)
22 | )
23 | # Try with avoidance elasticity of 2.
24 | EXPECTED_E2_AVOIDANCE = [
25 | 0,
26 | 0,
27 | 0,
28 | 0, # Taxable base becomes (10e3 + 1) * exp(-2 * 0.1), below 10e3.
29 | # Taxable base becomes 20e3 * (exp(-2 * 0.1)).
30 | 0.1 * (20e3 * np.exp(-0.2) - 10e3),
31 | ]
32 | res_e2_avoidance = mdf.tax_from_mtrs(
33 | INCOME, BRACKETS, RATES, avoidance_elasticity=2
34 | )
35 | pd.testing.assert_series_equal(
36 | res_e2_avoidance, pd.Series(EXPECTED_E2_AVOIDANCE)
37 | )
38 | # Try with flat avoidance elasticity of 2.
39 | EXPECTED_E2_AVOIDANCE_FLAT = [
40 | 0,
41 | 0,
42 | 0,
43 | 0, # Taxable base becomes (10e3 + 1) * (1 - 2 * 0.1)
44 | 600.0,
45 | ] # Taxable base becomes 20e3 * (1 - 2 * 0.1) = 16e3.
46 | res_e2_avoidance_flat = mdf.tax_from_mtrs(
47 | INCOME, BRACKETS, RATES, avoidance_elasticity_flat=2
48 | )
49 | pd.testing.assert_series_equal(
50 | res_e2_avoidance_flat, pd.Series(EXPECTED_E2_AVOIDANCE_FLAT)
51 | )
52 | # Ensure error when passing both rate and elasticity.
53 | with pytest.raises(Exception):
54 | mdf.tax_from_mtrs(INCOME, BRACKETS, RATES, 0.1, 2)
55 |
--------------------------------------------------------------------------------
/microdf/tests/test_taxcalc.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import microdf as mdf
4 |
5 |
6 | try:
7 | import taxcalc as tc
8 |
9 | _HAVE_TAXCALC = True
10 | except ImportError:
11 | _HAVE_TAXCALC = False
12 |
13 |
14 | def test_calc_df():
15 | """ """
16 | if not _HAVE_TAXCALC:
17 | pytest.skip("taxcalc is not installed")
18 | mdf.calc_df()
19 |
20 |
21 | def test_static_baseline_calc():
22 | """ """
23 | if not _HAVE_TAXCALC:
24 | pytest.skip("taxcalc is not installed")
25 | recs = tc.Records.cps_constructor()
26 | mdf.static_baseline_calc(recs, 2020)
27 |
--------------------------------------------------------------------------------
/microdf/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | import microdf as mdf
4 |
5 |
6 | def test_cartesian_product():
7 | """ """
8 | res = mdf.cartesian_product(
9 | {"a": [1, 2, 3], "b": ["val1", "val2"], "c": [100, 101]}
10 | )
11 | EXPECTED = pd.DataFrame(
12 | {
13 | "a": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
14 | "b": [
15 | "val1",
16 | "val1",
17 | "val2",
18 | "val2",
19 | "val1",
20 | "val1",
21 | "val2",
22 | "val2",
23 | "val1",
24 | "val1",
25 | "val2",
26 | "val2",
27 | ],
28 | "c": [100, 101, 100, 101, 100, 101, 100, 101, 100, 101, 100, 101],
29 | }
30 | )
31 | pd.testing.assert_frame_equal(res, EXPECTED)
32 |
33 |
34 | def test_flatten():
35 | """ """
36 | L = [[[1, 2, 3], [4, 5]], 6]
37 | res = list(mdf.flatten(L))
38 | EXPECTED = [1, 2, 3, 4, 5, 6]
39 | assert res == EXPECTED
40 |
--------------------------------------------------------------------------------
/microdf/tests/test_weighted.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 |
4 | import microdf as mdf
5 |
6 |
7 | X = [1, 5, 2]
8 | Y = [0, -6, 3]
9 | W = [4, 1, 1]
10 | df = pd.DataFrame({"x": X, "y": Y, "w": W})
11 | ms = mdf.MicroSeries(X, weights=W)
12 | md = mdf.MicroDataFrame(df[["x", "y"]], weights=W)
13 | # Also make a version with groups.
14 | df2 = df.copy(deep=True)
15 | df2.x *= 2
16 | df2.y *= 1.5
17 | dfg = pd.concat([df, df2])
18 | dfg["g"] = ["a"] * 3 + ["b"] * 3
19 | mdg = mdf.MicroDataFrame(dfg[["x", "y", "g"]], weights=dfg.w)
20 |
21 |
22 | def test_weighted_quantile():
23 | Q = [0, 0.5, 1]
24 | mdf.weighted_quantile(df, "x", "w", Q).tolist()
25 |
26 |
27 | def test_weighted_median():
28 | assert mdf.weighted_median(df, "x") == 2
29 | mdf.weighted_median(df, "x", "w")
30 | # Test with groups.
31 | mdf.weighted_median(dfg, "x", "w", "g")
32 |
33 |
34 | def test_weighted_mean():
35 | # Test unweighted.
36 | assert mdf.weighted_mean(df, "x") == 8 / 3
37 | # Test weighted.
38 | assert mdf.weighted_mean(df, "x", "w") == 11 / 6
39 | # Test weighted with multiple columns.
40 | assert mdf.weighted_mean(df, ["x", "y"], "w").tolist() == [11 / 6, -3 / 6]
41 | # Test grouped.
42 | mdf.weighted_mean(dfg, "x", "w", "g")
43 | mdf.weighted_mean(dfg, ["x", "y"], "w", "g")
44 |
45 |
46 | def test_weighted_sum():
47 | # Test unweighted.
48 | assert mdf.weighted_sum(df, "x") == 8
49 | # Test weighted.
50 | assert mdf.weighted_sum(df, "x", "w") == 11
51 | # Test weighted with multiple columns.
52 | assert mdf.weighted_sum(df, ["x", "y"], "w").tolist() == [11, -3]
53 | # Test grouped.
54 | mdf.weighted_sum(dfg, "x", "w", "g")
55 | mdf.weighted_sum(dfg, ["x", "y"], "w", "g")
56 |
57 |
58 | def test_gini():
59 | # Test nothing breaks.
60 | ms.gini()
61 | # Unweighted.
62 | mdf.gini(df, "x")
63 | # Weighted
64 | mdf.gini(df, "x", "w")
65 | # Unweighted, grouped
66 | mdf.gini(dfg, "x", groupby="g")
67 | # Weighted, grouped
68 | mdf.gini(dfg, "x", "w", groupby="g")
69 | # Test old and new match.
70 | assert ms.gini() == mdf.gini(df, "x", "w")
71 |
72 |
73 | def test_add_weighted_quantiles():
74 | with pytest.deprecated_call():
75 | mdf.add_weighted_quantiles(df, "x", "w")
76 |
--------------------------------------------------------------------------------
/microdf/ubi.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import microdf as mdf
4 |
5 |
6 | def ubi_or_bens(
7 | df,
8 | ben_cols,
9 | max_ubi="max_ubi",
10 | ubi="ubi",
11 | bens="bens",
12 | update_income_measures=None,
13 | ):
14 | """Calculates whether a tax unit will take UBI or benefits,
15 | and adjusts values accordingly.
16 |
17 | :param df: DataFrame.
18 | :param ben_cols: List of columns for benefits.
19 | :param max_ubi: Column name of the maximum UBI, before accounting
20 | for benefits. Defaults to 'max_ubi'.
21 | :param ubi: Column name to add representing the UBI. Defaults to 'ubi'.
22 | :param bens: Column name to add representing total benefits (after
23 | adjustment). Defaults to 'bens'.
24 | :param update_income_measures: List of income measures to update.
25 | Defaults to ['expanded_income', 'aftertax_income'].
26 | :returns: Nothing. Benefits in ben_cols are adjusted, ubi and bens columns
27 | are added, and expanded_income and aftertax_income are updated
28 | according to the net difference.
29 |
30 | """
31 | if update_income_measures is None:
32 | update_income_measures = ["expanded_income", "aftertax_income"]
33 | # Prep list args.
34 | update_income_measures = mdf.listify(update_income_measures)
35 | total_bens = df[ben_cols].sum(axis=1)
36 | take_ubi = df[max_ubi] > total_bens
37 | df[ubi] = np.where(take_ubi, df[max_ubi], 0)
38 | for ben in ben_cols:
39 | df[ben] *= np.where(take_ubi, 0, 1)
40 | df[bens] = df[ben_cols].sum(axis=1)
41 | # Update expanded and aftertax income.
42 | diff = df.ubi + df.bens - total_bens
43 | for i in update_income_measures:
44 | df[i] += diff
45 |
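46 | # Illustrative usage (a sketch; column names and numbers are hypothetical):
47 | #     import pandas as pd
48 | #     d = pd.DataFrame({"snap": [5e3, 1e3], "max_ubi": [4e3, 2e3],
49 | #                       "expanded_income": [20e3, 20e3],
50 | #                       "aftertax_income": [18e3, 18e3]})
51 | #     ubi_or_bens(d, ["snap"])
52 | #     # Row 0 keeps its 5e3 benefit (5e3 > 4e3 max UBI); row 1 takes the
53 | #     # 2e3 UBI, so its income measures rise by 2e3 - 1e3 = 1e3.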
--------------------------------------------------------------------------------
/microdf/utils.py:
--------------------------------------------------------------------------------
1 | import collections
2 |
3 | import pandas as pd
4 |
5 |
6 | def ordinal_label(n):
7 | """Creates ordinal label from number.
8 |
9 | Adapted from https://stackoverflow.com/a/20007730/1840471.
10 |
11 | :param n: Number.
12 | :returns: Ordinal label, e.g., 1st, 3rd, 24th, etc.
13 |
14 | """
15 | n = int(n)
16 | ix = (n // 10 % 10 != 1) * (n % 10 < 4) * n % 10
17 | return "%d%s" % (n, "tsnrhtdd"[ix::4])
18 |
19 |
20 | def dedup_list(lst):
21 | """Remove duplicate items from a list.
22 |
23 | :param lst: List.
24 | :returns: List with duplicate items removed from lst.
25 |
26 | """
27 | return list(set(lst))
28 |
29 |
30 | def listify(x, dedup=True):
31 | """Return x as a list, if it isn't one already.
32 |
33 | :param x: A single item or a list.
34 | :param dedup: Whether to remove duplicates. Defaults to True.
35 | :returns: x if x is a list, otherwise [x]. Also flattens the list
36 | and removes Nones.
37 |
38 | """
39 | if not isinstance(x, list):
40 | x = [x]
41 | res = flatten(x)
42 | res = [x for x in res if x is not None]
43 | if dedup:
44 | return dedup_list(res)
45 | return res
46 |
47 |
48 | def flatten(lst):
49 | """Flatten list. From https://stackoverflow.com/a/2158532/1840471.
50 |
51 | :param lst: List.
52 | :returns: Flattened version.
53 |
54 | """
55 | for el in lst:
56 | if isinstance(el, collections.abc.Iterable) and not isinstance(
57 | el, (str, bytes)
58 | ):
59 | yield from flatten(el)
60 | else:
61 | yield el
62 |
63 |
64 | def cartesian_product(d):
65 | """Produces a DataFrame as a Cartesian product of dictionary
66 | keys and values.
67 |
68 | :param d: Dictionary where each item's key corresponds to a column
69 | name, and each value is a list of values.
70 | :returns: DataFrame with a Cartesian product of each dictionary item.
71 |
72 | """
73 | index = pd.MultiIndex.from_product(d.values(), names=d.keys())
74 | return pd.DataFrame(index=index).reset_index()
75 |
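76 | # Illustrative usage of listify (a sketch):
77 | #     listify("a")                      # -> ["a"]
78 | #     listify([["a", None], "b", "b"])  # Flattens, drops None, dedups;
79 | #                                       # order is not guaranteed (set).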
--------------------------------------------------------------------------------
/microdf/weighted.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import warnings
4 |
5 | import microdf as mdf
6 |
7 |
8 | def weight(df, col, w=None):
9 | """Calculates the weighted value of a column in a DataFrame.
10 |
11 | :param df: A pandas DataFrame.
12 | :param col: A string indicating the column in the DataFrame to weight.
13 | Can also be a list of column strings.
14 | :param w: Weight column.
15 | :returns: A pandas Series multiplying the column by its weight.
16 |
17 | """
18 | if w is None:
19 | return df[col]
20 | return df[col].multiply(df[w], axis="index")
21 |
22 |
23 | def weighted_sum(df, col, w=None, groupby=None):
24 | """Calculates the weighted sum of a column in a DataFrame.
25 |
26 | :param df: A pandas DataFrame.
27 | :param col: A string indicating the column in the DataFrame.
28 | Can also be a list of column strings.
29 | :param w: Weight column.
30 | :param groupby: Groupby column.
31 | :returns: The weighted sum of a DataFrame's column.
32 |
33 | """
34 |
35 | def _weighted_sum(df, col, w):
36 | """ For weighted sum with provided weight. """
37 | return weight(df, col, w).sum()
38 |
39 | if groupby is None:
40 | if w is None:
41 | return df[col].sum()
42 | return _weighted_sum(df, col, w)
43 | # If grouping.
44 | if w is None:
45 | return df.groupby(groupby)[col].sum()
46 | return df.groupby(groupby).apply(lambda x: _weighted_sum(x, col, w))
47 |
48 |
49 | def weighted_mean(df, col, w=None, groupby=None):
50 | """Calculates the weighted mean of a column in a DataFrame.
51 |
52 | :param df: A pandas DataFrame.
53 | :param col: A string indicating the column in the DataFrame.
54 | Can also be a list of column strings.
55 | :param w: Weight column.
56 | :param groupby: Groupby column.
57 | :returns: The weighted mean of a DataFrame's column.
58 |
59 | """
60 |
61 |     def _weighted_mean(df, col, w):
62 |         """Weighted mean with a provided weight column."""
63 |         return weighted_sum(df, col, w) / df[w].sum()
64 |
65 | if groupby is None:
66 | if w is None:
67 | return df[col].mean()
68 | return _weighted_mean(df, col, w)
69 | # Group.
70 | if w is None:
71 | return df.groupby(groupby)[col].mean()
72 | return df.groupby(groupby).apply(lambda x: _weighted_mean(x, col, w))
73 |
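Continuing the toy example above, the weighted mean divides the weighted sum by the total weight:

    mdf.weighted_mean(df, "x", "w")  # 32 / 15 = 2.1333...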
74 |
75 | def weighted_quantile(df: pd.DataFrame, col: str, w: str, quantiles: np.ndarray):
76 | """Calculates weighted quantiles of a set of values.
77 |
78 | Doesn't exactly match unweighted quantiles of stacked values.
79 | See stackoverflow.com/q/21844024#comment102342137_29677616.
80 |
81 | :param df: DataFrame to calculate weighted quantiles from.
82 | :type df: pd.DataFrame
83 | :param col: Name of numeric column in df to calculate weighted quantiles
84 | from.
85 | :type col: str
86 |     :param w: Name of weight column in df, or None for equal weights.
87 |     :type w: str
88 |     :param quantiles: Array of quantiles to calculate.
89 |     :type quantiles: np.ndarray
90 |     :return: Array of weighted quantiles.
91 |     :rtype: np.ndarray
92 | """
93 | values = np.array(df[col])
94 | quantiles = np.array(quantiles)
95 | if w is None:
96 | sample_weight = np.ones(len(values))
97 | else:
98 | sample_weight = np.array(df[w])
99 | assert np.all(quantiles >= 0) and np.all(
100 | quantiles <= 1
101 | ), "quantiles should be in [0, 1]"
102 | sorter = np.argsort(values)
103 | values = values[sorter]
104 | sample_weight = sample_weight[sorter]
105 | weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
106 | weighted_quantiles /= np.sum(sample_weight)
107 | return np.interp(quantiles, weighted_quantiles, values)
108 |
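Because of the interpolation scheme, results can fall between observed values rather than snapping to them. Continuing the toy example above:

    import numpy as np

    mdf.weighted_quantile(df, "x", "w", np.array([0.25, 0.5, 0.75]))
    # Interpolated values between 1 and 3; e.g., the weighted median
    # here is about 2.18 rather than exactly 2.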
109 |
110 | def weighted_median(df, col, w=None, groupby=None):
111 | """Calculates the weighted median of a column in a DataFrame.
112 |
113 |     :param df: A pandas DataFrame.
114 |     :param col: A string indicating the column in the DataFrame.
115 |     :param w: Weight column.
116 |     :param groupby: Groupby column.
117 |     :returns: The weighted median of a DataFrame's column.
118 |     """
119 | 
120 | def _weighted_median(df, col, w):
121 | """ For weighted median with provided weight. """
122 | return weighted_quantile(df, col, w, 0.5)
123 |
124 | if groupby is None:
125 | if w is None:
126 | return df[col].median()
127 | return _weighted_median(df, col, w)
128 | # Group.
129 | if w is None:
130 | return df.groupby(groupby)[col].median()
131 | return df.groupby(groupby).apply(lambda x: _weighted_median(x, col, w))
132 |
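As with the other aggregators, passing groupby returns one median per group (continuing the toy example):

    mdf.weighted_median(df, "x", "w")               # ~2.18, per above
    mdf.weighted_median(df, "x", "w", groupby="g")  # one value per g group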
133 |
134 | def add_weighted_quantiles(df, col, w):
135 | """Adds weighted quantiles of a column to a DataFrame.
136 | This will be deprecated in the next minor release. Please use
137 | MicroSeries.rank instead.
138 |
139 | Adds columns for each of these types of quantiles to a DataFrame:
140 | * *_percentile_exact: Exact percentile.
141 | * *_percentile: Integer percentile (ceiling).
142 | * *_2percentile: Integer percentile (ceiling, for each two percentiles).
143 | * *_ventile: Integer percentile (ceiling, for each five percentiles).
144 | * *_decile: Integer decile.
145 | * *_quintile: Integer quintile.
146 | * *_quartile: Integer quartile.
147 |
148 |     Negative values are assigned a percentile of 0.
149 |
150 | :param df: A pandas DataFrame.
151 | :param col: A string indicating the column in the DataFrame to calculate.
152 | :param w: Weight column.
153 | :returns: Nothing. Columns are added in place. Also sorts df by col.
154 | """
155 | warnings.warn(
156 | "This will be deprecated in the next minor release. "
157 | "Please use MicroSeries.rank instead.",
158 | DeprecationWarning,
159 | )
160 | df.sort_values(by=col, inplace=True)
161 | col_pctile = col + "_percentile_exact"
162 | df[col_pctile] = 100 * df[w].cumsum() / df[w].sum()
163 | # "Null out" negatives using -1, since integer arrays can't be NaN.
164 | df[col_pctile] = np.where(df[col] >= 0, df[col_pctile], 0)
165 | # Reduce top record, otherwise it's incorrectly rounded up.
166 | df[col_pctile] = np.where(
167 | df[col_pctile] >= 99.99999, 99.99999, df[col_pctile]
168 | )
169 | df[col + "_percentile"] = np.ceil(df[col_pctile]).astype(int)
170 | df[col + "_2percentile"] = 2 * np.ceil(df[col_pctile] / 2).astype(int)
171 | df[col + "_ventile"] = 5 * np.ceil(df[col_pctile] / 5).astype(int)
172 | df[col + "_decile"] = np.ceil(df[col_pctile] / 10).astype(int)
173 | df[col + "_quintile"] = np.ceil(df[col_pctile] / 20).astype(int)
174 | df[col + "_quartile"] = np.ceil(df[col_pctile] / 25).astype(int)
175 |
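A usage sketch; note the call mutates (and sorts) df in place and raises a DeprecationWarning:

    mdf.add_weighted_quantiles(df, "x", "w")
    # df now also has x_percentile_exact, x_percentile, x_2percentile,
    # x_ventile, x_decile, x_quintile, and x_quartile columns.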
176 |
177 | def quantile_chg(df1, df2, col1, col2, w1=None, w2=None, q=None):
178 | """Create table with two sets of quantiles.
179 |
180 | :param df1: DataFrame with first set of values.
181 | :param df2: DataFrame with second set of values.
182 |     :param col1: Name of column with values in df1.
183 |     :param col2: Name of column with values in df2.
184 | :param w1: Name of weight column in df1.
185 | :param w2: Name of weight column in df2.
186 | :param q: Quantiles. Defaults to decile boundaries.
187 |     :returns: DataFrame with two rows and a column for each quantile.
188 |         Column labels are ordinal percentiles, and " (median)" is
189 |         appended to the 0.5 quantile's label if present.
190 |
191 | """
192 | if q is None:
193 | q = np.arange(0.1, 1, 0.1)
194 | q1 = weighted_quantile(df1, col1, w1, q)
195 | q2 = weighted_quantile(df2, col2, w2, q)
196 | qdf = pd.DataFrame([q1, q2])
197 | # Set decile labels.
198 |     q_print = [mdf.ordinal_label(i * 100) for i in q]
199 | try: # List index throws an error if the value is not found.
200 | median_index = q.tolist().index(0.5)
201 | q_print[median_index] += " (median)"
202 | except ValueError:
203 | pass # Don't assign median to any label.
204 | qdf.columns = q_print
205 | return qdf
206 |
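For example, comparing a baseline column against a reformed one (a sketch reusing the toy DataFrame from above; df2 is hypothetical):

    df2 = df.assign(x=df.x * 1.1)
    mdf.quantile_chg(df, df2, "x", "x", "w", "w")
    # Two rows (baseline and reform), one column per decile boundary,
    # with " (median)" appended to the 50th percentile's label.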
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | name="microdf-python",
5 | version="0.4.3",
6 | description="Survey microdata as DataFrames.",
7 | url="http://github.com/PSLmodels/microdf",
8 | author="Max Ghenis",
9 | author_email="max@ubicenter.org",
10 | license="MIT",
11 | packages=["microdf"],
12 | install_requires=[
13 | "numpy",
14 | "pandas",
15 | ],
16 | extras_require={
17 | "taxcalc": ["taxcalc"],
18 | "charts": [
19 | "seaborn",
20 | "matplotlib",
21 | "matplotlib-label-lines"
22 | ]
23 | },
24 | zip_safe=False,
25 | )
26 |
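Given the extras_require above, the optional dependency groups install via pip's extras syntax: pip install microdf-python for the core (numpy, pandas), pip install microdf-python[taxcalc] to add Tax-Calculator support, or pip install microdf-python[charts] for the plotting stack.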
--------------------------------------------------------------------------------