├── .github └── workflows │ ├── build_and_test.yml │ ├── check_jupyterbook.yml │ ├── codecov.yml │ ├── deploy_jupyterbook.yml │ └── linting.yml ├── .gitignore ├── LICENSE ├── PSL_catalog.json ├── README.md ├── ROADMAP.md ├── codecov.yml ├── docs ├── _config.yml ├── _toc.yml ├── agg.ipynb ├── charts.ipynb ├── custom_taxes.ipynb ├── demo.ipynb ├── examples.md ├── gini.ipynb ├── home.md ├── income_measures.ipynb ├── microdf_logo.png └── weighting.ipynb ├── environment.yml ├── microdf ├── __init__.py ├── _optional.py ├── agg.py ├── chart_utils.py ├── charts.py ├── concat.py ├── constants.py ├── custom_taxes.py ├── generic.py ├── income_measures.py ├── inequality.py ├── io.py ├── poverty.py ├── style.py ├── tax.py ├── taxcalc.py ├── tests │ ├── __pycache__ │ │ └── .vscode │ │ │ └── settings.json │ ├── conftest.py │ ├── test_compare.py │ ├── test_generic.py │ ├── test_inequality.py │ ├── test_io.py │ ├── test_optional_dependency.py │ ├── test_percentile_actual.csv │ ├── test_percentile_expected.csv │ ├── test_poverty.py │ ├── test_quantile_chg.py │ ├── test_tax.py │ ├── test_taxcalc.py │ ├── test_utils.py │ └── test_weighted.py ├── ubi.py ├── utils.py └── weighted.py └── setup.py /.github/workflows/build_and_test.yml: -------------------------------------------------------------------------------- 1 | name: Build and test [Python 3.9, 3.10, 3.11] 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.9", "3.10", "3.11"] 11 | 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v2 15 | with: 16 | persist-credentials: false 17 | 18 | - name: Setup Miniconda using Python ${{ matrix.python-version }} 19 | uses: conda-incubator/setup-miniconda@v2 20 | with: 21 | activate-environment: microdf 22 | environment-file: environment.yml 23 | python-version: ${{ matrix.python-version }} 24 | auto-activate-base: false 25 | 26 | - name: Build 27 | shell: bash -l {0} 28 | run: pip install -e . 29 | 30 | - name: Test 31 | shell: bash -l {0} 32 | run: pytest 33 | -------------------------------------------------------------------------------- /.github/workflows/check_jupyterbook.yml: -------------------------------------------------------------------------------- 1 | name: Test that Jupyter-Book builds 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | if: github.repository == 'PSLmodels/microdf' 6 | runs-on: ubuntu-latest 7 | steps: 8 | - name: Checkout 9 | uses: actions/checkout@v2 10 | with: 11 | persist-credentials: false 12 | 13 | - name: Setup Miniconda 14 | uses: conda-incubator/setup-miniconda@v2 15 | with: 16 | activate-environment: microdf 17 | environment-file: environment.yml 18 | python-version: 3.9 19 | auto-activate-base: false 20 | 21 | - name: Build # Build Jupyter Book 22 | shell: bash -l {0} 23 | run: | 24 | pip install -e . 25 | jb build docs/. 
26 | -------------------------------------------------------------------------------- /.github/workflows/codecov.yml: -------------------------------------------------------------------------------- 1 | name: CodeCov 2 | on: [push, pull_request] 3 | jobs: 4 | run: 5 | runs-on: ubuntu-latest 6 | env: 7 | OS: ubuntu-latest 8 | PYTHON: '3.9' 9 | steps: 10 | - uses: actions/checkout@v2 11 | with: 12 | fetch-depth: 2 13 | 14 | - name: Setup Python 15 | uses: actions/setup-python@master 16 | with: 17 | python-version: 3.9 18 | - name: Generate Report 19 | run: | 20 | pip install coverage 21 | coverage run -m unittest 22 | - name: Upload Coverage to Codecov 23 | uses: codecov/codecov-action@v1 24 | -------------------------------------------------------------------------------- /.github/workflows/deploy_jupyterbook.yml: -------------------------------------------------------------------------------- 1 | name: Build and Deploy Jupyter Book 2 | on: 3 | push: 4 | branches: 5 | - master 6 | jobs: 7 | build-and-deploy: 8 | if: github.repository == 'PSLmodels/microdf' 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout 12 | uses: actions/checkout@v2 13 | with: 14 | persist-credentials: false 15 | 16 | - name: Setup Miniconda 17 | uses: conda-incubator/setup-miniconda@v2 18 | with: 19 | activate-environment: microdf 20 | environment-file: environment.yml 21 | python-version: 3.9 22 | auto-activate-base: false 23 | 24 | - name: Build 25 | shell: bash -l {0} 26 | run: | 27 | pip install -e . 28 | jb build docs/. 29 | 30 | - name: Deploy 31 | uses: JamesIves/github-pages-deploy-action@releases/v3 32 | with: 33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | BRANCH: gh-pages # The branch the action should deploy to. 35 | FOLDER: docs/_build/html # The folder the action should deploy. 36 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | paths: 6 | - '**.py' 7 | pull_request: 8 | paths: 9 | - '**.py' 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.9] 17 | 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | with: 22 | persist-credentials: false 23 | 24 | - name: Setup Miniconda using Python ${{ matrix.python-version }} 25 | uses: conda-incubator/setup-miniconda@v2 26 | with: 27 | activate-environment: microdf 28 | environment-file: environment.yml 29 | python-version: ${{ matrix.python-version }} 30 | auto-activate-base: false 31 | 32 | - name: Lint 33 | shell: bash -l {0} 34 | run: flake8 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Setuptools distribution folder. 5 | /dist/ 6 | 7 | # Python egg metadata, regenerated from source files by setuptools. 8 | /*.egg-info 9 | 10 | .ipynb_checkpoints 11 | 12 | # Built Jupyter-Book documentation.
13 | docs/_build 14 | 15 | .vscode/settings.json 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Max Ghenis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PSL_catalog.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "microdf", 3 | "img": "https://github.com/PSLmodels/microdf/blob/master/docs/microdf_logo.png?raw=true", 4 | "banner_title": "microdf", 5 | "banner_subtitle": "Analysis tools for working with survey microdata as DataFrames", 6 | "detailed_description": "microdf is a Python package for analyzing economic microdata as pandas DataFrames, with special functions for handling sampling weights.", 7 | "policy_area": "Survey data, data analysis", 8 | "geography": "Not specific", 9 | "language": "Python", 10 | "maintainers": [ 11 | { 12 | "name": "Max Ghenis", 13 | "image": "https://policyengine.org/static/media/max-ghenis.536762d4b2439bf591f5.png", 14 | "link": "mailto:max@policyengine.org" 15 | } 16 | ], 17 | "links": { 18 | "code_repository": "https://github.com/PSLmodels/microdf", 19 | "user_documentation": "http://pslmodels.github.io/microdf/", 20 | "contributor_documentation": "", 21 | "webapp": "", 22 | "recent_changes": "https://github.com/PSLmodels/microdf/releases" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build](https://github.com/PSLmodels/microdf/workflows/Build%20and%20test%20[Python%203.9,%203.10,%203.11]/badge.svg)](https://github.com/PSLmodels/microdf/actions?query=workflow%3A%22Build+and+test+%5BPython+3.9%2C+3.10%2C+3.11%5D%22) 2 | [![Codecov](https://codecov.io/gh/PSLmodels/microdf/branch/master/graph/badge.svg)](https://codecov.io/gh/PSLmodels/microdf) 3 | 4 | # microdf 5 | Analysis tools for working with survey microdata as DataFrames. 6 | 7 | *Disclaimer: `MicroSeries` and `MicroDataFrame` are experimental features and may not consider weights after performing some operations.
See open issues.* 8 | 9 | ## Installation 10 | Install with: 11 | 12 | pip install git+https://github.com/PSLmodels/microdf.git 13 | 14 | ## Questions 15 | Contact the maintainer, Max Ghenis (mghenis@gmail.com). 16 | 17 | ## Citation 18 | You may cite the source of your analysis as "microdf release #.#.#, author's calculations." 19 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | # `microdf` roadmap 2 | 3 | `microdf` currently provides capabilities for analyzing weighted microdata, including statistics, distributional tables, graphs, and special functions for working with PSL Tax-Calculator. In the future, it will provide more functionality, including: 4 | * Charts showing distributional changes between a baseline and reform policy 5 | * Extending these charts to more than one reform 6 | * Presets for working with common datasets, e.g. suggesting the appropriate weight for SCF and CPS 7 | * Standard error calculations for surveys with replicate weight files 8 | 9 | See the [issues page](https://github.com/PSLmodels/microdf/issues) to view and suggest other items. 10 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PolicyEngine/microdf/ccf2e54e559ce7563ca9c19b144ab8d41986e1fb/codecov.yml -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings 2 | title: microdf documentation 3 | author: Max Ghenis 4 | logo: microdf_logo.png 5 | 6 | launch_buttons: 7 | colab_url: "https://colab.research.google.com" 8 | 9 | repository: 10 | url: https://github.com/PSLmodels/microdf 11 | branch: master 12 | path_to_book: docs 13 | 14 | html: 15 | use_edit_page_button : true 16 | use_repository_button : true 17 | use_issues_button : true 18 | -------------------------------------------------------------------------------- /docs/_toc.yml: -------------------------------------------------------------------------------- 1 | format: jb-article 2 | root: home 3 | sections: 4 | - file: examples 5 | sections: 6 | - file: agg 7 | - file: charts 8 | - file: custom_taxes 9 | - file: demo 10 | - file: gini 11 | - file: income_measures 12 | - file: weighting 13 | -------------------------------------------------------------------------------- /docs/agg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The `agg` function\n", 8 | "\n", 9 | "Use `agg` to see the effect of a $10,000 UBI by marital status.\n", 10 | "\n", 11 | "## Setup" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import numpy as np\n", 21 | "import pandas as pd\n", 22 | "\n", 23 | "import taxcalc as tc\n", 24 | "import microdf as mdf" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "'2.3.0'" 36 | ] 37 | }, 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "tc.__version__" 45 | ] 46 | }, 47 | { 48 | "cell_type":
"markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Load data\n", 52 | "\n", 53 | "Start with a standard `DataFrame`, then add a UBI manually in a reform copy." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "base = mdf.calc_df(group_vars=['expanded_income', 'MARS', 'XTOT'],\n", 63 | " metric_vars='aftertax_income')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "reform = base.copy(deep=True)\n", 73 | "UBI_PP = 10000\n", 74 | "reform['ubi'] = reform.XTOT * UBI_PP\n", 75 | "reform['aftertax_income'] = reform.aftertax_income + reform.ubi\n", 76 | "mdf.add_weighted_metrics(reform, 'aftertax_income')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## `agg`\n", 84 | "\n", 85 | "### Change in aftertax income by marital status." 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/html": [ 96 | "
\n", 97 | "\n", 110 | "\n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | "
aftertax_income_m_baseaftertax_income_m_reformaftertax_income_pctchg
MARS
1.03.916351e+064.939093e+060.261147
2.07.692072e+069.577865e+060.245161
4.08.531427e+051.275820e+060.495436
\n", 146 | "
" 147 | ], 148 | "text/plain": [ 149 | " aftertax_income_m_base aftertax_income_m_reform aftertax_income_pctchg\n", 150 | "MARS \n", 151 | "1.0 3.916351e+06 4.939093e+06 0.261147\n", 152 | "2.0 7.692072e+06 9.577865e+06 0.245161\n", 153 | "4.0 8.531427e+05 1.275820e+06 0.495436" 154 | ] 155 | }, 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "mdf.agg(base, reform, 'MARS', 'aftertax_income')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### Also sum baseline `expanded_income`" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 6, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/html": [ 180 | "
\n", 181 | "\n", 194 | "\n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | "
aftertax_income_m_baseexpanded_incomeaftertax_income_m_reformaftertax_income_pctchg
MARS
1.03.916351e+061.593936e+104.939093e+060.261147
2.07.692072e+066.242669e+109.577865e+060.245161
4.08.531427e+052.210208e+091.275820e+060.495436
\n", 235 | "
" 236 | ], 237 | "text/plain": [ 238 | " aftertax_income_m_base expanded_income aftertax_income_m_reform \\\n", 239 | "MARS \n", 240 | "1.0 3.916351e+06 1.593936e+10 4.939093e+06 \n", 241 | "2.0 7.692072e+06 6.242669e+10 9.577865e+06 \n", 242 | "4.0 8.531427e+05 2.210208e+09 1.275820e+06 \n", 243 | "\n", 244 | " aftertax_income_pctchg \n", 245 | "MARS \n", 246 | "1.0 0.261147 \n", 247 | "2.0 0.245161 \n", 248 | "4.0 0.495436 " 249 | ] 250 | }, 251 | "execution_count": 6, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "mdf.agg(base, reform, 'MARS', 'aftertax_income', 'expanded_income')" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "### Also sum UBI amount" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 7, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/html": [ 275 | "
\n", 276 | "\n", 289 | "\n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | "
aftertax_income_m_baseaftertax_income_m_reformubi_maftertax_income_pctchg
MARS
1.03.916351e+064.939093e+061.022742e+060.261147
2.07.692072e+069.577865e+061.885793e+060.245161
4.08.531427e+051.275820e+064.226775e+050.495436
\n", 330 | "
" 331 | ], 332 | "text/plain": [ 333 | " aftertax_income_m_base aftertax_income_m_reform ubi_m \\\n", 334 | "MARS \n", 335 | "1.0 3.916351e+06 4.939093e+06 1.022742e+06 \n", 336 | "2.0 7.692072e+06 9.577865e+06 1.885793e+06 \n", 337 | "4.0 8.531427e+05 1.275820e+06 4.226775e+05 \n", 338 | "\n", 339 | " aftertax_income_pctchg \n", 340 | "MARS \n", 341 | "1.0 0.261147 \n", 342 | "2.0 0.245161 \n", 343 | "4.0 0.495436 " 344 | ] 345 | }, 346 | "execution_count": 7, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | } 350 | ], 351 | "source": [ 352 | "mdf.add_weighted_metrics(reform, 'ubi') # Creates ubi_m = ubi * s006 / 1e6.\n", 353 | "\n", 354 | "mdf.agg(base, reform, 'MARS', 'aftertax_income', reform_metrics='ubi_m')" 355 | ] 356 | } 357 | ], 358 | "metadata": { 359 | "kernelspec": { 360 | "display_name": "Python 3", 361 | "language": "python", 362 | "name": "python3" 363 | }, 364 | "language_info": { 365 | "codemirror_mode": { 366 | "name": "ipython", 367 | "version": 3 368 | }, 369 | "file_extension": ".py", 370 | "mimetype": "text/x-python", 371 | "name": "python", 372 | "nbconvert_exporter": "python", 373 | "pygments_lexer": "ipython3", 374 | "version": "3.7.3" 375 | }, 376 | "toc": { 377 | "base_numbering": 1, 378 | "nav_menu": {}, 379 | "number_sections": true, 380 | "sideBar": true, 381 | "skip_h1_title": false, 382 | "title_cell": "Table of Contents", 383 | "title_sidebar": "Contents", 384 | "toc_cell": false, 385 | "toc_position": {}, 386 | "toc_section_display": true, 387 | "toc_window_display": false 388 | } 389 | }, 390 | "nbformat": 4, 391 | "nbformat_minor": 2 392 | } 393 | -------------------------------------------------------------------------------- /docs/custom_taxes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Custom taxes\n", 8 | "\n", 9 | "## Setup" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "ename": "ModuleNotFoundError", 19 | "evalue": "No module named 'taxcalc'", 20 | "output_type": "error", 21 | "traceback": [ 22 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 23 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 24 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtaxcalc\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmicrodf\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mmdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 25 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'taxcalc'" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "\n", 33 | "import taxcalc as tc\n", 34 | "import microdf as mdf" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "tc.__version__" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 
50 | "## Load data\n", 51 | "\n", 52 | "Start with a `DataFrame` with `aftertax_income` and necessary ingredients of `tpc_eci`. " 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "df = mdf.calc_df(group_vars=['expanded_income', 'aftertax_income'] +\n", 62 | " mdf.ECI_REMOVE_COLS,\n", 63 | " metric_vars=['XTOT'])\n", 64 | "df.columns" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Calculate Tax Policy Center's Expanded Cash Income measure, used for the analysis." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "df['tpc_eci'] = mdf.tpc_eci(df)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "Incidence of a VAT per Tax Policy Center." 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "mdf.add_vat(df)\n", 97 | "df.columns" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "df.head() # Note these are zero because we block negative tax liability." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "df.sample(5)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "Add carbon tax and financial transaction tax." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "mdf.add_carbon_tax(df)\n", 132 | "mdf.add_ftt(df)\n", 133 | "df.columns" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df.sample(5)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "VAT with a custom amount generated." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "mdf.add_vat(df, total=500e9, name='vat2')\n", 159 | "df.columns" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "mdf.weighted_sum(df, 'vat', 's006') / 1e9" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "mdf.weighted_sum(df, 'vat2', 's006') / 1e9" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Calculate by hand using `add_custom_tax`." 
185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "mdf.add_custom_tax(df, 'tpc_eci', 'XTOT_m', 'aftertax_income', \n", 194 | " mdf.VAT_INCIDENCE, 'vat3', 1e12)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "mdf.weighted_sum(df, 'vat3', 's006') / 1e9" 204 | ] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 209 | "display_name": "Python 3", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.7.9" 224 | }, 225 | "toc": { 226 | "base_numbering": 1, 227 | "nav_menu": {}, 228 | "number_sections": true, 229 | "sideBar": true, 230 | "skip_h1_title": false, 231 | "title_cell": "Table of Contents", 232 | "title_sidebar": "Contents", 233 | "toc_cell": false, 234 | "toc_position": {}, 235 | "toc_section_display": true, 236 | "toc_window_display": false 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /docs/demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `microdf` demo" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Setup" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "\n", 26 | "import taxcalc as tc\n", 27 | "import microdf as mdf\n", 28 | "\n", 29 | "import matplotlib as mpl\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "import seaborn as sns" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "Chart options." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stderr", 48 | "output_type": "stream", 49 | "text": [ 50 | "/home/mghenis/anaconda3/lib/python3.7/site-packages/microdf/style.py:24: MatplotlibDeprecationWarning: \n", 51 | "The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. 
Use FontManager.addfont instead.\n", 52 | " fm.fontManager.ttflist += fm.createFontList([\"Roboto-Regular.ttf\"])\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "mdf.set_plot_style()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Generate data" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "base = mdf.calc_df(group_vars=['expanded_income', 'MARS'],\n", 74 | " metric_vars=['aftertax_income', 'XTOT'])" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "Index(['e02400', 'mcare_ben', 'aftertax_income', 'ssi_ben', 'expanded_income',\n", 86 | " 'snap_ben', 'vet_ben', 'housing_ben', 's006', 'other_ben', 'e02300',\n", 87 | " 'mcaid_ben', 'XTOT', 'tanf_ben', 'MARS', 'wic_ben', 'market_income',\n", 88 | " 'bens', 'tax', 's006_m', 'aftertax_income_m', 'XTOT_m'],\n", 89 | " dtype='object')" 90 | ] 91 | }, 92 | "execution_count": 4, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "base.columns" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "Define a reform that treats capital gains as ordinary income and sets the top marginal rate to 70%." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "CG_REFORM = {\n", 115 | " 'CG_nodiff': {2019: True},\n", 116 | " 'II_rt7': {2019: 0.7}\n", 117 | "}" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "reform = mdf.calc_df(reform=CG_REFORM, group_vars=['MARS'], group_n65=True, \n", 127 | " metric_vars=['aftertax_income', 'XTOT'])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 7, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "Index(['vet_ben', 's006', 'e02300', 'MARS', 'e02400', 'mcare_ben', 'ssi_ben',\n", 139 | " 'snap_ben', 'housing_ben', 'other_ben', 'aftertax_income',\n", 140 | " 'expanded_income', 'mcaid_ben', 'XTOT', 'tanf_ben', 'wic_ben',\n", 141 | " 'market_income', 'bens', 'tax', 'n65', 's006_m', 'aftertax_income_m',\n", 142 | " 'XTOT_m'],\n", 143 | " dtype='object')" 144 | ] 145 | }, 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "reform.columns" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "### Calculate senior UBI.\n", 160 | "\n", 161 | "Start with total revenue ($ billions)." 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 8, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "326.110945495585" 173 | ] 174 | }, 175 | "execution_count": 8, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "new_rev_m = base.aftertax_income_m.sum() - reform.aftertax_income_m.sum()\n", 182 | "new_rev_m / 1e3" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "How many seniors are there?" 
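, "\n", "The next cell uses `add_weighted_metrics` to create `n65_m`. Assuming it follows the same pattern as the `ubi_m` comment in the `agg` notebook (value times the `s006` weight, divided by 1e6), a rough by-hand equivalent would be:\n", "\n", "```python\n", "# Hedged sketch, assuming s006 is the tax-unit weight:\n", "n65_total_m = (reform.n65 * reform.s006).sum() / 1e6  # seniors, in millions\n", "```"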
190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 9, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "59.21619976999999" 201 | ] 202 | }, 203 | "execution_count": 9, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "mdf.add_weighted_metrics(reform, 'n65')\n", 210 | "\n", 211 | "n65_total_m = reform.n65_m.sum()\n", 212 | "n65_total_m" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Divide." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 10, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "5507.123840473106" 231 | ] 232 | }, 233 | "execution_count": 10, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "senior_ubi = new_rev_m / reform.n65_m.sum()\n", 240 | "senior_ubi" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "### Add senior UBI to `aftertax_income` and recalculate" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 11, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "reform['ubi'] = senior_ubi * reform.n65\n", 257 | "reform['aftertax_income'] = reform.aftertax_income + reform.ubi\n", 258 | "mdf.add_weighted_metrics(reform, 'aftertax_income')" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 12, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "True" 270 | ] 271 | }, 272 | "execution_count": 12, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "np.allclose(base.aftertax_income_m.sum(), reform.aftertax_income_m.sum())" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "## Analyze\n", 286 | "\n", 287 | "Gini, FPL, distributional impact chart\n", 288 | "\n", 289 | "### Change to Gini index" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 13, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "0.5032911973267852" 301 | ] 302 | }, 303 | "execution_count": 13, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "mdf.gini(base, 'aftertax_income', 's006')" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 14, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "0.48752755152259336" 321 | ] 322 | }, 323 | "execution_count": 14, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "mdf.gini(reform, 'aftertax_income', 's006')" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "### Change to poverty rate\n", 337 | "\n", 338 | "Add federal poverty line with `mdf.fpl`." 
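, "\n", "The cells below flag the people in tax units whose after-tax income falls below the unit's poverty line, then compare weighted totals. Once the `fpl_XTOT_m` columns exist, a poverty *rate* under the same assumptions would be a sketch like:\n", "\n", "```python\n", "# Hedged sketch -- run after the flag columns below are created:\n", "poverty_rate_base = base.fpl_XTOT_m.sum() / base.XTOT_m.sum()\n", "poverty_rate_reform = reform.fpl_XTOT_m.sum() / reform.XTOT_m.sum()\n", "```"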
339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 15, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "base['fpl'] = mdf.fpl(base.XTOT)\n", 348 | "reform['fpl'] = mdf.fpl(reform.XTOT)\n", 349 | "\n", 350 | "base['fpl_XTOT_m'] = np.where(base.aftertax_income < base.fpl,\n", 351 | " base.XTOT_m, 0)\n", 352 | "reform['fpl_XTOT_m'] = np.where(reform.aftertax_income < reform.fpl,\n", 353 | " reform.XTOT_m, 0)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 16, 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "data": { 363 | "text/plain": [ 364 | "-0.022307196800575246" 365 | ] 366 | }, 367 | "execution_count": 16, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "reform.fpl_XTOT_m.sum() / base.fpl_XTOT_m.sum() - 1" 374 | ] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.7.9" 394 | }, 395 | "toc": { 396 | "base_numbering": 1, 397 | "nav_menu": {}, 398 | "number_sections": true, 399 | "sideBar": true, 400 | "skip_h1_title": false, 401 | "title_cell": "Table of Contents", 402 | "title_sidebar": "Contents", 403 | "toc_cell": false, 404 | "toc_position": {}, 405 | "toc_section_display": true, 406 | "toc_window_display": false 407 | } 408 | }, 409 | "nbformat": 4, 410 | "nbformat_minor": 2 411 | } -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | Examples 2 | ======== 3 | 4 | See these rendered Jupyter notebooks for examples of `microdf` usage. -------------------------------------------------------------------------------- /docs/gini.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `gini` example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import microdf as mdf\n", 17 | "\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "x = [-10, -1, 0, 5, 100]\n", 28 | "w = [1, 2, 3, 4, 5]\n", 29 | "df = pd.DataFrame({'x': x, 'w': w})" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Simple behavior" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "0.9617021276595745" 48 | ] 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "mdf.gini(df, 'x')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Dealing with negatives" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "This will be equivalent to `mdf.gini(pd.DataFrame({'x': [0, 0, 0, 5, 100]}))`." 
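, "\n", "In other words, `negatives='zero'` appears to clip negative values to zero before computing the Gini; the matching results below support this reading:\n", "\n", "```python\n", "# Hedged reading of negatives='zero', inferred from the equal results below:\n", "mdf.gini(df.assign(x=df.x.clip(lower=0)), 'x')  # == mdf.gini(df, 'x', negatives='zero')\n", "```"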
71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "0.780952380952381" 82 | ] 83 | }, 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "mdf.gini(df, 'x', negatives='zero')" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "0.780952380952381" 102 | ] 103 | }, 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "mdf.gini(pd.DataFrame({'x': [0, 0, 0, 5, 100]}), 'x')" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "This will be equivalent to `mdf.gini(pd.DataFrame({'x': [0, 9, 10, 15, 110]}))`." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "0.6277777777777778" 129 | ] 130 | }, 131 | "execution_count": 6, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "mdf.gini(df, 'x', negatives='shift')" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "0.6277777777777778" 149 | ] 150 | }, 151 | "execution_count": 7, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "mdf.gini(pd.DataFrame({'x': [0, 9, 10, 15, 110]}), 'x')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Dealing with weights" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 8, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "0.6800524934383202" 176 | ] 177 | }, 178 | "execution_count": 8, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "mdf.gini(df, 'x', 'w')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 9, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "0.6800524934383202" 196 | ] 197 | }, 198 | "execution_count": 9, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "mdf.gini(pd.DataFrame({'x': [-10,\n", 205 | " -1, -1,\n", 206 | " 0, 0, 0,\n", 207 | " 5, 5, 5, 5,\n", 208 | " 100, 100, 100, 100, 100]}),\n", 209 | " 'x')" 210 | ] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.7.9" 230 | }, 231 | "toc": { 232 | "base_numbering": 1, 233 | "nav_menu": {}, 234 | "number_sections": true, 235 | "sideBar": true, 236 | "skip_h1_title": false, 237 | "title_cell": "Table of Contents", 238 | "title_sidebar": "Contents", 239 | "toc_cell": false, 240 | "toc_position": {}, 241 | "toc_section_display": true, 242 | "toc_window_display": false 243 | } 244 | }, 245 | "nbformat": 4, 246 | 
"nbformat_minor": 2 247 | } 248 | -------------------------------------------------------------------------------- /docs/home.md: -------------------------------------------------------------------------------- 1 | `microdf` documentation 2 | ======================= 3 | 4 | This includes example notebooks, and in the future will also include function documentation. -------------------------------------------------------------------------------- /docs/income_measures.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Income measures\n", 8 | "\n", 9 | "## Setup" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 6, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "\n", 21 | "import taxcalc as tc\n", 22 | "import microdf as mdf" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 7, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "'2.3.0'" 34 | ] 35 | }, 36 | "execution_count": 7, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "tc.__version__" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Load data\n", 50 | "\n", 51 | "Start with a `DataFrame` with `expanded_income` and the variables in `expanded_income` excluded from `tpc_eci`." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 8, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df = mdf.calc_df(group_vars=['expanded_income', 'wic_ben', 'housing_ben', \n", 61 | " 'vet_ben', 'mcare_ben', 'mcaid_ben'],\n", 62 | " metric_vars=['XTOT'])" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Calculate `tpc_eci`." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 9, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "df['tpc_eci'] = mdf.tpc_eci(df)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 10, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/html": [ 89 | "
\n", 90 | "\n", 103 | "\n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | "
snap_benvet_benmcaid_benmcare_benaftertax_incomee02300ssi_benwic_bens006expanded_income...tanf_benother_bene02400XTOTmarket_incomebenstaxs006_mXTOT_mtpc_eci
RECID
10.000000.00.0000000.00000043371.0125040.00.000000.0250.1453636.919015...0.00.0000000.0000002.053636.9190150.00000010265.9065110.0002500.00050053636.919015
20.000000.00.0000000.00000020937.8865110.00.000000.0211.6318650.034959...0.00.0000000.0000003.018650.0349590.000000-2287.8515530.0002120.00063518650.034959
31734.129390.08211.59362713640.39061252516.1653970.03374.522390.0323.5052516.165397...0.06663.70162313227.0798161.00.00000052516.1653970.0000000.0003240.00032424999.433219
40.000000.08211.5936270.00000036857.7091880.00.000000.0186.3237764.286717...0.03906.5423680.0000002.025646.15072312118.135995906.5775290.0001860.00037329552.693091
50.000000.00.00000027280.78122363941.1582830.00.000000.0343.0863941.158283...0.00.00000035560.5532862.01099.82377462841.3345090.0000000.0003430.00068636660.377060
\n", 277 | "

5 rows × 21 columns

\n", 278 | "
" 279 | ], 280 | "text/plain": [ 281 | " snap_ben vet_ben mcaid_ben mcare_ben aftertax_income \\\n", 282 | "RECID \n", 283 | "1 0.00000 0.0 0.000000 0.000000 43371.012504 \n", 284 | "2 0.00000 0.0 0.000000 0.000000 20937.886511 \n", 285 | "3 1734.12939 0.0 8211.593627 13640.390612 52516.165397 \n", 286 | "4 0.00000 0.0 8211.593627 0.000000 36857.709188 \n", 287 | "5 0.00000 0.0 0.000000 27280.781223 63941.158283 \n", 288 | "\n", 289 | " e02300 ssi_ben wic_ben s006 expanded_income ... tanf_ben \\\n", 290 | "RECID ... \n", 291 | "1 0.0 0.00000 0.0 250.14 53636.919015 ... 0.0 \n", 292 | "2 0.0 0.00000 0.0 211.63 18650.034959 ... 0.0 \n", 293 | "3 0.0 3374.52239 0.0 323.50 52516.165397 ... 0.0 \n", 294 | "4 0.0 0.00000 0.0 186.32 37764.286717 ... 0.0 \n", 295 | "5 0.0 0.00000 0.0 343.08 63941.158283 ... 0.0 \n", 296 | "\n", 297 | " other_ben e02400 XTOT market_income bens \\\n", 298 | "RECID \n", 299 | "1 0.000000 0.000000 2.0 53636.919015 0.000000 \n", 300 | "2 0.000000 0.000000 3.0 18650.034959 0.000000 \n", 301 | "3 6663.701623 13227.079816 1.0 0.000000 52516.165397 \n", 302 | "4 3906.542368 0.000000 2.0 25646.150723 12118.135995 \n", 303 | "5 0.000000 35560.553286 2.0 1099.823774 62841.334509 \n", 304 | "\n", 305 | " tax s006_m XTOT_m tpc_eci \n", 306 | "RECID \n", 307 | "1 10265.906511 0.000250 0.000500 53636.919015 \n", 308 | "2 -2287.851553 0.000212 0.000635 18650.034959 \n", 309 | "3 0.000000 0.000324 0.000324 24999.433219 \n", 310 | "4 906.577529 0.000186 0.000373 29552.693091 \n", 311 | "5 0.000000 0.000343 0.000686 36660.377060 \n", 312 | "\n", 313 | "[5 rows x 21 columns]" 314 | ] 315 | }, 316 | "execution_count": 10, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "df.head()" 323 | ] 324 | } 325 | ], 326 | "metadata": { 327 | "kernelspec": { 328 | "display_name": "Python 3", 329 | "language": "python", 330 | "name": "python3" 331 | }, 332 | "language_info": { 333 | "codemirror_mode": { 334 | "name": "ipython", 335 | "version": 3 336 | }, 337 | "file_extension": ".py", 338 | "mimetype": "text/x-python", 339 | "name": "python", 340 | "nbconvert_exporter": "python", 341 | "pygments_lexer": "ipython3", 342 | "version": "3.7.3" 343 | }, 344 | "toc": { 345 | "base_numbering": 1, 346 | "nav_menu": {}, 347 | "number_sections": true, 348 | "sideBar": true, 349 | "skip_h1_title": false, 350 | "title_cell": "Table of Contents", 351 | "title_sidebar": "Contents", 352 | "toc_cell": false, 353 | "toc_position": {}, 354 | "toc_section_display": true, 355 | "toc_window_display": false 356 | } 357 | }, 358 | "nbformat": 4, 359 | "nbformat_minor": 2 360 | } 361 | -------------------------------------------------------------------------------- /docs/microdf_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PolicyEngine/microdf/ccf2e54e559ce7563ca9c19b144ab8d41986e1fb/docs/microdf_logo.png -------------------------------------------------------------------------------- /docs/weighting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Weighting in taxcalc_helpers\n", 8 | "\n", 9 | "## Setup" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "\n", 21 | "import taxcalc as tc\n", 22 | "import microdf 
as mdf" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "'3.0.0'" 34 | ] 35 | }, 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "tc.__version__" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Load data\n", 50 | "\n", 51 | "Start with a `DataFrame` with `nu18` and `XTOT`, and also calculate `XTOT_m`." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "Index(['s006', 'other_ben', 'snap_ben', 'aftertax_income', 'mcaid_ben',\n", 63 | " 'mcare_ben', 'ssi_ben', 'e02300', 'nu18', 'expanded_income',\n", 64 | " 'housing_ben', 'vet_ben', 'wic_ben', 'e02400', 'tanf_ben', 'XTOT',\n", 65 | " 'market_income', 'bens', 'tax', 's006_m', 'XTOT_m'],\n", 66 | " dtype='object')" 67 | ] 68 | }, 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "df = mdf.calc_df(group_vars=['nu18'], metric_vars=['XTOT'])\n", 76 | "df.columns" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "From this we can calculate the number of people and tax units by the tax unit's number of children." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/html": [ 94 | "
\n", 95 | "\n", 108 | "\n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | "
s006_mXTOT_m
nu18
0.0152.988772209.816367
1.022.68825354.115850
2.018.85994568.880292
3.07.43848134.795527
4.02.37111113.539261
5.00.7442765.015182
6.00.2161581.688063
7.00.0903320.790239
8.00.0265010.258552
9.00.0122380.134320
10.00.0071960.084201
12.00.0002650.003715
\n", 184 | "
" 185 | ], 186 | "text/plain": [ 187 | " s006_m XTOT_m\n", 188 | "nu18 \n", 189 | "0.0 152.988772 209.816367\n", 190 | "1.0 22.688253 54.115850\n", 191 | "2.0 18.859945 68.880292\n", 192 | "3.0 7.438481 34.795527\n", 193 | "4.0 2.371111 13.539261\n", 194 | "5.0 0.744276 5.015182\n", 195 | "6.0 0.216158 1.688063\n", 196 | "7.0 0.090332 0.790239\n", 197 | "8.0 0.026501 0.258552\n", 198 | "9.0 0.012238 0.134320\n", 199 | "10.0 0.007196 0.084201\n", 200 | "12.0 0.000265 0.003715" 201 | ] 202 | }, 203 | "execution_count": 4, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "df.groupby('nu18')[['s006_m', 'XTOT_m']].sum()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "What if we also want to calculate the total number of *children* by the tax unit's number of children?\n", 217 | "\n", 218 | "For this we can use `add_weighted_metrics`, the function called within `calc_df`." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 5, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "mdf.add_weighted_metrics(df, ['nu18'])" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "Now we can do the same thing as before, with the new `nu18_m` column." 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 6, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/html": [ 245 | "
\n", 246 | "\n", 259 | "\n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | "
nu18_m
nu18
0.00.000000
1.022.688253
2.037.719889
3.022.315444
4.09.484444
5.03.721381
6.01.296949
7.00.632325
8.00.212008
9.00.110139
10.00.071958
12.00.003184
\n", 321 | "
" 322 | ], 323 | "text/plain": [ 324 | " nu18_m\n", 325 | "nu18 \n", 326 | "0.0 0.000000\n", 327 | "1.0 22.688253\n", 328 | "2.0 37.719889\n", 329 | "3.0 22.315444\n", 330 | "4.0 9.484444\n", 331 | "5.0 3.721381\n", 332 | "6.0 1.296949\n", 333 | "7.0 0.632325\n", 334 | "8.0 0.212008\n", 335 | "9.0 0.110139\n", 336 | "10.0 0.071958\n", 337 | "12.0 0.003184" 338 | ] 339 | }, 340 | "execution_count": 6, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | } 344 | ], 345 | "source": [ 346 | "df.groupby('nu18')[['nu18_m']].sum()" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "We can also calculate weighted sums without adding the weighted metric." 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 7, 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "data": { 363 | "text/plain": [ 364 | "'Total children: 98M.'" 365 | ] 366 | }, 367 | "execution_count": 7, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "total_children = mdf.weighted_sum(df, 'nu18', 's006')\n", 374 | "# Fix this decimal.\n", 375 | "'Total children: ' + str(round(total_children / 1e6)) + 'M.'" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "We can also calculate the weighted mean and median." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 8, 388 | "metadata": {}, 389 | "outputs": [ 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "0.4782626894263673" 394 | ] 395 | }, 396 | "execution_count": 8, 397 | "metadata": {}, 398 | "output_type": "execute_result" 399 | } 400 | ], 401 | "source": [ 402 | "mdf.weighted_mean(df, 'nu18', 's006')" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 9, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "ename": "TypeError", 412 | "evalue": "weighted_quantile() missing 1 required positional argument: 'quantiles'", 413 | "output_type": "error", 414 | "traceback": [ 415 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 416 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 417 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweighted_median\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nu18'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m's006'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 418 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/microdf/weighted.py\u001b[0m in \u001b[0;36mweighted_median\u001b[0;34m(df, col, w)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \"\"\"\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mweighted_quantile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0.5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 419 | "\u001b[0;31mTypeError\u001b[0m: weighted_quantile() missing 1 required positional argument: 'quantiles'" 
420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "mdf.weighted_median(df, 'nu18', 's006')" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "We can also look at more quantiles.\n", 432 | "\n", 433 | "*Note that weighted quantiles have a different interface.*" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "decile_bounds = np.arange(0, 1.1, 0.1)\n", 443 | "deciles = mdf.weighted_quantile(df, 'nu18', 's006', decile_bounds)\n", 444 | "pd.DataFrame(deciles, index=decile_bounds)" 445 | ] 446 | } 447 | ], 448 | "metadata": { 449 | "kernelspec": { 450 | "display_name": "Python 3", 451 | "language": "python", 452 | "name": "python3" 453 | }, 454 | "language_info": { 455 | "codemirror_mode": { 456 | "name": "ipython", 457 | "version": 3 458 | }, 459 | "file_extension": ".py", 460 | "mimetype": "text/x-python", 461 | "name": "python", 462 | "nbconvert_exporter": "python", 463 | "pygments_lexer": "ipython3", 464 | "version": "3.7.9" 465 | }, 466 | "toc": { 467 | "base_numbering": 1, 468 | "nav_menu": {}, 469 | "number_sections": true, 470 | "sideBar": true, 471 | "skip_h1_title": false, 472 | "title_cell": "Table of Contents", 473 | "title_sidebar": "Contents", 474 | "toc_cell": false, 475 | "toc_position": {}, 476 | "toc_section_display": true, 477 | "toc_window_display": false 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 2 482 | } 483 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: microdf 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - codecov 6 | - flake8 7 | - matplotlib 8 | - numpy 9 | - pandas 10 | - pip 11 | - pytest 12 | - seaborn 13 | - setuptools 14 | - pip: 15 | - jupyter-book 16 | -------------------------------------------------------------------------------- /microdf/__init__.py: -------------------------------------------------------------------------------- 1 | from .agg import agg, combine_base_reform, pctchg_base_reform 2 | from .chart_utils import dollar_format, currency_format 3 | from .charts import quantile_pct_chg_plot 4 | from .concat import concat 5 | from .constants import ( 6 | BENS, 7 | ECI_REMOVE_COLS, 8 | HOUSING_CASH_SHARE, 9 | MCAID_CASH_SHARE, 10 | MCARE_CASH_SHARE, 11 | MED_BENS, 12 | OTHER_CASH_SHARE, 13 | SNAP_CASH_SHARE, 14 | SSI_CASH_SHARE, 15 | TANF_CASH_SHARE, 16 | VET_CASH_SHARE, 17 | WIC_CASH_SHARE, 18 | ) 19 | from .custom_taxes import ( 20 | CARBON_TAX_INCIDENCE, 21 | FTT_INCIDENCE, 22 | VAT_INCIDENCE, 23 | add_carbon_tax, 24 | add_custom_tax, 25 | add_ftt, 26 | add_vat, 27 | ) 28 | from .income_measures import cash_income, market_income, tpc_eci 29 | from .inequality import ( 30 | bottom_50_pct_share, 31 | bottom_x_pct_share, 32 | gini, 33 | t10_b50, 34 | top_0_1_pct_share, 35 | top_10_pct_share, 36 | top_1_pct_share, 37 | top_50_pct_share, 38 | top_x_pct_share, 39 | ) 40 | from .io import read_stata_zip 41 | from .poverty import ( 42 | fpl, 43 | poverty_rate, 44 | deep_poverty_rate, 45 | poverty_gap, 46 | squared_poverty_gap, 47 | deep_poverty_gap, 48 | ) 49 | from .style import AXIS_COLOR, DPI, GRID_COLOR, TITLE_COLOR, set_plot_style 50 | from .tax import mtr, tax_from_mtrs 51 | from .taxcalc import ( 52 | add_weighted_metrics, 53 | calc_df, 54 | n65, 55 | recalculate, 56 | static_baseline_calc, 57 | ) 58 | from .ubi import 
ubi_or_bens
59 | from .utils import (
60 | cartesian_product,
61 | dedup_list,
62 | flatten,
63 | listify,
64 | ordinal_label,
65 | )
66 | from .weighted import (
67 | add_weighted_quantiles,
68 | quantile_chg,
69 | weight,
70 | weighted_mean,
71 | weighted_median,
72 | weighted_quantile,
73 | weighted_sum,
74 | )
75 | from .generic import MicroDataFrame, MicroSeries
76 | 
77 | name = "microdf"
78 | __version__ = "0.1.0"
79 | 
80 | __all__ = [
81 | # agg.py
82 | "combine_base_reform",
83 | "pctchg_base_reform",
84 | "agg",
85 | # chart_utils.py
86 | "dollar_format",
87 | "currency_format",
88 | # charts.py
89 | "quantile_pct_chg_plot",
90 | # concat.py
91 | "concat",
92 | # constants.py
93 | "BENS",
94 | "ECI_REMOVE_COLS",
95 | "HOUSING_CASH_SHARE",
96 | "MCAID_CASH_SHARE",
97 | "MCARE_CASH_SHARE",
98 | "MED_BENS",
99 | "OTHER_CASH_SHARE",
100 | "SNAP_CASH_SHARE",
101 | "SSI_CASH_SHARE",
102 | "TANF_CASH_SHARE",
103 | "VET_CASH_SHARE",
104 | "WIC_CASH_SHARE",
105 | # custom_taxes.py
106 | "CARBON_TAX_INCIDENCE",
107 | "FTT_INCIDENCE",
108 | "VAT_INCIDENCE",
109 | "add_custom_tax",
110 | "add_vat",
111 | "add_carbon_tax",
112 | "add_ftt",
113 | # income_measures.py
114 | "cash_income",
115 | "tpc_eci",
116 | "market_income",
117 | # inequality.py
118 | "gini",
119 | "top_x_pct_share",
120 | "bottom_x_pct_share",
121 | "bottom_50_pct_share",
122 | "top_10_pct_share",
123 | "top_1_pct_share",
124 | "top_0_1_pct_share",
125 | "top_50_pct_share",
126 | "t10_b50",
127 | # io.py
128 | "read_stata_zip",
129 | # poverty.py
130 | "fpl",
131 | "poverty_rate",
132 | "deep_poverty_rate",
133 | "poverty_gap",
134 | "squared_poverty_gap",
135 | "deep_poverty_gap",
136 | # style.py
137 | "AXIS_COLOR",
138 | "DPI",
139 | "GRID_COLOR",
140 | "TITLE_COLOR",
141 | "set_plot_style",
142 | # tax.py
143 | "mtr",
144 | "tax_from_mtrs",
145 | # taxcalc.py
146 | "static_baseline_calc",
147 | "add_weighted_metrics",
148 | "n65",
149 | "calc_df",
150 | "recalculate",
151 | # ubi.py
152 | "ubi_or_bens",
153 | # utils.py
154 | "ordinal_label",
155 | "dedup_list",
156 | "listify",
157 | "flatten",
158 | "cartesian_product",
159 | # weighted.py
160 | "weight",
161 | "weighted_sum",
162 | "weighted_mean",
163 | "weighted_quantile",
164 | "weighted_median",
165 | "add_weighted_quantiles",
166 | "quantile_chg",
167 | # generic.py
168 | "MicroSeries",
169 | "MicroDataFrame",
170 | ]
171 | 
--------------------------------------------------------------------------------
/microdf/_optional.py:
--------------------------------------------------------------------------------
1 | import distutils.version
2 | import importlib
3 | import types
4 | import warnings
5 | 
6 | 
7 | # Adapted from:
8 | # https://github.com/pandas-dev/pandas/blob/master/pandas/compat/_optional.py
9 | 
10 | VERSIONS = {
11 | "taxcalc": "2.0.0",
12 | }
13 | 
14 | 
15 | def _get_version(module: types.ModuleType) -> str:
16 | """Return a module's version string.
17 | 
18 | :param module: Module whose version to look up.
19 | :type module: types.ModuleType
20 | 
21 | """
22 | version = getattr(module, "__version__", None)
23 | if version is None:
24 | # xlrd uses a capitalized attribute name
25 | version = getattr(module, "__VERSION__", None)
26 | 
27 | if version is None:
28 | raise ImportError(f"Can't determine version for {module.__name__}")
29 | return version
30 | 
31 | 
32 | def import_optional_dependency(
33 | name: str,
34 | extra: str = "",
35 | raise_on_missing: bool = True,
36 | on_version: str = "raise",
37 | ):
38 | """Import an optional dependency.
39 | By default, if a dependency is missing an ImportError with a nice
40 | message will be raised. If a dependency is present, but too old,
41 | we raise.
42 | 
43 | :param name: The module name. This should be top-level only, so that the
44 | version may be checked.
45 | :type name: str
46 | :param extra: Additional text to include in the ImportError message.
47 | :type extra: str
48 | :param raise_on_missing: Whether to raise if the optional dependency is
49 | not found. When False and the module is not present, None is returned.
50 | :type raise_on_missing: bool, default True
51 | :param on_version: What to do when a dependency's version is too old.
52 | * raise : Raise an ImportError
53 | * warn : Warn that the version is too old. Returns None
54 | * ignore: Return the module, even if the version is too old.
55 | It's expected that users validate the version locally when using on_version="ignore".
56 | :type on_version: str {'raise', 'warn', 'ignore'}
57 | """
58 | msg = (
59 | f"Missing optional dependency '{name}'. {extra} "
60 | f"Use pip or conda to install {name}."
61 | )
62 | try:
63 | module = importlib.import_module(name)
64 | except ImportError:
65 | if raise_on_missing:
66 | raise ImportError(msg) from None
67 | else:
68 | return None
69 | 
70 | minimum_version = VERSIONS.get(name)
71 | if minimum_version:
72 | version = _get_version(module)
73 | if distutils.version.LooseVersion(version) < minimum_version:
74 | assert on_version in {"warn", "raise", "ignore"}
75 | msg = (
76 | f"microdf requires version '{minimum_version}' or newer of "
77 | f"'{name}' "
78 | f"(version '{version}' currently installed)."
79 | )
80 | if on_version == "warn":
81 | warnings.warn(msg, UserWarning)
82 | return None
83 | elif on_version == "raise":
84 | raise ImportError(msg)
85 | 
86 | return module
87 | 
--------------------------------------------------------------------------------
/microdf/agg.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from typing import Optional
3 | 
4 | import microdf as mdf
5 | 
6 | 
7 | def combine_base_reform(
8 | base: pd.DataFrame,
9 | reform: pd.DataFrame,
10 | base_cols: Optional[list],
11 | cols: Optional[list],
12 | reform_cols: Optional[list],
13 | ) -> pd.DataFrame:
14 | """Combine base and reform with certain columns.
15 | 
16 | :param base: Base DataFrame. Index must match reform.
17 | :type base: pd.DataFrame
18 | :param reform: Reform DataFrame. Index must match base.
19 | :type reform: pd.DataFrame
20 | :param base_cols: Columns in base to keep.
21 | :type base_cols: list, optional
22 | :param cols: Columns to keep from both base and reform.
23 | :type cols: list, optional
24 | :param reform_cols: Columns in reform to keep.
25 | :type reform_cols: list, optional
26 | :returns: DataFrame with columns for base ("_base") and reform ("_reform").
27 | :rtype: pd.DataFrame
28 | 
29 | """
30 | all_base_cols = mdf.listify([base_cols] + [cols])
31 | all_reform_cols = mdf.listify([reform_cols] + [cols])
32 | return base[all_base_cols].join(
33 | reform[all_reform_cols], lsuffix="_base", rsuffix="_reform"
34 | )
35 | 
36 | 
37 | def pctchg_base_reform(combined: pd.DataFrame, metric: str) -> pd.Series:
38 | """Calculates the percentage change in a metric for a combined
39 | dataset.
40 | 
41 | :param combined: Combined DataFrame with _base and _reform columns.
42 | :type combined: pd.DataFrame
43 | :param metric: String of the column to calculate the difference.
44 | Must exist as metric_m_base and metric_m_reform in combined.
45 | :type metric: str
46 | :returns: Series with percentage change.
47 | :rtype: pd.Series
48 | 
49 | """
50 | return combined[metric + "_m_reform"] / combined[metric + "_m_base"] - 1
51 | 
52 | 
53 | def agg(
54 | base: pd.DataFrame,
55 | reform: pd.DataFrame,
56 | groupby: str,
57 | metrics: list,
58 | base_metrics: Optional[list],
59 | reform_metrics: Optional[list],
60 | ) -> pd.DataFrame:
61 | """Aggregates differences between base and reform.
62 | 
63 | :param base: Base DataFrame. Index must match reform.
64 | :type base: pd.DataFrame
65 | :param reform: Reform DataFrame. Index must match base.
66 | :type reform: pd.DataFrame
67 | :param groupby: Variable in base to group on.
68 | :type groupby: str
69 | :param metrics: List of variables to aggregate and calculate the % change of.
70 | These should have associated weighted columns ending in _m in base
71 | and reform.
72 | :type metrics: list
73 | :param base_metrics: List of variables from base to sum.
74 | :type base_metrics: Optional[list]
75 | :param reform_metrics: List of variables from reform to sum.
76 | :type reform_metrics: Optional[list]
77 | :returns: DataFrame with groupby and metrics, and _pctchg metrics.
78 | :rtype: pd.DataFrame
79 | 
80 | """
81 | metrics = mdf.listify(metrics)
82 | metrics_m = [i + "_m" for i in metrics]
83 | combined = combine_base_reform(
84 | base,
85 | reform,
86 | base_cols=mdf.listify([groupby, base_metrics]),
87 | cols=mdf.listify(metrics_m),
88 | reform_cols=mdf.listify(reform_metrics),
89 | )
90 | grouped = combined.groupby(groupby).sum()
91 | for metric in metrics:
92 | grouped[metric + "_pctchg"] = pctchg_base_reform(grouped, metric)
93 | return grouped
94 | 
--------------------------------------------------------------------------------
/microdf/chart_utils.py:
--------------------------------------------------------------------------------
1 | def dollar_format(suffix=""):
2 | """Dollar formatter for matplotlib.
3 | 
4 | :param suffix: Suffix to append, e.g. 'B'. Defaults to ''.
5 | :returns: FuncFormatter.
6 | 
7 | """
8 | return currency_format(currency="USD", suffix=suffix)
9 | 
10 | 
11 | def currency_format(currency="USD", suffix=""):
12 | """Currency formatter for matplotlib.
13 | 
14 | :param currency: Name of the currency, e.g. 'USD', 'GBP'.
15 | :param suffix: Suffix to append, e.g. 'B'. Defaults to ''.
16 | :returns: FuncFormatter.
17 | 
18 | """
19 | try:
20 | import matplotlib as mpl
21 | except ImportError:
22 | raise ImportError(
23 | "The function you've called requires extra dependencies. " +
24 | "Please install microdf with the 'charts' extra by running " +
25 | "'pip install microdf[charts]'"
26 | )
27 | 
28 | prefix = {"USD": "$", "GBP": "£"}[currency]
29 | 
30 | return mpl.ticker.FuncFormatter(
31 | lambda x, _: prefix + format(int(x), ",") + suffix
32 | )
33 | 
--------------------------------------------------------------------------------
/microdf/charts.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | import microdf as mdf
4 | 
5 | 
6 | def quantile_pct_chg_plot(df1, df2, col1, col2, w1=None, w2=None, q=None):
7 | """Create stem plot with percent change in decile boundaries.
8 | 
9 | :param df1: DataFrame with first set of values.
10 | :param df2: DataFrame with second set of values.
11 | :param col1: Name of column with values in df1.
12 | :param col2: Name of column with values in df2.
13 | :param w1: Name of weight column in df1.
14 | :param w2: Name of weight column in df2.
15 | :param q: Quantiles.
Defaults to decile boundaries.
16 | :returns: Axis.
17 | 
18 | """
19 | try:
20 | import seaborn as sns
21 | import matplotlib as mpl
22 | import matplotlib.pyplot as plt
23 | except ImportError:
24 | raise ImportError(
25 | "The function you've called requires extra dependencies. " +
26 | "Please install microdf with the 'charts' extra by running " +
27 | "'pip install microdf[charts]'"
28 | )
29 | 
30 | if q is None:
31 | q = np.arange(0.1, 1, 0.1)
32 | # Calculate weighted quantiles.
33 | df = mdf.quantile_chg(df1, df2, col1, col2, w1, w2, q).transpose()
34 | # Prepare dataset for plotting.
35 | df.columns = ["base", "reform"]
36 | df["pct_chg"] = df.reform / df.base - 1
37 | # Multiply by 100 pending github.com/matplotlib/matplotlib/issues/17113
38 | df.pct_chg *= 100
39 | df["index_newline"] = np.where(
40 | df.index == "50th (median)", "50th\n(median)", df.index
41 | )
42 | # Plot.
43 | fig, ax = plt.subplots()
44 | markerline, stemlines, baseline = ax.stem(
45 | df.index_newline, df.pct_chg
46 | )
47 | plt.setp(baseline, color="gray", linewidth=0)
48 | ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))
49 | ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(xmax=100))
50 | plt.title("Change to percentiles", loc="left")
51 | plt.ylabel("Change at the percentile boundary")
52 | plt.xlabel("Percentile")
53 | sns.despine(left=True, bottom=True)
54 | ax.grid(color=mdf.GRID_COLOR, axis="y")
55 | plt.xticks(rotation=0)
56 | return ax
57 | 
--------------------------------------------------------------------------------
/microdf/concat.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import inspect
3 | import microdf as mdf
4 | 
5 | 
6 | def concat(*args, **kwargs):
7 | """Concatenates MicroDataFrame objects, preserving weights.
8 | If concatenating horizontally, the first set of weights is used.
9 | All args and kwargs are passed to pd.concat.
10 | 
11 | :return: MicroDataFrame with concatenated weights.
12 | :rtype: mdf.MicroDataFrame
13 | """
14 | # Extract args with respect to pd.concat.
15 | pd_args = inspect.getcallargs(pd.concat, *args, **kwargs)
16 | objs = pd_args["objs"]
17 | axis = pd_args["axis"]
18 | # Create result, starting with pd.concat.
19 | res = mdf.MicroDataFrame(pd.concat(*args, **kwargs))
20 | # Assign weights depending on axis.
21 | if axis == 0:
22 | res.weights = pd.concat([obj.weights for obj in objs])
23 | else:
24 | # If concatenating horizontally, use the first set of weights.
25 | res.weights = objs[0].weights
26 | return res
27 | 
--------------------------------------------------------------------------------
/microdf/constants.py:
--------------------------------------------------------------------------------
1 | # Constants for share of each benefit that is cash.
2 | HOUSING_CASH_SHARE = 0.0
3 | MCAID_CASH_SHARE = 0.0
4 | MCARE_CASH_SHARE = 0.0
5 | # https://github.com/open-source-economics/taxdata/issues/148
6 | # https://docs.google.com/spreadsheets/d/1g_YdFd5idgLL764G0pZBiBnIlnCBGyxBmapXCOZ1OV4
7 | OTHER_CASH_SHARE = 0.35
8 | SNAP_CASH_SHARE = 0.0
9 | SSI_CASH_SHARE = 1.0
10 | TANF_CASH_SHARE = 0.25
11 | # https://github.com/open-source-economics/C-TAM/issues/62.
12 | VET_CASH_SHARE = 0.48
13 | WIC_CASH_SHARE = 0.0
14 | 
15 | # Columns to remove from expanded_income to approximate TPC's Expanded Cash
16 | # Income.
17 | ECI_REMOVE_COLS = [
18 | "wic_ben",
19 | "housing_ben",
20 | "vet_ben",
21 | "mcare_ben",
22 | "mcaid_ben",
23 | ]
24 | 
25 | # Benefits.
26 | BENS = [ 27 | "housing_ben", 28 | "mcaid_ben", 29 | "mcare_ben", 30 | "vet_ben", 31 | "other_ben", 32 | "snap_ben", 33 | "ssi_ben", 34 | "tanf_ben", 35 | "wic_ben", 36 | "e02400", # Social Security (OASDI). 37 | "e02300", # Unemployment insurance. 38 | ] 39 | 40 | MED_BENS = ["mcaid_ben", "mcare_ben", "vet_ben"] 41 | -------------------------------------------------------------------------------- /microdf/custom_taxes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions and data for estimating taxes outside the income tax system. 3 | Examples include value added tax, financial transaction tax, and carbon tax. 4 | """ 5 | 6 | import microdf as mdf 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | 12 | # Source: 13 | # https://www.taxpolicycenter.org/briefing-book/who-would-bear-burden-vat 14 | VAT_INCIDENCE = pd.Series( 15 | index=[-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99, 99.9], 16 | data=[3.9, 3.9, 3.6, 3.6, 3.6, 3.6, 3.6, 3.4, 3.4, 3.2, 2.8, 2.5, 2.5], 17 | ) 18 | VAT_INCIDENCE /= 100 19 | 20 | # Source: Table 5 in 21 | # https://www.treasury.gov/resource-center/tax-policy/tax-analysis/Documents/WP-115.pdf 22 | CARBON_TAX_INCIDENCE = pd.Series( 23 | index=[-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99, 99.9], 24 | data=[0.8, 1.2, 1.4, 1.5, 1.6, 1.7, 1.8, 1.8, 1.8, 1.8, 1.6, 1.4, 0.7], 25 | ) 26 | CARBON_TAX_INCIDENCE /= 100 27 | 28 | # Source: Figure 1 in 29 | # https://www.taxpolicycenter.org/sites/default/files/alfresco/publication-pdfs/2000587-financial-transaction-taxes.pdf 30 | FTT_INCIDENCE = pd.Series( 31 | index=[-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99, 99.9], 32 | data=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.8, 1.0], 33 | ) 34 | FTT_INCIDENCE /= 100 35 | 36 | 37 | def add_custom_tax( 38 | df, 39 | segment_income, 40 | w, 41 | base_income, 42 | incidence, 43 | name, 44 | total=None, 45 | ratio=None, 46 | verbose=True, 47 | ): 48 | """Add a custom tax based on incidence analysis driven by percentiles. 49 | 50 | :param df: DataFrame. 51 | :param segment_income: Income measure used to segment tax units into 52 | quantiles. 53 | :param w: Weight used to segment into quantiles (either s006 or XTOT_m). 54 | :param base_income: Income measure by which incidence is multiplied to 55 | estimate liability. 56 | :param incidence: pandas Series indexed on the floor of an income 57 | percentile, with values for the tax rate. 58 | :param name: Name of the column to add. 59 | :param total: Total amount the tax should generate. If not provided, 60 | liabilities are calculated only based on the incidence schedule. 61 | (Default value = None) 62 | :param ratio: Ratio to adjust the tax by, compared to the original tax. 63 | This acts as a multiplier for the incidence argument. 64 | (Default value = None) 65 | :param verbose: Whether to print the tax adjustment factor if needed. 66 | Defaults to True. 67 | :returns: Nothing. Adds the column name to df representing the tax 68 | liability. df is also sorted by segment_income. 69 | 70 | """ 71 | if ratio is not None: 72 | incidence = incidence * ratio 73 | assert total is None, "ratio and total cannot both be provided." 74 | df.sort_values(segment_income, inplace=True) 75 | income_percentile = 100 * df[w].cumsum() / df[w].sum() 76 | tu_incidence = incidence.iloc[ 77 | pd.cut( 78 | income_percentile, 79 | # Add a right endpoint. Should be 100 but sometimes a decimal 80 | # gets added. 
81 | bins=incidence.index.tolist() + [101],
82 | labels=False,
83 | )
84 | ].values
85 | df[name] = np.maximum(0, tu_incidence * df[base_income])
86 | if total is not None:
87 | initial_total = mdf.weighted_sum(df, name, "s006")
88 | if verbose:
89 | print(
90 | "Multiplying tax by "
91 | + str(round(total / initial_total, 2))
92 | + "."
93 | )
94 | df[name] *= total / initial_total
95 | 
96 | 
97 | def add_vat(
98 | df,
99 | segment_income="tpc_eci",
100 | w="XTOT_m",
101 | base_income="aftertax_income",
102 | incidence=VAT_INCIDENCE,
103 | name="vat",
104 | **kwargs
105 | ):
106 | """Add value added tax based on incidence estimate from Tax Policy Center.
107 | 
108 | :param df: DataFrame with columns for tpc_eci, XTOT_m, and aftertax_income.
109 | Other arguments are passed to add_custom_tax() with VAT defaults.
110 | :param segment_income: (Default value = "tpc_eci")
111 | :param w: (Default value = "XTOT_m")
112 | :param base_income: (Default value = "aftertax_income")
113 | :param incidence: (Default value = VAT_INCIDENCE)
114 | :param name: (Default value = "vat")
115 | :param **kwargs: Other arguments passed to add_custom_tax().
116 | :returns: Nothing. Adds vat to df.
117 | df is also sorted by tpc_eci.
118 | 
119 | """
120 | add_custom_tax(
121 | df, segment_income, w, base_income, incidence, name, **kwargs
122 | )
123 | 
124 | 
125 | def add_carbon_tax(
126 | df,
127 | segment_income="tpc_eci",
128 | w="XTOT_m",
129 | base_income="aftertax_income",
130 | incidence=CARBON_TAX_INCIDENCE,
131 | name="carbon_tax",
132 | **kwargs
133 | ):
134 | """Add carbon tax based on incidence estimate from the US Treasury
135 | Department.
136 | 
137 | :param df: DataFrame with columns for tpc_eci, XTOT_m, and aftertax_income.
138 | Other arguments are passed to add_custom_tax() with carbon tax defaults.
139 | :param segment_income: (Default value = "tpc_eci")
140 | :param w: (Default value = "XTOT_m")
141 | :param base_income: (Default value = "aftertax_income")
142 | :param incidence: (Default value = CARBON_TAX_INCIDENCE)
143 | :param name: (Default value = "carbon_tax")
144 | :param **kwargs: Other arguments passed to add_custom_tax().
145 | :returns: Nothing. Adds carbon_tax to df.
146 | df is also sorted by tpc_eci.
147 | 
148 | """
149 | add_custom_tax(
150 | df, segment_income, w, base_income, incidence, name, **kwargs
151 | )
152 | 
153 | 
154 | def add_ftt(
155 | df,
156 | segment_income="tpc_eci",
157 | w="XTOT_m",
158 | base_income="aftertax_income",
159 | incidence=FTT_INCIDENCE,
160 | name="ftt",
161 | **kwargs
162 | ):
163 | """Add financial transaction tax based on incidence estimate from Tax
164 | Policy Center.
165 | 
166 | :param df: DataFrame with columns for tpc_eci, XTOT_m, and aftertax_income.
167 | Other arguments are passed to add_custom_tax() with FTT defaults.
168 | :param segment_income: (Default value = "tpc_eci")
169 | :param w: (Default value = "XTOT_m")
170 | :param base_income: (Default value = "aftertax_income")
171 | :param incidence: (Default value = FTT_INCIDENCE)
172 | :param name: (Default value = "ftt")
173 | :param **kwargs: Other arguments passed to add_custom_tax().
174 | :returns: Nothing. Adds ftt to df.
175 | df is also sorted by tpc_eci.
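Example (a sketch, not from the package docs; assumes df comes from a
Tax-Calculator run, so the tpc_eci, XTOT_m, and aftertax_income
columns exist):

    mdf.add_ftt(df)              # liability from the TPC incidence schedule
    mdf.add_ftt(df, total=75e9)  # hypothetical: rescale to raise $75B total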
176 | 177 | """ 178 | add_custom_tax( 179 | df, segment_income, w, base_income, incidence, name, **kwargs 180 | ) 181 | -------------------------------------------------------------------------------- /microdf/generic.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Union 2 | from functools import wraps 3 | import warnings 4 | import copy 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | class MicroSeries(pd.Series): 10 | def __init__(self, *args, weights: np.array = None, **kwargs): 11 | """A Series-inheriting class for weighted microdata. 12 | Weights can be provided at initialisation, or using set_weights. 13 | 14 | :param weights: Array of weights. 15 | :type weights: np.array 16 | """ 17 | super().__init__(*args, **kwargs) 18 | self.set_weights(weights) 19 | 20 | def weighted_function(fn: Callable) -> Callable: 21 | @wraps(fn) 22 | def safe_fn(*args, **kwargs): 23 | try: 24 | return fn(*args, **kwargs) 25 | except ZeroDivisionError: 26 | return np.NaN 27 | 28 | return safe_fn 29 | 30 | @weighted_function 31 | def scalar_function(fn: Callable) -> Callable: 32 | fn._rtype = float 33 | return fn 34 | 35 | @weighted_function 36 | def vector_function(fn: Callable) -> Callable: 37 | fn._rtype = pd.Series 38 | return fn 39 | 40 | def set_weights(self, weights: np.array) -> None: 41 | """Sets the weight values. 42 | 43 | :param weights: Array of weights. 44 | :type weights: np.array. 45 | """ 46 | if weights is None: 47 | self.weights = pd.Series(np.ones_like(self.values), dtype=float) 48 | else: 49 | self.weights = pd.Series(weights, dtype=float) 50 | 51 | @vector_function 52 | def weight(self) -> pd.Series: 53 | """Calculates the weighted value of the MicroSeries. 54 | 55 | :returns: A Series multiplying the MicroSeries by its weight. 56 | :rtype: pd.Series 57 | """ 58 | return self.multiply(self.weights) 59 | 60 | @scalar_function 61 | def sum(self) -> float: 62 | """Calculates the weighted sum of the MicroSeries. 63 | 64 | :returns: The weighted sum. 65 | :rtype: float 66 | """ 67 | return self.multiply(self.weights).sum() 68 | 69 | @scalar_function 70 | def count(self) -> float: 71 | """Calculates the weighted count of the MicroSeries. 72 | 73 | :returns: The weighted count. 74 | """ 75 | return self.weights.sum() 76 | 77 | @scalar_function 78 | def mean(self) -> float: 79 | """Calculates the weighted mean of the MicroSeries 80 | 81 | :returns: The weighted mean. 82 | :rtype: float 83 | """ 84 | return np.average(self.values, weights=self.weights) 85 | 86 | def quantile(self, q: np.array) -> pd.Series: 87 | """Calculates weighted quantiles of the MicroSeries. 88 | 89 | Doesn't exactly match unweighted quantiles of stacked values. 90 | See stackoverflow.com/q/21844024#comment102342137_29677616. 91 | 92 | :param q: Array of quantiles to calculate. 93 | :type q: np.array 94 | 95 | :return: Array of weighted quantiles. 
96 | :rtype: pd.Series
97 | """
98 | values = np.array(self.values)
99 | quantiles = np.array(q)
100 | sample_weight = np.array(self.weights)
101 | assert np.all(quantiles >= 0) and np.all(
102 | quantiles <= 1
103 | ), "quantiles should be in [0, 1]"
104 | sorter = np.argsort(values)
105 | values = values[sorter]
106 | sample_weight = sample_weight[sorter]
107 | weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
108 | weighted_quantiles /= np.sum(sample_weight)
109 | result = np.interp(quantiles, weighted_quantiles, values)
110 | if quantiles.shape == ():
111 | return result
112 | return pd.Series(result, index=quantiles)
113 | 
114 | @scalar_function
115 | def median(self) -> float:
116 | """Calculates the weighted median of the MicroSeries.
117 | 
118 | :returns: The weighted median of the MicroSeries.
119 | :rtype: float
120 | """
121 | return self.quantile(0.5)
122 | 
123 | @scalar_function
124 | def gini(self, negatives: str = None) -> float:
125 | """Calculates Gini index.
126 | 
127 | :param negatives: An optional string indicating how to treat negative
128 | values of x:
129 | 'zero' replaces negative values with zeroes.
130 | 'shift' subtracts the minimum value from all values of x,
131 | when this minimum is negative. That is, it adds the absolute
132 | minimum value.
133 | Defaults to None, which leaves negative values as they are.
134 | :type negatives: str
135 | :returns: Gini index.
136 | :rtype: float
137 | """
138 | x = np.array(self).astype("float")
139 | if negatives == "zero":
140 | x[x < 0] = 0
141 | if negatives == "shift" and np.amin(x) < 0:
142 | x -= np.amin(x)
143 | if (self.weights != np.ones(len(self))).any():  # Varying weights.
144 | sorted_indices = np.argsort(self)
145 | sorted_x = np.array(self[sorted_indices])
146 | sorted_w = np.array(self.weights[sorted_indices])
147 | cumw = np.cumsum(sorted_w)
148 | cumxw = np.cumsum(sorted_x * sorted_w)
149 | return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / (
150 | cumxw[-1] * cumw[-1]
151 | )
152 | else:
153 | sorted_x = np.sort(self)
154 | n = len(x)
155 | cumxw = np.cumsum(sorted_x)
156 | # The above formula, with all weights equal to 1 simplifies to:
157 | return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n
158 | 
159 | @scalar_function
160 | def top_x_pct_share(self, top_x_pct: float) -> float:
161 | """Calculates top x% share.
162 | 
163 | :param top_x_pct: Decimal between 0 and 1 of the top %, e.g. 0.1,
164 | 0.001.
165 | :type top_x_pct: float
166 | :returns: The weighted share held by the top x%.
167 | :rtype: float
168 | """
169 | threshold = self.quantile(1 - top_x_pct)
170 | top_x_pct_sum = self[self >= threshold].sum()
171 | total_sum = self.sum()
172 | return top_x_pct_sum / total_sum
173 | 
174 | @scalar_function
175 | def bottom_x_pct_share(self, bottom_x_pct) -> float:
176 | """Calculates bottom x% share.
177 | 
178 | :param bottom_x_pct: Decimal between 0 and 1 of the bottom %, e.g. 0.1,
179 | 0.001.
180 | :type bottom_x_pct: float
181 | :returns: The weighted share held by the bottom x%.
182 | :rtype: float
183 | """
184 | return 1 - self.top_x_pct_share(1 - bottom_x_pct)
185 | 
186 | @scalar_function
187 | def bottom_50_pct_share(self) -> float:
188 | """Calculates bottom 50% share.
189 | 
190 | :returns: The weighted share held by the bottom 50%.
191 | :rtype: float
192 | """
193 | return self.bottom_x_pct_share(0.5)
194 | 
195 | @scalar_function
196 | def top_50_pct_share(self) -> float:
197 | """Calculates top 50% share.
198 | 
199 | :returns: The weighted share held by the top 50%.
200 | :rtype: float
201 | """
202 | return self.top_x_pct_share(0.5)
203 | 
204 | @scalar_function
205 | def top_10_pct_share(self) -> float:
206 | """Calculates top 10% share.
207 | 
208 | :returns: The weighted share held by the top 10%.
209 | :rtype: float
210 | """
211 | return self.top_x_pct_share(0.1)
212 | 
213 | @scalar_function
214 | def top_1_pct_share(self) -> float:
215 | """Calculates top 1% share.
216 | 
217 | :returns: The weighted share held by the top 1%.
218 | :rtype: float
219 | """
220 | return self.top_x_pct_share(0.01)
221 | 
222 | @scalar_function
223 | def top_0_1_pct_share(self) -> float:
224 | """Calculates top 0.1% share.
225 | 
226 | :returns: The weighted share held by the top 0.1%.
227 | :rtype: float
228 | """
229 | return self.top_x_pct_share(0.001)
230 | 
231 | @scalar_function
232 | def t10_b50(self) -> float:
233 | """Calculates ratio between the top 10% and bottom 50% shares.
234 | 
235 | :returns: The weighted share held by the top 10% divided by
236 | the weighted share held by the bottom 50%.
237 | 
238 | """
239 | t10 = self.top_10_pct_share()
240 | b50 = self.bottom_50_pct_share()
241 | return t10 / b50
242 | 
243 | @vector_function
244 | def cumsum(self) -> pd.Series:
245 | return pd.Series(self * self.weights).cumsum()
246 | 
247 | @vector_function
248 | def rank(self, pct=False) -> pd.Series:
249 | order = np.argsort(self.values)
250 | inverse_order = np.argsort(order)
251 | ranks = np.array(self.weights.values)[order].cumsum()[inverse_order]
252 | if pct:
253 | ranks /= self.weights.values.sum()
254 | # Clip ranks above 1; the original bare np.where call had no effect.
ranks = np.where(ranks > 1.0, 1.0, ranks)
255 | return pd.Series(ranks, index=self.index)
256 | 
257 | @vector_function
258 | def decile_rank(self):
259 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 10), 10))
260 | 
261 | @vector_function
262 | def quintile_rank(self):
263 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 5), 5))
264 | 
265 | @vector_function
266 | def quartile_rank(self):
267 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 4), 4))
268 | 
269 | @vector_function
270 | def percentile_rank(self):
271 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 100), 100))
272 | 
273 | def groupby(self, *args, **kwargs):
274 | gb = super().groupby(*args, **kwargs)
275 | gb.__class__ = MicroSeriesGroupBy
276 | gb._init()
277 | gb.weights = pd.Series(self.weights).groupby(*args, **kwargs)
278 | return gb
279 | 
280 | def copy(self, deep=True):
281 | res = super().copy(deep)
282 | res = MicroSeries(res, weights=self.weights.copy(deep))
283 | return res
284 | 
285 | def equals(self, other) -> bool:
286 | equal_values = super().equals(other)
287 | equal_weights = self.weights.equals(other.weights)
288 | return equal_values and equal_weights
289 | 
290 | def __getitem__(self, key):
291 | result = super().__getitem__(key)
292 | if isinstance(result, pd.Series):
293 | weights = self.weights.__getitem__(key)
294 | return MicroSeries(result, weights=weights)
295 | return result
296 | 
297 | def __getattr__(self, name):
298 | return MicroSeries(super().__getattr__(name), weights=self.weights)
299 | 
300 | # operators
301 | 
302 | def __add__(self, other):
303 | return MicroSeries(super().__add__(other), weights=self.weights)
304 | 
305 | def __sub__(self, other):
306 | return MicroSeries(super().__sub__(other), weights=self.weights)
307 | 
308 | def __mul__(self, other):
309 | return MicroSeries(super().__mul__(other), weights=self.weights)
310 | 
311 | def __floordiv__(self, other):
312 | return
MicroSeries(super().__floordiv__(other), weights=self.weights)
313 | 
314 | def __truediv__(self, other):
315 | return MicroSeries(super().__truediv__(other), weights=self.weights)
316 | 
317 | def __mod__(self, other):
318 | return MicroSeries(super().__mod__(other), weights=self.weights)
319 | 
320 | def __pow__(self, other):
321 | return MicroSeries(super().__pow__(other), weights=self.weights)
322 | 
323 | # comparators
324 | 
325 | def __lt__(self, other):
326 | return MicroSeries(super().__lt__(other), weights=self.weights)
327 | 
328 | def __le__(self, other):
329 | return MicroSeries(super().__le__(other), weights=self.weights)
330 | 
331 | def __eq__(self, other):
332 | return MicroSeries(super().__eq__(other), weights=self.weights)
333 | 
334 | def __ne__(self, other):
335 | return MicroSeries(super().__ne__(other), weights=self.weights)
336 | 
337 | def __ge__(self, other):
338 | return MicroSeries(super().__ge__(other), weights=self.weights)
339 | 
340 | def __gt__(self, other):
341 | return MicroSeries(super().__gt__(other), weights=self.weights)
342 | 
343 | # assignment operators
344 | 
345 | def __iadd__(self, other):
346 | return MicroSeries(super().__iadd__(other), weights=self.weights)
347 | 
348 | def __isub__(self, other):
349 | return MicroSeries(super().__isub__(other), weights=self.weights)
350 | 
351 | def __imul__(self, other):
352 | return MicroSeries(super().__imul__(other), weights=self.weights)
353 | 
354 | def __ifloordiv__(self, other):
355 | return MicroSeries(super().__ifloordiv__(other), weights=self.weights)
356 | 
357 | def __idiv__(self, other):
358 | return MicroSeries(super().__idiv__(other), weights=self.weights)
359 | 
360 | def __itruediv__(self, other):
361 | return MicroSeries(super().__itruediv__(other), weights=self.weights)
362 | 
363 | def __imod__(self, other):
364 | return MicroSeries(super().__imod__(other), weights=self.weights)
365 | 
366 | def __ipow__(self, other):
367 | return MicroSeries(super().__ipow__(other), weights=self.weights)
368 | 
369 | # other
370 | 
371 | def __neg__(self):  # Unary operator: takes no operand.
372 | return MicroSeries(super().__neg__(), weights=self.weights)
373 | 
374 | def __pos__(self):  # Unary operator: takes no operand.
375 | return MicroSeries(super().__pos__(), weights=self.weights)
376 | 
377 | def __repr__(self):
378 | return pd.DataFrame(
379 | dict(value=self.values, weight=self.weights.values)
380 | ).__repr__()
381 | 
382 | 
383 | MicroSeries.SCALAR_FUNCTIONS = [
384 | fn
385 | for fn in dir(MicroSeries)
386 | if "_rtype" in dir(getattr(MicroSeries, fn))
387 | and getattr(getattr(MicroSeries, fn), "_rtype") == float
388 | ]
389 | MicroSeries.VECTOR_FUNCTIONS = [
390 | fn
391 | for fn in dir(MicroSeries)
392 | if "_rtype" in dir(getattr(MicroSeries, fn))
393 | and getattr(getattr(MicroSeries, fn), "_rtype") == pd.Series
394 | ]
395 | MicroSeries.AGNOSTIC_FUNCTIONS = ["quantile"]
396 | MicroSeries.FUNCTIONS = sum(
397 | [
398 | MicroSeries.SCALAR_FUNCTIONS,
399 | MicroSeries.VECTOR_FUNCTIONS,
400 | MicroSeries.AGNOSTIC_FUNCTIONS,
401 | ],
402 | [],
403 | )
404 | 
405 | 
406 | class MicroSeriesGroupBy(pd.core.groupby.generic.SeriesGroupBy):
407 | def _init(self):
408 | def _weighted_agg(name) -> Callable:
409 | def via_micro_series(row, *args, **kwargs):
410 | return getattr(MicroSeries(row.a, weights=row.w), name)(
411 | *args, **kwargs
412 | )
413 | 
414 | fn = getattr(MicroSeries, name)
415 | 
416 | @wraps(fn)
417 | def _weighted_agg_fn(*args, **kwargs):
418 | arrays = self.apply(np.array)
419 | weights = self.weights.apply(np.array)
420 | df =
pd.DataFrame(dict(a=arrays, w=weights)) 421 | is_array = len(args) > 0 and hasattr(args[0], "__len__") 422 | if ( 423 | name in MicroSeries.SCALAR_FUNCTIONS 424 | or name in MicroSeries.AGNOSTIC_FUNCTIONS 425 | and not is_array 426 | ): 427 | result = df.agg( 428 | lambda row: via_micro_series(row, *args, **kwargs), 429 | axis=1, 430 | ) 431 | elif ( 432 | name in MicroSeries.VECTOR_FUNCTIONS 433 | or name in MicroSeries.AGNOSTIC_FUNCTIONS 434 | and is_array 435 | ): 436 | result = df.apply( 437 | lambda row: via_micro_series(row, *args, **kwargs), 438 | axis=1, 439 | ) 440 | return result.stack() 441 | return result 442 | 443 | return _weighted_agg_fn 444 | 445 | for fn_name in MicroSeries.FUNCTIONS: 446 | setattr(self, fn_name, _weighted_agg(fn_name)) 447 | 448 | 449 | class MicroDataFrameGroupBy(pd.core.groupby.generic.DataFrameGroupBy): 450 | def _init(self, by: Union[str, list]): 451 | self.columns = list(self.obj.columns) 452 | if isinstance(by, list): 453 | for column in by: 454 | self.columns.remove(column) 455 | elif isinstance(by, str): 456 | self.columns.remove(by) 457 | self.columns.remove("__tmp_weights") 458 | for fn_name in MicroSeries.SCALAR_FUNCTIONS: 459 | 460 | def get_fn(name): 461 | def fn(*args, **kwargs): 462 | return MicroDataFrame( 463 | { 464 | col: getattr(getattr(self, col), name)( 465 | *args, **kwargs 466 | ) 467 | for col in self.columns 468 | } 469 | ) 470 | 471 | return fn 472 | 473 | setattr(self, fn_name, get_fn(fn_name)) 474 | for fn_name in MicroSeries.VECTOR_FUNCTIONS: 475 | 476 | def get_fn(name): 477 | def fn(*args, **kwargs): 478 | return MicroDataFrame( 479 | { 480 | col: getattr(getattr(self, col), name)( 481 | *args, **kwargs 482 | ) 483 | for col in self.columns 484 | } 485 | ) 486 | 487 | return fn 488 | 489 | setattr(self, fn_name, get_fn(fn_name)) 490 | 491 | 492 | class MicroDataFrame(pd.DataFrame): 493 | def __init__(self, *args, weights=None, **kwargs): 494 | """A DataFrame-inheriting class for weighted microdata. 495 | Weights can be provided at initialisation, or using set_weights or 496 | set_weight_col. 497 | 498 | :param weights: Array of weights. 499 | :type weights: np.array 500 | """ 501 | super().__init__(*args, **kwargs) 502 | self.weights = None 503 | self.set_weights(weights) 504 | self._link_all_weights() 505 | self.override_df_functions() 506 | 507 | def override_df_functions(self): 508 | for name in MicroSeries.FUNCTIONS: 509 | 510 | def get_fn(name): 511 | def fn(*args, **kwargs): 512 | is_array = len(args) > 0 and hasattr(args[0], "__len__") 513 | if ( 514 | name in MicroSeries.SCALAR_FUNCTIONS 515 | or name in MicroSeries.AGNOSTIC_FUNCTIONS 516 | and not is_array 517 | ): 518 | results = pd.Series( 519 | [ 520 | getattr(self[col], name)(*args, **kwargs) 521 | for col in self.columns 522 | ] 523 | ) 524 | results.index = self.columns 525 | return results 526 | elif ( 527 | name in MicroSeries.VECTOR_FUNCTIONS 528 | or name in MicroSeries.AGNOSTIC_FUNCTIONS 529 | and is_array 530 | ): 531 | results = pd.DataFrame( 532 | [ 533 | getattr(self[col], name)(*args, **kwargs) 534 | for col in self.columns 535 | ] 536 | ) 537 | results.index = self.columns 538 | return results 539 | 540 | return fn 541 | 542 | setattr(self, name, get_fn(name)) 543 | 544 | def get_args_as_micro_series(*kwarg_names: tuple) -> Callable: 545 | """Decorator for auto-parsing column names into MicroSeries objects. 546 | If given, kwarg_names limits arguments checked to keyword arguments 547 | specified. 
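For example (a sketch of the intended behavior, with hypothetical
column names): in a call like df.poverty_rate("income", "threshold"),
each string is replaced by the matching MicroSeries df["income"] or
df["threshold"] before the wrapped function runs.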
548 | 
549 | :param kwarg_names: Keyword argument names to restrict to.
550 | :type kwarg_names: str
551 | """
552 | 
553 | def arg_series_decorator(fn):
554 | @wraps(fn)
555 | def series_function(self, *args, **kwargs):
556 | new_args = []
557 | new_kwargs = {}
558 | if len(kwarg_names) == 0:
559 | for value in args:
560 | if isinstance(value, str):
561 | if value not in self.columns:
562 | raise Exception("Column not found")
563 | new_args += [self[value]]
564 | else:
565 | new_args += [value]
566 | for name, value in kwargs.items():
567 | if isinstance(value, str) and (
568 | len(kwarg_names) == 0 or name in kwarg_names
569 | ):
570 | if value not in self.columns:
571 | raise Exception("Column not found")
572 | new_kwargs[name] = self[value]
573 | else:
574 | new_kwargs[name] = value
575 | return fn(self, *new_args, **new_kwargs)
576 | 
577 | return series_function
578 | 
579 | return arg_series_decorator
580 | 
581 | def __setitem__(self, *args, **kwargs):
582 | super().__setitem__(*args, **kwargs)
583 | self._link_all_weights()
584 | 
585 | def _link_weights(self, column):
586 | # self[column] = ... triggers __setitem__, which forces pd.Series
587 | # this workaround avoids that
588 | self[column].__class__ = MicroSeries
589 | self[column].set_weights(self.weights)
590 | 
591 | def _link_all_weights(self):
592 | if self.weights is None:
593 | self.set_weights(np.ones((len(self))))
594 | for column in self.columns:
595 | if column != self.weights_col:
596 | self._link_weights(column)
597 | 
598 | def set_weights(self, weights) -> None:
599 | """Sets the weights for the MicroDataFrame. If a string is received,
600 | it will be assumed to be the column name of the weight column.
601 | 
602 | :param weights: Array of weights.
603 | :type weights: np.array
604 | """
605 | if isinstance(weights, str):
606 | self.weights_col = weights
607 | self.weights = pd.Series(self[weights], dtype=float)
608 | elif weights is not None:
609 | self.weights_col = None
610 | with warnings.catch_warnings():
611 | warnings.filterwarnings("ignore", category=UserWarning)
612 | self.weights = pd.Series(weights, dtype=float)
613 | self._link_all_weights()
614 | 
615 | def set_weight_col(self, column: str) -> None:
616 | """Sets the weights for the MicroDataFrame by specifying the name of
617 | the weight column.
618 | 
619 | :param column: Name of the weight column.
620 | :type column: str
621 | """
622 | self.weights = np.array(self[column])
623 | self.weights_col = column  # Matches the attribute read by _link_all_weights().
624 | self._link_all_weights()
625 | 
626 | def __getitem__(self, key):
627 | result = super().__getitem__(key)
628 | if isinstance(result, pd.DataFrame):
629 | try:
630 | weights = self.weights[key]
631 | except Exception:
632 | weights = self.weights
633 | return MicroDataFrame(result, weights=weights)
634 | return result
635 | 
636 | def catch_series_relapse(self):
637 | for col in self.columns:
638 | if self[col].__class__ == pd.Series:
639 | self._link_weights(col)
640 | 
641 | def __setattr__(self, key, value):
642 | super().__setattr__(key, value)
643 | self.catch_series_relapse()
644 | 
645 | def reset_index(self):
646 | res = super().reset_index()
647 | res = MicroDataFrame(res, weights=self.weights)
648 | return res
649 | 
650 | def copy(self, deep=True):
651 | res = super().copy(deep)
652 | # This changes the original columns to Series.
Undo it:
653 | for col in self.columns:
654 | self[col] = MicroSeries(self[col])
655 | res = MicroDataFrame(res, weights=self.weights.copy(deep))
656 | return res
657 | 
658 | def equals(self, other) -> bool:
659 | equal_values = super().equals(other)
660 | equal_weights = self.weights.equals(other.weights)
661 | return equal_values and equal_weights
662 | 
663 | @get_args_as_micro_series()
664 | def groupby(self, by: Union[str, list], *args, **kwargs):
665 | """
666 | Returns a GroupBy object with MicroSeriesGroupBy objects for
667 | each column
668 | 
669 | :param by: column to group by
670 | :type by: Union[str, list]
671 | 
672 | :return: DataFrameGroupBy object with columns using weights
673 | :rtype: DataFrameGroupBy
674 | """
675 | self["__tmp_weights"] = self.weights
676 | gb = super().groupby(by, *args, **kwargs)
677 | weights = copy.deepcopy(gb["__tmp_weights"])
678 | for col in self.columns:  # df.groupby(...)[col]s use weights
679 | res = gb[col]
680 | res.__class__ = MicroSeriesGroupBy
681 | res._init()
682 | res.weights = weights
683 | setattr(gb, col, res)
684 | gb.__class__ = MicroDataFrameGroupBy
685 | gb._init(by)
686 | return gb
687 | 
688 | @get_args_as_micro_series()
689 | def poverty_rate(self, income: str, threshold: str) -> float:
690 | """Calculate poverty rate, i.e., the population share with income
691 | below their poverty threshold.
692 | 
693 | :param income: Column indicating income.
694 | :type income: str
695 | :param threshold: Column indicating threshold.
696 | :type threshold: str
697 | :return: Poverty rate between zero and one.
698 | :rtype: float
699 | """
700 | pov = income < threshold
701 | return pov.sum() / pov.count()
702 | 
703 | @get_args_as_micro_series()
704 | def deep_poverty_rate(self, income: str, threshold: str) -> float:
705 | """Calculate deep poverty rate, i.e., the population share with income
706 | below half their poverty threshold.
707 | 
708 | :param income: Column indicating income.
709 | :type income: str
710 | :param threshold: Column indicating threshold.
711 | :type threshold: str
712 | :return: Deep poverty rate between zero and one.
713 | :rtype: float
714 | """
715 | pov = income < (threshold / 2)
716 | return pov.sum() / pov.count()
717 | 
718 | @get_args_as_micro_series()
719 | def poverty_gap(self, income: str, threshold: str) -> float:
720 | """Calculate poverty gap, i.e., the total gap between income and
721 | poverty thresholds for all people in poverty.
722 | 
723 | :param income: Column indicating income.
724 | :type income: str
725 | :param threshold: Column indicating threshold.
726 | :type threshold: str
727 | :return: Poverty gap.
728 | :rtype: float
729 | """
730 | gaps = (threshold - income)[threshold > income]
731 | return gaps.sum()
732 | 
733 | @get_args_as_micro_series()
734 | def deep_poverty_gap(self, income: str, threshold: str) -> float:
735 | """Calculate deep poverty gap, i.e., the total gap between income and
736 | half of poverty thresholds for all people in deep poverty.
737 | 
738 | :param income: Column indicating income.
739 | :type income: str
740 | :param threshold: Column indicating threshold.
741 | :type threshold: str
742 | :return: Deep poverty gap.
743 | :rtype: float
744 | """
745 | deep_threshold = threshold / 2
746 | gaps = (deep_threshold - income)[deep_threshold > income]
747 | return gaps.sum()
748 | 
749 | @get_args_as_micro_series()
750 | def squared_poverty_gap(self, income: str, threshold: str) -> float:
751 | """Calculate squared poverty gap, i.e., the total squared gap between
752 | income and poverty thresholds for all people in poverty.
753 | Also known as the poverty severity index.
754 | 
755 | :param income: Column indicating income.
756 | :type income: str
757 | :param threshold: Column indicating threshold.
758 | :type threshold: str
759 | :return: Squared poverty gap.
760 | :rtype: float
761 | """
762 | gaps = (threshold - income)[threshold > income]
763 | squared_gaps = gaps ** 2
764 | return squared_gaps.sum()
765 | 
766 | @get_args_as_micro_series()
767 | def poverty_count(
768 | self,
769 | income: Union[MicroSeries, str],
770 | threshold: Union[MicroSeries, str],
771 | ) -> int:
772 | """
773 | Calculates the number of entities with income below a poverty
774 | threshold.
775 | 
776 | :param income: income array or column name
777 | :type income: Union[MicroSeries, str]
778 | 
779 | :param threshold: threshold array or column name
780 | :type threshold: Union[MicroSeries, str]
781 | 
782 | :return: number of entities in poverty
783 | :rtype: int
784 | """
785 | in_poverty = income < threshold
786 | return in_poverty.sum()
787 | 
788 | def __repr__(self):
789 | df = pd.DataFrame(self)
790 | df["weight"] = self.weights
791 | return df[[df.columns[-1]] + list(df.columns[:-1])].__repr__()
792 | 
--------------------------------------------------------------------------------
/microdf/income_measures.py:
--------------------------------------------------------------------------------
1 | import microdf as mdf
2 | 
3 | # See
4 | # https://docs.google.com/spreadsheets/d/1I-Qe8uD58bLnPkimc9eaPgs4AE7x5FZYmTZwVX_WyT8
5 | # for a comparison of income measures used here.
6 | 
7 | 
8 | def cash_income(df):
9 | """Calculates income after taxes and cash transfers.
10 | 
11 | Defined as aftertax_income minus non-cash benefits.
12 | 
13 | :param df: A Tax-Calculator pandas DataFrame with columns for
14 | * aftertax_income
15 | * housing_ben
16 | * mcaid_ben
17 | * mcare_ben
18 | * other_ben
19 | * snap_ben
20 | * ssi_ben
21 | * tanf_ben
22 | * vet_ben
23 | * wic_ben
24 | :returns: A pandas Series with the cash income for each row in df.
25 | 
26 | """
27 | return (
28 | df.aftertax_income
29 | - (1 - mdf.HOUSING_CASH_SHARE) * df.housing_ben
30 | - (1 - mdf.MCAID_CASH_SHARE) * df.mcaid_ben
31 | - (1 - mdf.MCARE_CASH_SHARE) * df.mcare_ben
32 | - (1 - mdf.OTHER_CASH_SHARE) * df.other_ben
33 | - (1 - mdf.SNAP_CASH_SHARE) * df.snap_ben
34 | - (1 - mdf.SSI_CASH_SHARE) * df.ssi_ben
35 | - (1 - mdf.TANF_CASH_SHARE) * df.tanf_ben
36 | - (1 - mdf.VET_CASH_SHARE) * df.vet_ben
37 | - (1 - mdf.WIC_CASH_SHARE) * df.wic_ben
38 | )
39 | 
40 | 
41 | def tpc_eci(df):
42 | """Approximates Tax Policy Center's Expanded Cash Income measure.
43 | 
44 | Subtracts WIC, housing assistance, veteran's benefits, Medicare, and
45 | Medicaid from expanded_income. ECI adds income measures not modeled in
46 | Tax-Calculator, so these are ignored and will create a discrepancy
47 | compared to TPC's ECI.
48 | 
49 | :param df: DataFrame with columns from Tax-Calculator.
50 | :returns: pandas Series with TPC's ECI.
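A minimal usage sketch (column names assume Tax-Calculator output):

    df["tpc_eci"] = mdf.tpc_eci(df)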
51 | 52 | """ 53 | return df.expanded_income - df[mdf.ECI_REMOVE_COLS].sum(axis=1) 54 | 55 | 56 | def market_income(df): 57 | """Approximates CBO's market income concept, which is income 58 | before social insurance, means-tested transfers, and taxes. 59 | 60 | :param df: DataFrame with expanded_income and benefits. 61 | :returns: pandas Series of the same length as df. 62 | 63 | """ 64 | return df.expanded_income - df[mdf.BENS].sum(axis=1) 65 | -------------------------------------------------------------------------------- /microdf/inequality.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import microdf as mdf 4 | 5 | 6 | def gini(df, col, w=None, negatives=None, groupby=None): 7 | """Calculates Gini index. 8 | 9 | :param df: DataFrame. 10 | :param col: Name of column in df representing value. 11 | :param w: Column representing weight in df. 12 | :param negatives: An optional string indicating how to treat negative 13 | values of x: 14 | 'zero' replaces negative values with zeroes. 15 | 'shift' subtracts the minimum value from all values of x, 16 | when this minimum is negative. That is, it adds the absolute 17 | minimum value. 18 | Defaults to None, which leaves negative values as they are. 19 | :param groupby: Column, or list of columns, to group by. 20 | :returns: A float, the Gini index. 21 | 22 | """ 23 | 24 | def _gini(df, col, w=None, negatives=None): 25 | # Requires float numpy arrays (not pandas Series or lists) to work. 26 | x = np.array(df[col]).astype("float") 27 | if negatives == "zero": 28 | x[x < 0] = 0 29 | if negatives == "shift" and np.amin(x) < 0: 30 | x -= np.amin(x) 31 | if w is not None: 32 | w = np.array(df[w]).astype("float") 33 | sorted_indices = np.argsort(x) 34 | sorted_x = x[sorted_indices] 35 | sorted_w = w[sorted_indices] 36 | cumw = np.cumsum(sorted_w) 37 | cumxw = np.cumsum(sorted_x * sorted_w) 38 | return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / ( 39 | cumxw[-1] * cumw[-1] 40 | ) 41 | else: 42 | sorted_x = np.sort(x) 43 | n = len(x) 44 | cumxw = np.cumsum(sorted_x) 45 | # The above formula, with all weights equal to 1 simplifies to: 46 | return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n 47 | 48 | if groupby is None: 49 | return _gini(df, col, w, negatives) 50 | return df.groupby(groupby).apply(lambda x: _gini(x, col, w, negatives)) 51 | 52 | 53 | def top_x_pct_share(df, col, top_x_pct, w=None, groupby=None): 54 | """Calculates top x% share. 55 | 56 | :param df: DataFrame. 57 | :param col: Name of column in df representing value. 58 | :param top_x_pct: Decimal between 0 and 1 of the top %, e.g. 0.1, 0.001. 59 | :param w: Column representing weight in df. 60 | :param groupby: Column, or list of columns, to group by. 61 | :returns: The share of w-weighted val held by the top x%. 62 | 63 | """ 64 | 65 | def _top_x_pct_share(df, col, top_x_pct, w=None): 66 | threshold = mdf.weighted_quantile(df, col, w, 1 - top_x_pct) 67 | top_x_pct_sum = mdf.weighted_sum(df[df[col] >= threshold], col, w) 68 | total_sum = mdf.weighted_sum(df, col, w) 69 | return top_x_pct_sum / total_sum 70 | 71 | if groupby is None: 72 | return _top_x_pct_share(df, col, top_x_pct, w) 73 | return df.groupby(groupby).apply( 74 | lambda x: _top_x_pct_share(x, col, top_x_pct, w) 75 | ) 76 | 77 | 78 | def bottom_x_pct_share(df, col, bottom_x_pct, w=None, groupby=None): 79 | """Calculates bottom x% share. 80 | 81 | :param df: DataFrame. 82 | :param col: Name of column in df representing value. 
83 | :param bottom_x_pct: Decimal between 0 and 1 of the bottom %, e.g. 0.1, 0.001.
84 | :param w: Column representing weight in df.
85 | :param groupby: Column, or list of columns, to group by.
86 | :returns: The share of w-weighted val held by the bottom x%.
87 | 
88 | """
89 | return 1 - top_x_pct_share(df, col, 1 - bottom_x_pct, w, groupby)
90 | 
91 | 
92 | def bottom_50_pct_share(df, col, w=None, groupby=None):
93 | """Calculates bottom 50% share.
94 | 
95 | :param df: DataFrame.
96 | :param col: Name of column in df representing value.
97 | :param w: Column representing weight in df.
98 | :param groupby: Column, or list of columns, to group by.
99 | :returns: The share of w-weighted val held by the bottom 50%.
100 | 
101 | """
102 | return bottom_x_pct_share(df, col, 0.5, w, groupby)
103 | 
104 | 
105 | def top_50_pct_share(df, col, w=None, groupby=None):
106 | """Calculates top 50% share.
107 | 
108 | :param df: DataFrame.
109 | :param col: Name of column in df representing value.
110 | :param w: Column representing weight in df.
111 | :param groupby: Column, or list of columns, to group by.
112 | :returns: The share of w-weighted val held by the top 50%.
113 | 
114 | """
115 | return top_x_pct_share(df, col, 0.5, w, groupby)
116 | 
117 | 
118 | def top_10_pct_share(df, col, w=None, groupby=None):
119 | """Calculates top 10% share.
120 | 
121 | :param df: DataFrame.
122 | :param col: Name of column in df representing value.
123 | :param w: Column representing weight in df.
124 | :param groupby: Column, or list of columns, to group by.
125 | :returns: The share of w-weighted val held by the top 10%.
126 | 
127 | """
128 | return top_x_pct_share(df, col, 0.1, w, groupby)
129 | 
130 | 
131 | def top_1_pct_share(df, col, w=None, groupby=None):
132 | """Calculates top 1% share.
133 | 
134 | :param df: DataFrame.
135 | :param col: Name of column in df representing value.
136 | :param w: Column representing weight in df.
137 | :param groupby: Column, or list of columns, to group by.
138 | :returns: The share of w-weighted val held by the top 1%.
139 | 
140 | """
141 | return top_x_pct_share(df, col, 0.01, w, groupby)
142 | 
143 | 
144 | def top_0_1_pct_share(df, col, w=None, groupby=None):
145 | """Calculates top 0.1% share.
146 | 
147 | :param df: DataFrame.
148 | :param col: Name of column in df representing value.
149 | :param w: Column representing weight in df.
150 | :param groupby: Column, or list of columns, to group by.
151 | :returns: The share of w-weighted val held by the top 0.1%.
152 | 
153 | """
154 | return top_x_pct_share(df, col, 0.001, w, groupby)
155 | 
156 | 
157 | def t10_b50(df, col, w=None, groupby=None):
158 | """Calculates ratio between the top 10% and bottom 50% shares.
159 | 
160 | :param df: DataFrame.
161 | :param col: Name of column in df representing value.
162 | :param w: Column representing weight in df.
163 | :param groupby: Column, or list of columns, to group by.
164 | :returns: The share of w-weighted val held by the top 10% divided by
165 | the share of w-weighted val held by the bottom 50%.
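Example (a sketch; "income" and "s006" are assumed column names):

    mdf.t10_b50(df, "income", w="s006")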
166 | 167 | """ 168 | t10 = top_10_pct_share(df, col, w, groupby) 169 | b50 = bottom_50_pct_share(df, col, w, groupby) 170 | return t10 / b50 171 | -------------------------------------------------------------------------------- /microdf/io.py: -------------------------------------------------------------------------------- 1 | import io 2 | import zipfile 3 | import requests 4 | import pandas as pd 5 | 6 | HEADER = { 7 | "User-Agent": 8 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) " + 9 | "AppleWebKit/537.36 (KHTML, like Gecko) " + 10 | "Chrome/50.0.2661.102 Safari/537.36" 11 | } 12 | 13 | 14 | def read_stata_zip(url: str, **kwargs) -> pd.DataFrame: 15 | """Reads zipped Stata file by URL. 16 | 17 | From https://stackoverflow.com/a/59122689/1840471 18 | 19 | Pending native support in 20 | https://github.com/pandas-dev/pandas/issues/26599. 21 | 22 | :param url: URL string of .zip file containing a single 23 | .dta file. 24 | :param **kwargs: Arguments passed to pandas.read_stata(). 25 | :returns: DataFrame. 26 | 27 | """ 28 | r = requests.get(url, headers=HEADER) 29 | data = io.BytesIO(r.content) 30 | with zipfile.ZipFile(data) as archive: 31 | with archive.open(archive.namelist()[0]) as stata: 32 | return pd.read_stata(stata, **kwargs) 33 | -------------------------------------------------------------------------------- /microdf/poverty.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def fpl(people: int): 6 | """Calculates the federal poverty guideline for a household of a certain 7 | size. 8 | 9 | :param XTOT: The number of people in the household. 10 | :param people: returns: The federal poverty guideline for the contiguous 11 | 48 states. 12 | :returns: The federal poverty guideline for the contiguous 48 states. 13 | 14 | """ 15 | return 7820 + 4320 * people 16 | 17 | 18 | def poverty_rate( 19 | df: pd.DataFrame, income: str, threshold: str, w: str = None 20 | ) -> float: 21 | """Calculate poverty rate, i.e., the population share with income 22 | below their poverty threshold. 23 | 24 | :param df: DataFrame with income, threshold, and possibly weight columns 25 | for each person/household. 26 | :type df: pd.DataFrame 27 | :param income: Column indicating income. 28 | :type income: str 29 | :param threshold: Column indicating threshold. 30 | :type threshold: str 31 | :param w: Column indicating weight, defaults to None (unweighted). 32 | :type w: str, optional 33 | :return: Poverty rate between zero and one. 34 | :rtype: float 35 | """ 36 | pov = df[income] < df[threshold] 37 | if w is None: 38 | return pov.mean() 39 | return (pov * df[w]).sum() / df[w].sum() 40 | 41 | 42 | def deep_poverty_rate( 43 | df: pd.DataFrame, income: str, threshold: str, w: str = None 44 | ) -> float: 45 | """Calculate deep poverty rate, i.e., the population share with income 46 | below half their poverty threshold. 47 | 48 | :param df: DataFrame with income, threshold, and possibly weight columns 49 | for each person/household. 50 | :type df: pd.DataFrame 51 | :param income: Column indicating income. 52 | :type income: str 53 | :param threshold: Column indicating threshold. 54 | :type threshold: str 55 | :param w: Column indicating weight, defaults to None (unweighted). 56 | :type w: str, optional 57 | :return: Deep poverty rate between zero and one. 
58 | :rtype: float 59 | """ 60 | pov = df[income] < df[threshold] / 2 61 | if w is None: 62 | return pov.mean() 63 | return (pov * df[w]).sum() / df[w].sum() 64 | 65 | 66 | def poverty_gap( 67 | df: pd.DataFrame, income: str, threshold: str, w: str = None 68 | ) -> float: 69 | """Calculate poverty gap, i.e., the total gap between income and poverty 70 | thresholds for all people in poverty. 71 | 72 | :param df: DataFrame with income, threshold, and possibly weight columns 73 | for each household (data should represent households, not persons). 74 | :type df: pd.DataFrame 75 | :param income: Column indicating income. 76 | :type income: str 77 | :param threshold: Column indicating threshold. 78 | :type threshold: str 79 | :param w: Column indicating weight, defaults to None (unweighted). 80 | :type w: str, optional 81 | :return: Poverty gap. 82 | :rtype: float 83 | """ 84 | gap = np.maximum(df[threshold] - df[income], 0) 85 | if w is None: 86 | return gap.sum() 87 | return (gap * df[w]).sum() 88 | 89 | 90 | def squared_poverty_gap( 91 | df: pd.DataFrame, income: str, threshold: str, w: str = None 92 | ) -> float: 93 | """Calculate squared poverty gap, i.e., the total squared gap between 94 | income and poverty thresholds for all people in poverty. 95 | Also known as poverty severity index. 96 | 97 | :param df: DataFrame with income, threshold, and possibly weight columns 98 | for each household (data should represent households, not persons). 99 | :type df: pd.DataFrame 100 | :param income: Column indicating income. 101 | :type income: str 102 | :param threshold: Column indicating threshold. 103 | :type threshold: str 104 | :param w: Column indicating weight, defaults to None (unweighted). 105 | :type w: str, optional 106 | :return: Squared poverty gap. 107 | :rtype: float 108 | """ 109 | gap = np.maximum(df[threshold] - df[income], 0) 110 | sq_gap = np.power(gap, 2) 111 | if w is None: 112 | return sq_gap.sum() 113 | return (sq_gap * df[w]).sum() 114 | 115 | 116 | def deep_poverty_gap( 117 | df: pd.DataFrame, income: str, threshold: str, w: str = None 118 | ) -> float: 119 | """Calculate deep poverty gap, i.e., the total gap between income and 120 | halved poverty thresholds for all people in deep poverty. 121 | 122 | :param df: DataFrame with income, threshold, and possibly weight columns 123 | for each household (data should represent households, not persons). 124 | :type df: pd.DataFrame 125 | :param income: Column indicating income. 126 | :type income: str 127 | :param threshold: Column indicating threshold. 128 | :type threshold: str 129 | :param w: Column indicating weight, defaults to None (unweighted). 130 | :type w: str, optional 131 | :return: Deep poverty gap. 132 | :rtype: float 133 | """ 134 | gap = np.maximum((df[threshold] / 2) - df[income], 0) 135 | if w is None: 136 | return gap.sum() 137 | return (gap * df[w]).sum() 138 | -------------------------------------------------------------------------------- /microdf/style.py: -------------------------------------------------------------------------------- 1 | TITLE_COLOR = "#212121" 2 | AXIS_COLOR = "#757575" 3 | GRID_COLOR = "#eeeeee" # Previously lighter #f5f5f5. 4 | DPI = 200 5 | 6 | 7 | def set_plot_style(dpi: int = DPI): 8 | """Set plot style. 9 | 10 | :param dpi: DPI for saving and displaying figures, defaults to microdf.DPI 11 | (200). 
12 | :type dpi: int, optional
13 | """
14 | try:
15 | import seaborn as sns
16 | import matplotlib as mpl
17 | import matplotlib.font_manager as fm
18 | except ImportError:
19 | raise ImportError(
20 | "The function you've called requires extra dependencies. " +
21 | "Please install microdf with the 'charts' extra by running " +
22 | "'pip install microdf[charts]'"
23 | )
24 | 
25 | sns.set_style("white")
26 | 
27 | # Set up Roboto. Must be downloaded in the current directory.
28 | # See https://stackoverflow.com/a/51844978/1840471. (createFontList is removed in matplotlib >= 3.5; fm.fontManager.addfont is its replacement there.)
29 | fm.fontManager.ttflist += fm.createFontList(["Roboto-Regular.ttf"])
30 | 
31 | STYLE = {
32 | "savefig.dpi": dpi,
33 | "figure.dpi": dpi,
34 | "figure.figsize": (6.4, 4.8), # Default.
35 | "font.sans-serif": "Roboto",
36 | "font.family": "sans-serif",
37 | # Set title text color to dark gray (https://material.io/color) not
38 | # black.
39 | "text.color": TITLE_COLOR,
40 | # Axis titles and tick marks are medium gray.
41 | "axes.labelcolor": AXIS_COLOR,
42 | "xtick.color": AXIS_COLOR,
43 | "ytick.color": AXIS_COLOR,
44 | # Grid is light gray.
45 | "axes.grid": True,
46 | "grid.color": GRID_COLOR,
47 | # Equivalent to seaborn.despine(left=True, bottom=True).
48 | "axes.spines.left": False,
49 | "axes.spines.right": False,
50 | "axes.spines.top": False,
51 | "axes.spines.bottom": False,
52 | }
53 | 
54 | mpl.rcParams.update(STYLE)
55 | 
--------------------------------------------------------------------------------
/microdf/tax.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | 
4 | 
5 | def mtr(val, brackets, rates):
6 | """Calculates the marginal tax rate applied to a value depending on a
7 | tax schedule.
8 | 
9 | :param val: Value to assess tax on, e.g. wealth or income (list or Series).
10 | :param brackets: Left side of each bracket (list or Series).
11 | :param rates: Rate corresponding to each bracket.
12 | :returns: Series of the size of val representing the marginal tax rate.
13 | 
14 | """
15 | df_tax = pd.DataFrame({"brackets": brackets, "rates": rates})
16 | df_tax["base_tax"] = (
17 | df_tax.brackets.sub(df_tax.brackets.shift(fill_value=0))
18 | .mul(df_tax.rates.shift(fill_value=0))
19 | .cumsum()
20 | )
21 | rows = df_tax.brackets.searchsorted(val, side="right") - 1
22 | income_bracket_df = df_tax.loc[rows].reset_index(drop=True)
23 | return income_bracket_df.rates
24 | 
25 | 
26 | def tax_from_mtrs(
27 | val,
28 | brackets,
29 | rates,
30 | avoidance_rate=0,
31 | avoidance_elasticity=0,
32 | avoidance_elasticity_flat=0,
33 | ):
34 | """Calculates tax liability based on a marginal tax rate schedule.
35 | 
36 | :param val: Value to assess tax on, e.g. wealth or income (list or Series).
37 | :param brackets: Left side of each bracket (list or Series).
38 | :param rates: Rate corresponding to each bracket.
39 | :param avoidance_rate: Constant avoidance/evasion rate as a decimal
40 | fraction (e.g., 0.1 for 10%). Defaults to zero.
41 | :param avoidance_elasticity: Avoidance/evasion elasticity.
42 | Response of log taxable value with respect
43 | to tax rate.
44 | Defaults to zero. Should be positive.
45 | :param avoidance_elasticity_flat: Response of taxable value with respect
46 | to tax rate.
47 | Use avoidance_elasticity in most cases.
48 | Defaults to zero. Should be positive.
49 | :returns: Series of tax liabilities with the same size as val.
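Example (values mirror this package's test suite): with brackets
[0, 10e3] and rates [0, 0.1], an income of 20e3 owes
(20e3 - 10e3) * 0.1 = 1e3, and an income of 5e3 owes nothing.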
50 | 51 | """ 52 | assert ( 53 | avoidance_rate == 0 54 | or avoidance_elasticity == 0 55 | or avoidance_elasticity_flat == 0 56 | ), "Cannot supply multiple avoidance parameters." 57 | assert ( 58 | avoidance_elasticity >= 0 59 | ), "Provide nonnegative avoidance_elasticity." 60 | df_tax = pd.DataFrame({"brackets": brackets, "rates": rates}) 61 | df_tax["base_tax"] = ( 62 | df_tax.brackets.sub(df_tax.brackets.shift(fill_value=0)) 63 | .mul(df_tax.rates.shift(fill_value=0)) 64 | .cumsum() 65 | ) 66 | if avoidance_rate == 0: # Only need MTRs if elasticity is supplied. 67 | mtrs = mtr(val, brackets, rates) 68 | if avoidance_elasticity > 0: 69 | avoidance_rate = 1 - np.exp(-avoidance_elasticity * mtrs) 70 | if avoidance_elasticity_flat > 0: 71 | avoidance_rate = avoidance_elasticity_flat * mtrs 72 | taxable = pd.Series(val) * (1 - avoidance_rate) 73 | rows = df_tax.brackets.searchsorted(taxable, side="right") - 1 74 | income_bracket_df = df_tax.loc[rows].reset_index(drop=True) 75 | return ( 76 | pd.Series(taxable) 77 | .sub(income_bracket_df.brackets) 78 | .mul(income_bracket_df.rates) 79 | .add(income_bracket_df.base_tax) 80 | ) 81 | -------------------------------------------------------------------------------- /microdf/taxcalc.py: -------------------------------------------------------------------------------- 1 | import microdf as mdf 2 | from microdf._optional import import_optional_dependency 3 | 4 | 5 | def static_baseline_calc(recs, year): 6 | """Creates a static Calculator object. 7 | 8 | :param recs: Records object. 9 | :param year: Year to advance to. 10 | :returns: Calculator object. 11 | 12 | """ 13 | tc = import_optional_dependency("taxcalc") 14 | calc = tc.Calculator(records=recs, policy=tc.Policy()) 15 | calc.advance_to_year(year) 16 | calc.calc_all() 17 | return calc 18 | 19 | 20 | def add_weighted_metrics(df, metric_vars, w="s006", divisor=1e6, suffix="_m"): 21 | """Adds weighted metrics in millions to a Tax-Calculator pandas DataFrame. 22 | 23 | Columns are renamed to *_m. 24 | 25 | :param df: A pandas DataFrame containing Tax-Calculator data. 26 | :param metric_vars: A list of column names to weight, or a single column 27 | name. 28 | :param w: Weight column. Defaults to s006. 29 | :param divisor: Number by which the product is divided. Defaults to 1e6. 30 | :param suffix: Suffix to add to each weighted total. Defaults to '_m' 31 | to match divisor default of 1e6. 32 | :returns: Nothing. Weighted columns are added in place. 33 | 34 | """ 35 | df[w + suffix] = df[w] / divisor 36 | metric_vars = mdf.listify(metric_vars) 37 | for metric_var in metric_vars: 38 | df[metric_var + suffix] = df[metric_var] * df[w + suffix] 39 | 40 | 41 | def n65(age_head, age_spouse, elderly_dependents): 42 | """Calculates number of people in the tax unit age 65 or older. 43 | 44 | :param age_head: Series representing age_head from taxcalc data. 45 | :param age_spouse: Series representing age_spouse from taxcalc data. 46 | :param elderly_dependents: Series representing elderly_dependents from 47 | taxcalc data. 48 | :returns: Series representing the number of people age 65 or older. 49 | 50 | """ 51 | return ( 52 | (age_head >= 65).astype(int) 53 | + (age_spouse >= 65).astype(int) 54 | + elderly_dependents 55 | ) 56 | 57 | 58 | def calc_df( 59 | records=None, 60 | policy=None, 61 | year=2020, 62 | reform=None, 63 | group_vars=None, 64 | metric_vars=None, 65 | group_n65=False, 66 | ): 67 | """Creates a pandas DataFrame for given Tax-Calculator data. 
68 | 
69 | s006 is always included, and RECID is used as an index.
70 | 
71 | :param records: An optional Records object. If not provided, uses CPS
72 | records. (Default value = None)
73 | :param policy: An optional Policy object. If not provided, uses default
74 | Policy.
75 | :param year: An optional year to advance to. If not provided, defaults to
76 | 2020.
77 | :param reform: An optional reform to implement for the Policy object.
78 | (Default value = None)
79 | :param group_vars: An optional list of column names to include in the
80 | DataFrame. (Default value = None)
81 | :param metric_vars: An optional list of column names to include and
82 | calculate weighted sums of (in millions named as *_m) in the DataFrame.
83 | (Default value = None)
84 | :param group_n65: Whether to calculate and group by n65. Defaults to False.
85 | :returns: A pandas DataFrame. market_income is also always calculated.
86 | 
87 | """
88 | tc = import_optional_dependency("taxcalc")
89 | # Assign defaults.
90 | if records is None:
91 | records = tc.Records.cps_constructor()
92 | if policy is None:
93 | policy = tc.Policy()
94 | if reform is not None:
95 | policy.implement_reform(reform)
96 | # Calculate.
97 | calc = tc.Calculator(records=records, policy=policy, verbose=False)
98 | calc.advance_to_year(year)
99 | calc.calc_all()
100 | # Get a deduplicated list of all columns.
101 | if group_n65:
102 | group_vars = (group_vars or []) + [
103 | "age_head",
104 | "age_spouse",
105 | "elderly_dependents",
106 | ]
107 | # Include expanded_income and benefits to produce market_income.
108 | all_cols = mdf.listify(
109 | [
110 | "RECID",
111 | "s006",
112 | "expanded_income",
113 | "aftertax_income",
114 | mdf.BENS,
115 | group_vars,
116 | metric_vars,
117 | ]
118 | )
119 | df = calc.dataframe(all_cols)
120 | # Create core elements.
121 | df["market_income"] = mdf.market_income(df)
122 | df["bens"] = df[mdf.BENS].sum(axis=1)
123 | df["tax"] = df.expanded_income - df.aftertax_income
124 | if group_n65:
125 | df["n65"] = n65(df.age_head, df.age_spouse, df.elderly_dependents)
126 | df.drop(
127 | ["age_head", "age_spouse", "elderly_dependents"],
128 | axis=1,
129 | inplace=True,
130 | )
131 | # Add calculated columns for metrics.
132 | mdf.add_weighted_metrics(df, metric_vars)
133 | # Set RECID to int and set it as index before returning.
134 | df["RECID"] = df.RECID.map(int)
135 | return df.set_index("RECID")
136 | 
137 | 
138 | def recalculate(df):
139 | """Recalculates fields in the DataFrame after components have changed.
140 | 
141 | :param df: DataFrame for use in microdf.
142 | :returns: Nothing. Updates the DataFrame in place.
143 | 
144 | """
145 | # Recalculate TPC's Expanded Cash Income measure.
146 | cols = df.columns
147 | if "tpc_eci" in cols:
148 | df.tpc_eci = mdf.tpc_eci(df)
149 | # Recalculate weighted metrics (anything ending in _m).
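# (Each *_m column is rebuilt from its unsuffixed source column, so
# e.g. aftertax_income_m is recomputed from aftertax_income; s006_m is
# excluded because add_weighted_metrics refreshes it from s006 itself.)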
150 | mcols = [c[:-2] for c in cols if c.endswith("_m") and c != "s006_m"]
151 | mdf.add_weighted_metrics(df, mcols)
152 | 
--------------------------------------------------------------------------------
/microdf/tests/__pycache__/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "/home/mghenis/anaconda3/bin/python3"
3 | }
--------------------------------------------------------------------------------
/microdf/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import pytest
4 | 
5 | 
6 | @pytest.fixture(scope="session")
7 | def tests_path():
8 | """ """
9 | return os.path.abspath(os.path.dirname(__file__))
10 | 
--------------------------------------------------------------------------------
/microdf/tests/test_compare.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | 
6 | import microdf as mdf
7 | 
8 | 
9 | def differences(actual, expected, f_actual, f_expected):
10 | """Check for differences between actual and expected DataFrames,
11 | pointing to their CSV files on failure.
12 | 
13 | :param actual: Actual DataFrame.
14 | :param expected: Expected DataFrame.
15 | :param f_actual: Filename of the actual CSV.
16 | :param f_expected: Filename of the expected CSV.
17 | """
18 | if not np.allclose(actual, expected):
19 | msg = "COMPARE RESULTS DIFFER\n"
20 | msg += "-------------------------------------------------\n"
21 | msg += "--- NEW RESULTS IN {} FILE ---\n"
22 | msg += "--- if new OK, copy {} to ---\n"
23 | msg += "--- {} ---\n"
24 | msg += "--- and rerun test. ---\n"
25 | msg += "-------------------------------------------------\n"
26 | raise ValueError(msg.format(f_actual, f_actual, f_expected))
27 | 
28 | 
29 | def test_percentile_agg_compare(tests_path):
30 | """
31 | :param tests_path: Folder path to write test results.
32 | """
33 | N = 1000
34 | np.random.seed(0)
35 | df = pd.DataFrame({"val": np.random.rand(N), "w": np.random.rand(N)})
36 | mdf.add_weighted_quantiles(df, "val", "w")
37 | percentile_sum = df.groupby("val_percentile")[["val", "w"]].sum()
38 | F_ACTUAL = "test_percentile_actual.csv"
39 | F_EXPECTED = "test_percentile_expected.csv"
40 | percentile_sum.to_csv(os.path.join(tests_path, F_ACTUAL))
41 | # Re-read as CSV to remove index and ensure CSVs are equal.
41 | actual = pd.read_csv(os.path.join(tests_path, F_ACTUAL)) 42 | expected = pd.read_csv(os.path.join(tests_path, F_EXPECTED)) 43 | differences(actual, expected, F_ACTUAL, F_EXPECTED) 44 | -------------------------------------------------------------------------------- /microdf/tests/test_generic.py: -------------------------------------------------------------------------------- 1 | from microdf.generic import MicroDataFrame, MicroSeries 2 | import numpy as np 3 | import microdf as mdf 4 | import pandas as pd 5 | 6 | 7 | def test_df_init(): 8 | arr = np.array([0, 1, 1]) 9 | w = np.array([3, 0, 9]) 10 | df = mdf.MicroDataFrame({"a": arr}, weights=w) 11 | assert df.a.mean() == np.average(arr, weights=w) 12 | 13 | df = mdf.MicroDataFrame() 14 | df["a"] = arr 15 | df.set_weights(w) 16 | assert df.a.mean() == np.average(arr, weights=w) 17 | 18 | df = mdf.MicroDataFrame() 19 | df["a"] = arr 20 | df["w"] = w 21 | df.set_weight_col("w") 22 | assert df.a.mean() == np.average(arr, weights=w) 23 | 24 | 25 | def test_series_getitem(): 26 | arr = np.array([0, 1, 1]) 27 | w = np.array([3, 0, 9]) 28 | s = mdf.MicroSeries(arr, weights=w) 29 | assert s[[1, 2]].sum() == np.sum(arr[[1, 2]] * w[[1, 2]]) 30 | 31 | assert s[1:3].sum() == np.sum(arr[1:3] * w[1:3]) 32 | 33 | 34 | def test_sum(): 35 | arr = np.array([0, 1, 1]) 36 | w = np.array([3, 0, 9]) 37 | series = mdf.MicroSeries(arr, weights=w) 38 | assert series.sum() == (arr * w).sum() 39 | 40 | arr = np.linspace(-20, 100, 100) 41 | w = np.linspace(1, 3, 100) 42 | series = mdf.MicroSeries(arr) 43 | series.set_weights(w) 44 | assert series.sum() == (arr * w).sum() 45 | 46 | # Verify that an error is thrown when passing weights of different size 47 | # from the values. 48 | w = np.linspace(1, 3, 101) 49 | series = mdf.MicroSeries(arr) 50 | try: 51 | series.set_weights(w) 52 | assert False 53 | except Exception: 54 | pass 55 | 56 | 57 | def test_mean(): 58 | arr = np.array([3, 0, 2]) 59 | w = np.array([4, 1, 1]) 60 | series = mdf.MicroSeries(arr, weights=w) 61 | assert series.mean() == np.average(arr, weights=w) 62 | 63 | arr = np.linspace(-20, 100, 100) 64 | w = np.linspace(1, 3, 100) 65 | series = mdf.MicroSeries(arr) 66 | series.set_weights(w) 67 | assert series.mean() == np.average(arr, weights=w) 68 | 69 | w = np.linspace(1, 3, 101) 70 | series = mdf.MicroSeries(arr) 71 | try: 72 | series.set_weights(w) 73 | assert False 74 | except Exception: 75 | pass 76 | 77 | 78 | def test_poverty_count(): 79 | arr = np.array([10000, 20000, 50000]) 80 | w = np.array([1123, 1144, 2211]) 81 | df = MicroDataFrame(weights=w) 82 | df["income"] = arr 83 | df["threshold"] = 16000 84 | assert df.poverty_count("income", "threshold") == w[0] 85 | 86 | 87 | def test_median(): 88 | # 1, 2, 3, 4, *4*, 4, 5, 5, 5 89 | arr = np.array([1, 2, 3, 4, 5]) 90 | w = np.array([1, 1, 1, 3, 3]) 91 | series = mdf.MicroSeries(arr, weights=w) 92 | assert series.median() == 4 93 | 94 | 95 | def test_unweighted_groupby(): 96 | df = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]}) 97 | assert (df.groupby("x").z.sum().values == np.array([5.0, 6.0])).all() 98 | 99 | 100 | def test_multiple_groupby(): 101 | df = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]}) 102 | assert (df.groupby(["x", "y"]).z.sum() == np.array([5, 6])).all() 103 | 104 | 105 | def test_concat(): 106 | df1 = mdf.MicroDataFrame({"x": [1, 2]}, weights=[3, 4]) 107 | df2 = mdf.MicroDataFrame({"y": [5, 6]}, weights=[7, 8]) 108 | # Verify that pd.concat returns DataFrame (probably no way to fix this). 
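# (pd.concat dispatches through pandas' own constructors, which don't
# carry the weights metadata, so the result degrades to a plain
# DataFrame; mdf.concat below is the weight-preserving alternative.)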
109 | pd_long = pd.concat([df1, df2])
110 | assert isinstance(pd_long, pd.DataFrame)
111 | assert not isinstance(pd_long, mdf.MicroDataFrame)
112 | # Verify that mdf.concat works.
113 | mdf_long = mdf.concat([df1, df2])
114 | assert isinstance(mdf_long, mdf.MicroDataFrame)
115 | # Weights should be preserved.
116 | assert mdf_long.weights.equals(pd.concat([df1.weights, df2.weights]))
117 | # Verify it works horizontally too (take the first set of weights).
118 | mdf_wide = mdf.concat([df1, df2], axis=1)
119 | assert isinstance(mdf_wide, mdf.MicroDataFrame)
120 | assert mdf_wide.weights.equals(df1.weights)
121 | 
122 | 
123 | def test_set_index():
124 | d = mdf.MicroDataFrame(dict(x=[1, 2, 3]), weights=[4, 5, 6])
125 | assert d.x.__class__ == MicroSeries
126 | d.index = [1, 2, 3]
127 | assert d.x.__class__ == MicroSeries
128 | 
129 | 
130 | def test_reset_index():
131 | d = mdf.MicroDataFrame(dict(x=[1, 2, 3]), weights=[4, 5, 6])
132 | assert d.reset_index().__class__ == MicroDataFrame
133 | 
134 | 
135 | def test_cumsum():
136 | s = mdf.MicroSeries([1, 2, 3], weights=[4, 5, 6])
137 | assert np.array_equal(s.cumsum().values, [4, 14, 32])
138 | 
139 | s = mdf.MicroSeries([2, 1, 3], weights=[5, 4, 6])
140 | assert np.array_equal(s.cumsum().values, [10, 14, 32])
141 | 
142 | s = mdf.MicroSeries([3, 1, 2], weights=[6, 4, 5])
143 | assert np.array_equal(s.cumsum().values, [18, 22, 32])
144 | 
145 | 
146 | def test_rank():
147 | s = mdf.MicroSeries([1, 2, 3], weights=[4, 5, 6])
148 | assert np.array_equal(s.rank().values, [4, 9, 15])
149 | 
150 | s = mdf.MicroSeries([3, 1, 2], weights=[6, 4, 5])
151 | assert np.array_equal(s.rank().values, [15, 4, 9])
152 | 
153 | s = mdf.MicroSeries([2, 1, 3], weights=[5, 4, 6])
154 | assert np.array_equal(s.rank().values, [9, 4, 15])
155 | 
156 | 
157 | def test_percentile_rank():
158 | s = mdf.MicroSeries([4, 2, 3, 1], weights=[20, 40, 20, 20])
159 | assert np.array_equal(s.percentile_rank().values, [100, 60, 80, 20])
160 | 
161 | 
162 | def test_quartile_rank():
163 | s = mdf.MicroSeries([4, 2, 3], weights=[25, 50, 25])
164 | assert np.array_equal(s.quartile_rank().values, [4, 2, 3])
165 | 
166 | 
167 | def test_quintile_rank():
168 | s = mdf.MicroSeries([4, 2, 3], weights=[20, 60, 20])
169 | assert np.array_equal(s.quintile_rank().values, [5, 3, 4])
170 | 
171 | 
172 | def test_decile_rank():
173 | s = mdf.MicroSeries(
174 | [5, 4, 3, 2, 1, 6, 7, 8, 9],
175 | weights=[10, 20, 10, 10, 10, 10, 10, 10, 10],
176 | )
177 | assert np.array_equal(s.decile_rank().values, [6, 5, 3, 2, 1, 7, 8, 9, 10])
178 | 
179 | 
180 | def test_copy_equals():
181 | d = mdf.MicroDataFrame(
182 | {"x": [1, 2], "y": [3, 4], "z": [5, 6]}, weights=[7, 8]
183 | )
184 | d_copy = d.copy()
185 | d_copy_diff_weights = d_copy.copy()
186 | d_copy_diff_weights.weights *= 2
187 | assert d.equals(d_copy)
188 | assert not d.equals(d_copy_diff_weights)
189 | # Same for a MicroSeries.
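# (d_copy_diff_weights matches d in values but not weights, so these
# asserts confirm that equals() compares weights as well as values.)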
190 | assert d.x.equals(d_copy.x) 191 | assert not d.x.equals(d_copy_diff_weights.x) 192 | 193 | 194 | def test_subset(): 195 | df = mdf.MicroDataFrame( 196 | {"x": [1, 2], "y": [3, 4], "z": [5, 6]}, weights=[7, 8] 197 | ) 198 | df_no_z = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4]}, weights=[7, 8]) 199 | assert df[["x", "y"]].equals(df_no_z) 200 | df_no_z_diff_weights = df_no_z.copy() 201 | df_no_z_diff_weights.weights += 1 202 | assert not df[["x", "y"]].equals(df_no_z_diff_weights) 203 | 204 | 205 | def test_value_subset(): 206 | d = mdf.MicroDataFrame({"x": [1, 2, 3], "y": [1, 2, 2]}, weights=[4, 5, 6]) 207 | d2 = d[d.y > 1] 208 | assert d2.y.shape == d2.weights.shape 209 | -------------------------------------------------------------------------------- /microdf/tests/test_inequality.py: -------------------------------------------------------------------------------- 1 | import microdf as mdf 2 | 3 | import pandas as pd 4 | 5 | 6 | def test_top_pct(): 7 | x = list(range(1, 11)) # 1 to 10. Sum = 10 * 11 / 2 = 55. 8 | df = pd.DataFrame({"x": x}) 9 | ms = mdf.MicroSeries(x) 10 | RES = 10 / 55 11 | assert mdf.top_10_pct_share(df, "x") == RES 12 | assert ms.top_10_pct_share() == RES 13 | x = list(range(1, 4)) 14 | df = pd.DataFrame({"x": x, "w": x}) 15 | ms = mdf.MicroSeries(x, weights=x) 16 | # This is equivalent to [1, 2, 2, 3, 3, 3] 17 | # Sum = 14, top half is 9. 18 | RES = 9 / 14 19 | assert mdf.top_50_pct_share(df, "x", "w") == RES 20 | assert ms.top_50_pct_share() == RES 21 | -------------------------------------------------------------------------------- /microdf/tests/test_io.py: -------------------------------------------------------------------------------- 1 | import microdf as mdf 2 | 3 | 4 | def test_read_stata_zip(): 5 | """ """ 6 | SCF2016 = "https://www.federalreserve.gov/econres/files/scfp2016s.zip" 7 | COLS = ["wgt", "networth"] 8 | df = mdf.read_stata_zip(SCF2016, columns=COLS) 9 | assert df.columns.tolist() == COLS 10 | assert df.shape[0] > 0 11 | -------------------------------------------------------------------------------- /microdf/tests/test_optional_dependency.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import types 3 | 4 | import pytest 5 | 6 | from microdf._optional import VERSIONS, import_optional_dependency 7 | 8 | 9 | def test_import_optional(): 10 | """ """ 11 | match = "Missing .*notapackage.* pip .* conda .* notapackage" 12 | with pytest.raises(ImportError, match=match): 13 | import_optional_dependency("notapackage") 14 | 15 | result = import_optional_dependency("notapackage", raise_on_missing=False) 16 | assert result is None 17 | 18 | 19 | def test_xlrd_version_fallback(): 20 | """ """ 21 | pytest.importorskip("xlrd") 22 | import_optional_dependency("xlrd") 23 | 24 | 25 | def test_bad_version(): 26 | """ """ 27 | name = "fakemodule" 28 | module = types.ModuleType(name) 29 | module.__version__ = "0.9.0" 30 | sys.modules[name] = module 31 | VERSIONS[name] = "1.0.0" 32 | 33 | match = "microdf requires .*1.0.0.* of .fakemodule.*'0.9.0'" 34 | with pytest.raises(ImportError, match=match): 35 | import_optional_dependency("fakemodule") 36 | 37 | with pytest.warns(UserWarning): 38 | result = import_optional_dependency("fakemodule", on_version="warn") 39 | assert result is None 40 | 41 | module.__version__ = "1.0.0" # exact match is OK 42 | result = import_optional_dependency("fakemodule") 43 | assert result is module 44 | 45 | 46 | def test_no_version_raises(): 47 | """ """ 48 | name = "fakemodule" 49 | 
module = types.ModuleType(name) 50 | sys.modules[name] = module 51 | VERSIONS[name] = "1.0.0" 52 | 53 | with pytest.raises(ImportError, match="Can't determine .* fakemodule"): 54 | import_optional_dependency(name) 55 | -------------------------------------------------------------------------------- /microdf/tests/test_percentile_actual.csv: -------------------------------------------------------------------------------- 1 | val_percentile,val,w 2 | 1,0.04936696707980226,5.0030480466019815 3 | 2,0.17830779834685495,5.114684559704431 4 | 3,0.2857988855674821,5.003202737366776 5 | 4,0.33805302460864795,4.733066370107501 6 | 5,0.5703591960673162,5.239285359053462 7 | 6,0.6096331244255117,5.429420284539545 8 | 7,0.5985039643349068,4.348564066721563 9 | 8,0.7510884464659845,5.440450569600465 10 | 9,1.0876105767407074,5.499343558417599 11 | 10,0.8899045403172029,5.1538348751709435 12 | 11,1.394683443411786,5.23502927227519 13 | 12,0.9305982112361821,4.417722946149524 14 | 13,0.9650569353812546,5.348799899633306 15 | 14,1.3214822270450117,5.3325704209630524 16 | 15,1.2715100776802908,5.061289605022857 17 | 16,1.6720424048893725,5.305386843295495 18 | 17,1.9424737512447825,5.255981017771979 19 | 18,1.0348899262653468,4.336843836714941 20 | 19,1.7975116284606523,5.1773144696335915 21 | 20,2.1006171498258643,5.589603555523628 22 | 21,1.8709513510005413,5.335531722226071 23 | 22,1.7207120623536305,4.695751363757509 24 | 23,2.00187368264572,5.054306807303892 25 | 24,2.313857097420784,4.884140680676319 26 | 25,2.896700637851371,5.5236380227234845 27 | 26,2.489820947315488,4.791953107597212 28 | 27,2.8411561635459712,5.5661465432344635 29 | 28,2.4261357418555654,4.446073644558097 30 | 29,3.357591023039818,5.291769801304535 31 | 30,2.314506073869091,5.149625279108836 32 | 31,3.551962771845865,5.402587912747316 33 | 32,4.002233897939344,5.093801654006394 34 | 33,3.527022677246467,5.10624451980372 35 | 34,2.654138701549258,5.043397568469286 36 | 35,2.7068858476732176,4.636131024815089 37 | 36,4.511798986257631,5.713008105710415 38 | 37,3.583675870814159,4.894200392814213 39 | 38,3.669836624431939,5.033490962922036 40 | 39,3.363028108896736,5.715252956264232 41 | 40,2.269128705743366,4.687099540814201 42 | 41,3.0962247363582,5.0205442523389685 43 | 42,5.586938042279173,5.2375613191718084 44 | 43,4.062902694908744,5.482945818496652 45 | 44,3.3160969156123503,4.938943405325701 46 | 45,4.241531465516574,5.242988142385608 47 | 46,5.632990791952207,4.604524871392653 48 | 47,4.003109598155357,5.449983106760843 49 | 48,3.627194217198549,4.5252648036434655 50 | 49,4.574489241426403,5.827965751590956 51 | 50,3.26114249800974,5.07472388942889 52 | 51,5.770693937000146,4.924561771632174 53 | 52,6.928803378451052,5.390871201576556 54 | 53,5.605614198095166,4.867506840251817 55 | 54,5.712794397822728,4.802840821886715 56 | 55,6.330048423625907,5.043215026134606 57 | 56,6.49917632389808,5.549363062730124 58 | 57,4.455249905003633,5.388617265562158 59 | 58,3.9747504049202647,4.804030090560459 60 | 59,4.591734071798827,4.79723029172527 61 | 60,9.365137265130915,5.674437752031215 62 | 61,7.18714959246965,4.968027795024001 63 | 62,5.514056987390571,5.21537151235826 64 | 63,6.199762618472243,5.007944951253934 65 | 64,4.396249378665468,5.013683226930631 66 | 65,7.678394207429429,5.4324805777516385 67 | 66,8.535490990338737,4.983177889646899 68 | 67,7.380492997015747,5.160276660633111 69 | 68,7.495870406959786,5.166303035743688 70 | 69,6.23459148472051,4.800326067433167 71 | 70,4.8795947527845165,5.144612976891027 72 | 
71,6.331517082169763,4.923377667981699 73 | 72,10.028408222423991,4.7524585511425315 74 | 73,7.267010600317143,5.928033173228412 75 | 74,5.856718614341236,5.153535683492981 76 | 75,6.6513086587256645,4.4540875011243095 77 | 76,6.009374270984309,5.563696325737205 78 | 77,6.903278042059779,4.9610776312587195 79 | 78,9.33170057141403,5.442361429739662 80 | 79,4.741840898628048,4.385127095815581 81 | 80,10.441998036333615,5.681838547736346 82 | 81,8.15549502111278,5.3122709245136726 83 | 82,8.264309038308452,4.777889650212625 84 | 83,5.887391058178539,4.86549347601554 85 | 84,7.6746735994204345,5.1933876664286505 86 | 85,8.617091500906668,5.330823383917326 87 | 86,5.205242021670235,5.116263366906492 88 | 87,6.110525427774986,5.0877466081133615 89 | 88,10.555886286183668,4.846433766268649 90 | 89,9.79517076482145,5.551267106013298 91 | 90,8.999880920529858,4.8964913320490835 92 | 91,7.269834366073169,5.094073872052731 93 | 92,8.288552906946475,5.482297252107353 94 | 93,9.264743724824317,4.4854074855836235 95 | 94,8.42071960033443,5.3698127342621085 96 | 95,10.393061379249968,5.397446566018766 97 | 96,9.545057271504412,5.083199029497632 98 | 97,9.61390230771744,5.147118076577214 99 | 98,8.719700395822183,4.537389380232294 100 | 99,14.670850665733171,5.177534031309767 101 | 100,12.905505941170981,5.722129244713919 102 | -------------------------------------------------------------------------------- /microdf/tests/test_percentile_expected.csv: -------------------------------------------------------------------------------- 1 | val_percentile,val,w 2 | 1,0.04936696707980226,5.0030480466019815 3 | 2,0.17830779834685495,5.114684559704431 4 | 3,0.2857988855674821,5.003202737366776 5 | 4,0.33805302460864795,4.733066370107501 6 | 5,0.5703591960673162,5.239285359053462 7 | 6,0.6096331244255117,5.429420284539545 8 | 7,0.5985039643349068,4.348564066721563 9 | 8,0.7510884464659845,5.440450569600465 10 | 9,1.0876105767407074,5.499343558417599 11 | 10,0.8899045403172029,5.1538348751709435 12 | 11,1.394683443411786,5.23502927227519 13 | 12,0.9305982112361821,4.417722946149524 14 | 13,0.9650569353812546,5.348799899633306 15 | 14,1.3214822270450117,5.3325704209630524 16 | 15,1.2715100776802908,5.061289605022857 17 | 16,1.6720424048893725,5.305386843295495 18 | 17,1.9424737512447825,5.255981017771979 19 | 18,1.0348899262653468,4.336843836714941 20 | 19,1.7975116284606523,5.1773144696335915 21 | 20,2.1006171498258643,5.589603555523628 22 | 21,1.8709513510005413,5.335531722226071 23 | 22,1.7207120623536305,4.695751363757509 24 | 23,2.00187368264572,5.054306807303892 25 | 24,2.313857097420784,4.884140680676319 26 | 25,2.896700637851371,5.5236380227234845 27 | 26,2.489820947315488,4.791953107597212 28 | 27,2.8411561635459712,5.5661465432344635 29 | 28,2.4261357418555654,4.446073644558097 30 | 29,3.357591023039818,5.291769801304535 31 | 30,2.314506073869091,5.149625279108836 32 | 31,3.551962771845865,5.402587912747316 33 | 32,4.002233897939344,5.093801654006394 34 | 33,3.527022677246467,5.10624451980372 35 | 34,2.654138701549258,5.043397568469286 36 | 35,2.7068858476732176,4.636131024815089 37 | 36,4.511798986257631,5.713008105710415 38 | 37,3.583675870814159,4.894200392814213 39 | 38,3.669836624431939,5.033490962922036 40 | 39,3.363028108896736,5.715252956264232 41 | 40,2.269128705743366,4.687099540814201 42 | 41,3.0962247363582,5.0205442523389685 43 | 42,5.586938042279173,5.2375613191718084 44 | 43,4.062902694908744,5.482945818496652 45 | 44,3.3160969156123503,4.938943405325701 46 | 
45,4.241531465516574,5.242988142385608 47 | 46,5.632990791952207,4.604524871392653 48 | 47,4.003109598155357,5.449983106760843 49 | 48,3.627194217198549,4.5252648036434655 50 | 49,4.574489241426403,5.827965751590956 51 | 50,3.26114249800974,5.07472388942889 52 | 51,5.770693937000146,4.924561771632174 53 | 52,6.928803378451052,5.390871201576556 54 | 53,5.605614198095166,4.867506840251817 55 | 54,5.712794397822728,4.802840821886715 56 | 55,6.330048423625907,5.043215026134606 57 | 56,6.49917632389808,5.549363062730124 58 | 57,4.455249905003633,5.388617265562158 59 | 58,3.9747504049202647,4.804030090560459 60 | 59,4.591734071798827,4.79723029172527 61 | 60,9.365137265130915,5.674437752031215 62 | 61,7.18714959246965,4.968027795024001 63 | 62,5.514056987390571,5.21537151235826 64 | 63,6.199762618472243,5.007944951253934 65 | 64,4.396249378665468,5.013683226930631 66 | 65,7.678394207429429,5.4324805777516385 67 | 66,8.535490990338737,4.983177889646899 68 | 67,7.380492997015747,5.160276660633111 69 | 68,7.495870406959786,5.166303035743688 70 | 69,6.23459148472051,4.800326067433167 71 | 70,4.8795947527845165,5.144612976891027 72 | 71,6.331517082169763,4.923377667981699 73 | 72,10.028408222423991,4.7524585511425315 74 | 73,7.267010600317143,5.928033173228412 75 | 74,5.856718614341236,5.153535683492981 76 | 75,6.6513086587256645,4.4540875011243095 77 | 76,6.009374270984309,5.563696325737205 78 | 77,6.903278042059779,4.9610776312587195 79 | 78,9.33170057141403,5.442361429739662 80 | 79,4.741840898628048,4.385127095815581 81 | 80,10.441998036333615,5.681838547736346 82 | 81,8.15549502111278,5.3122709245136726 83 | 82,8.264309038308452,4.777889650212625 84 | 83,5.887391058178539,4.86549347601554 85 | 84,7.6746735994204345,5.1933876664286505 86 | 85,8.617091500906668,5.330823383917326 87 | 86,5.205242021670235,5.116263366906492 88 | 87,6.110525427774986,5.0877466081133615 89 | 88,10.555886286183668,4.846433766268649 90 | 89,9.79517076482145,5.551267106013298 91 | 90,8.999880920529858,4.8964913320490835 92 | 91,7.269834366073169,5.094073872052731 93 | 92,8.288552906946475,5.482297252107353 94 | 93,9.264743724824317,4.4854074855836235 95 | 94,8.42071960033443,5.3698127342621085 96 | 95,10.393061379249968,5.397446566018766 97 | 96,9.545057271504412,5.083199029497632 98 | 97,9.61390230771744,5.147118076577214 99 | 98,8.719700395822183,4.537389380232294 100 | 99,14.670850665733171,5.177534031309767 101 | 100,12.905505941170981,5.722129244713919 102 | -------------------------------------------------------------------------------- /microdf/tests/test_poverty.py: -------------------------------------------------------------------------------- 1 | import microdf as mdf 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | df = pd.DataFrame( 7 | { 8 | "income": [-10, 0, 10, 20], 9 | "threshold": [15, 10, 15, 10], 10 | "weight": [1, 2, 3, 4], 11 | } 12 | ) 13 | md = mdf.MicroDataFrame(df[["income", "threshold"]], weights=df.weight) 14 | 15 | 16 | def test_poverty_rate(): 17 | # Unweighted 18 | assert np.allclose(mdf.poverty_rate(df, "income", "threshold"), 3 / 4) 19 | # Weighted 20 | assert np.allclose( 21 | mdf.poverty_rate(df, "income", "threshold", "weight"), 6 / 10 22 | ) 23 | assert np.allclose(md.poverty_rate("income", "threshold"), 6 / 10) 24 | 25 | 26 | def test_deep_poverty_rate(): 27 | # Unweighted 28 | assert np.allclose(mdf.deep_poverty_rate(df, "income", "threshold"), 2 / 4) 29 | # Weighted 30 | assert np.allclose( 31 | mdf.deep_poverty_rate(df, "income", "threshold", "weight"), 3 / 10 32 | ) 33 
| assert np.allclose(md.deep_poverty_rate("income", "threshold"), 3 / 10)
34 | 
35 | 
36 | def test_poverty_gap():
37 | # Unweighted
38 | assert np.allclose(mdf.poverty_gap(df, "income", "threshold"), 25 + 10 + 5)
39 | # Weighted
40 | RES = 25 * 1 + 10 * 2 + 5 * 3
41 | assert np.allclose(
42 | mdf.poverty_gap(df, "income", "threshold", "weight"), RES
43 | )
44 | assert np.allclose(md.poverty_gap("income", "threshold"), RES)
45 | 
46 | 
47 | def test_squared_poverty_gap():
48 | # Unweighted
49 | assert np.allclose(
50 | mdf.squared_poverty_gap(df, "income", "threshold"),
51 | 25 ** 2 + 10 ** 2 + 5 ** 2,
52 | )
53 | # Weighted
54 | RES = 1 * (25 ** 2) + 2 * (10 ** 2) + 3 * (5 ** 2)
55 | assert np.allclose(
56 | mdf.squared_poverty_gap(df, "income", "threshold", "weight"), RES,
57 | )
58 | assert np.allclose(md.squared_poverty_gap("income", "threshold"), RES)
59 | 
60 | 
61 | def test_deep_poverty_gap():
62 | # Unweighted
63 | assert np.allclose(
64 | mdf.deep_poverty_gap(df, "income", "threshold"), 17.5 + 5 + 0 + 0
65 | )
66 | # Weighted
67 | RES = 17.5 * 1 + 5 * 2 + 0 * 3 + 0 * 4
68 | assert np.allclose(
69 | mdf.deep_poverty_gap(df, "income", "threshold", "weight"), RES
70 | )
71 | # Same in MicroDataFrame.
72 | assert np.allclose(md.deep_poverty_gap("income", "threshold"), RES)
73 | 
--------------------------------------------------------------------------------
/microdf/tests/test_quantile_chg.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | import microdf as mdf
4 | 
5 | 
6 | V1 = [1, 2, 3]
7 | V2 = [4, 5, 6]
8 | W1 = [7, 8, 9]
9 | W2 = [10, 11, 12]
10 | DF1 = pd.DataFrame({"v": V1, "w": W1})
11 | DF2 = pd.DataFrame({"v": V2, "w": W2})
12 | 
13 | 
14 | def test_quantile_chg():
15 | mdf.quantile_chg(DF1, DF2, "v", "w", "v", "w")
16 | 
17 | 
18 | def test_quantile_pct_chg_plot():
19 | mdf.quantile_pct_chg_plot(DF1, DF2, "v", "w", "v", "w")
20 | 
--------------------------------------------------------------------------------
/microdf/tests/test_tax.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | 
5 | import microdf as mdf
6 | 
7 | 
8 | def test_tax():
9 | """ """
10 | # Consider a MTR schedule of 0% up to 10,000, then 10% after that.
11 | BRACKETS = [0, 10e3]
12 | RATES = [0, 0.1]
13 | INCOME = [0, 5e3, 10e3, 10e3 + 1, 20e3]
14 | EXPECTED = [0, 0, 0, 0.1, 1e3]
15 | res = mdf.tax_from_mtrs(INCOME, BRACKETS, RATES)
16 | pd.testing.assert_series_equal(res, pd.Series(EXPECTED))
17 | # Try with 10% avoidance.
18 | EXPECTED_10PCT_AVOIDANCE = [0, 0, 0, 0, 800.0]
19 | res_10pct_avoidance = mdf.tax_from_mtrs(INCOME, BRACKETS, RATES, 0.1)
20 | pd.testing.assert_series_equal(
21 | res_10pct_avoidance, pd.Series(EXPECTED_10PCT_AVOIDANCE)
22 | )
23 | # Try with avoidance elasticity of 2.
24 | EXPECTED_E2_AVOIDANCE = [
25 | 0,
26 | 0,
27 | 0,
28 | 0, # Taxable base becomes (10e3 + 1) * exp(-2 * 0.1), below 10e3.
29 | # Taxable base becomes 20e3 * (exp(-2 * 0.1)).
30 | 0.1 * (20e3 * np.exp(-0.2) - 10e3),
31 | ]
32 | res_e2_avoidance = mdf.tax_from_mtrs(
33 | INCOME, BRACKETS, RATES, avoidance_elasticity=2
34 | )
35 | pd.testing.assert_series_equal(
36 | res_e2_avoidance, pd.Series(EXPECTED_E2_AVOIDANCE)
37 | )
38 | # Try with flat avoidance elasticity of 2.
39 | EXPECTED_E2_AVOIDANCE_FLAT = [
40 | 0,
41 | 0,
42 | 0,
43 | 0, # Taxable base becomes (10e3 + 1) * (1 - 2 * 0.1)
44 | 600.0,
45 | ] # Taxable base becomes 20e3 * (1 - 2 * 0.1) = 16e3.
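# (Tax on the 16e3 base: (16e3 - 10e3) * 0.1 = 600, matching above.)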
46 | res_e2_avoidance_flat = mdf.tax_from_mtrs(
47 | INCOME, BRACKETS, RATES, avoidance_elasticity_flat=2
48 | )
49 | pd.testing.assert_series_equal(
50 | res_e2_avoidance_flat, pd.Series(EXPECTED_E2_AVOIDANCE_FLAT)
51 | )
52 | # Ensure error when passing both rate and elasticity.
53 | with pytest.raises(Exception):
54 | mdf.tax_from_mtrs(INCOME, BRACKETS, RATES, 0.1, 2)
55 | 
--------------------------------------------------------------------------------
/microdf/tests/test_taxcalc.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | import microdf as mdf
4 | 
5 | 
6 | try:
7 | import taxcalc as tc
8 | 
9 | _HAVE_TAXCALC = True
10 | except ImportError:
11 | _HAVE_TAXCALC = False
12 | 
13 | 
14 | def test_calc_df():
15 | """ """
16 | if not _HAVE_TAXCALC:
17 | pytest.skip("taxcalc is not installed")
18 | mdf.calc_df()
19 | 
20 | 
21 | def test_static_baseline_calc():
22 | """ """
23 | if not _HAVE_TAXCALC:
24 | pytest.skip("taxcalc is not installed")
25 | recs = tc.Records.cps_constructor()
26 | mdf.static_baseline_calc(recs, 2020)
27 | 
--------------------------------------------------------------------------------
/microdf/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | import microdf as mdf
4 | 
5 | 
6 | def test_cartesian_product():
7 | """ """
8 | res = mdf.cartesian_product(
9 | {"a": [1, 2, 3], "b": ["val1", "val2"], "c": [100, 101]}
10 | )
11 | EXPECTED = pd.DataFrame(
12 | {
13 | "a": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
14 | "b": [
15 | "val1",
16 | "val1",
17 | "val2",
18 | "val2",
19 | "val1",
20 | "val1",
21 | "val2",
22 | "val2",
23 | "val1",
24 | "val1",
25 | "val2",
26 | "val2",
27 | ],
28 | "c": [100, 101, 100, 101, 100, 101, 100, 101, 100, 101, 100, 101],
29 | }
30 | )
31 | pd.testing.assert_frame_equal(res, EXPECTED)
32 | 
33 | 
34 | def test_flatten():
35 | """ """
36 | L = [[[1, 2, 3], [4, 5]], 6]
37 | res = list(mdf.flatten(L))
38 | EXPECTED = [1, 2, 3, 4, 5, 6]
39 | assert res == EXPECTED
40 | 
--------------------------------------------------------------------------------
/microdf/tests/test_weighted.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 | 
4 | import microdf as mdf
5 | 
6 | 
7 | X = [1, 5, 2]
8 | Y = [0, -6, 3]
9 | W = [4, 1, 1]
10 | df = pd.DataFrame({"x": X, "y": Y, "w": W})
11 | ms = mdf.MicroSeries(X, weights=W)
12 | md = mdf.MicroDataFrame(df[["x", "y"]], weights=W)
13 | # Also make a version with groups.
14 | df2 = df.copy(deep=True)
15 | df2.x *= 2
16 | df2.y *= 1.5
17 | dfg = pd.concat([df, df2])
18 | dfg["g"] = ["a"] * 3 + ["b"] * 3
19 | mdg = mdf.MicroDataFrame(dfg[["x", "y", "g"]], weights=W)
20 | 
21 | 
22 | def test_weighted_quantile():
23 | Q = [0, 0.5, 1]
24 | mdf.weighted_quantile(df, "x", "w", Q).tolist()
25 | 
26 | 
27 | def test_weighted_median():
28 | assert mdf.weighted_median(df, "x") == 2
29 | mdf.weighted_median(df, "x", "w")
30 | # Test with groups.
31 | mdf.weighted_median(dfg, "x", "w", "g")
32 | 
33 | 
34 | def test_weighted_mean():
35 | # Test unweighted.
36 | assert mdf.weighted_mean(df, "x") == 8 / 3
37 | # Test weighted.
38 | assert mdf.weighted_mean(df, "x", "w") == 11 / 6
39 | # Test weighted with multiple columns.
40 | assert mdf.weighted_mean(df, ["x", "y"], "w").tolist() == [11 / 6, -3 / 6]
41 | # Test grouped.
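# (No asserts here; group "a" keeps the original values, so its
# weighted mean of x should be 11/6, while group "b" doubles x,
# giving 22/6. These calls just verify the grouped path runs.)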
42 | mdf.weighted_mean(dfg, "x", "w", "g")
43 | mdf.weighted_mean(dfg, ["x", "y"], "w", "g")
44 | 
45 | 
46 | def test_weighted_sum():
47 | # Test unweighted.
48 | assert mdf.weighted_sum(df, "x") == 8
49 | # Test weighted.
50 | assert mdf.weighted_sum(df, "x", "w") == 11
51 | # Test weighted with multiple columns.
52 | assert mdf.weighted_sum(df, ["x", "y"], "w").tolist() == [11, -3]
53 | # Test grouped.
54 | mdf.weighted_sum(dfg, "x", "w", "g")
55 | mdf.weighted_sum(dfg, ["x", "y"], "w", "g")
56 | 
57 | 
58 | def test_gini():
59 | # Test nothing breaks.
60 | ms.gini()
61 | # Unweighted.
62 | mdf.gini(df, "x")
63 | # Weighted
64 | mdf.gini(df, "x", "w")
65 | # Unweighted, grouped
66 | mdf.gini(dfg, "x", groupby="g")
67 | # Weighted, grouped
68 | mdf.gini(dfg, "x", "w", groupby="g")
69 | # Test old and new match.
70 | assert ms.gini() == mdf.gini(df, "x", "w")
71 | 
72 | 
73 | def test_add_weighted_quantiles():
74 | with pytest.deprecated_call():
75 | mdf.add_weighted_quantiles(df, "x", "w")
76 | 
--------------------------------------------------------------------------------
/microdf/ubi.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | import microdf as mdf
4 | 
5 | 
6 | def ubi_or_bens(
7 | df,
8 | ben_cols,
9 | max_ubi="max_ubi",
10 | ubi="ubi",
11 | bens="bens",
12 | update_income_measures=None,
13 | ):
14 | """Calculates whether a tax unit will take UBI or benefits,
15 | and adjusts values accordingly.
16 | 
17 | :param df: DataFrame.
18 | :param ben_cols: List of columns for benefits.
19 | :param max_ubi: Column name of the maximum UBI, before accounting
20 | for benefits. Defaults to 'max_ubi'.
21 | :param ubi: Column name to add representing the UBI. Defaults to 'ubi'.
22 | :param bens: Column name to add representing total benefits (after
23 | adjustment). Defaults to 'bens'.
24 | :param update_income_measures: List of income measures to update.
25 | Defaults to ['expanded_income', 'aftertax_income'].
26 | :returns: Nothing. Benefits in ben_cols are adjusted, ubi and bens columns
27 | are added, and expanded_income and aftertax_income are updated
28 | according to the net difference.
29 | 
30 | """
31 | if update_income_measures is None:
32 | update_income_measures = ["expanded_income", "aftertax_income"]
33 | # Prep list args.
34 | update_income_measures = mdf.listify(update_income_measures)
35 | total_bens = df[ben_cols].sum(axis=1)
36 | take_ubi = df[max_ubi] > total_bens
37 | df[ubi] = np.where(take_ubi, df[max_ubi], 0)
38 | for ben in ben_cols:
39 | df[ben] *= np.where(take_ubi, 0, 1)
40 | df[bens] = df[ben_cols].sum(axis=1)
41 | # Update expanded and aftertax income.
42 | diff = df[ubi] + df[bens] - total_bens
43 | for i in update_income_measures:
44 | df[i] += diff
45 | 
--------------------------------------------------------------------------------
/microdf/utils.py:
--------------------------------------------------------------------------------
1 | import collections
2 | 
3 | import pandas as pd
4 | 
5 | 
6 | def ordinal_label(n):
7 | """Creates ordinal label from number.
8 | 
9 | Adapted from https://stackoverflow.com/a/20007730/1840471.
10 | 
11 | :param n: Number.
12 | :returns: Ordinal label, e.g., 1st, 3rd, 24th, etc.
13 | 
14 | """
15 | n = int(n)
16 | ix = (n // 10 % 10 != 1) * (n % 10 < 4) * n % 10 # // so 11-13 map to "th".
17 | return "%d%s" % (n, "tsnrhtdd"[ix::4])
18 | 
19 | 
20 | def dedup_list(lst):
21 | """Remove duplicate items from a list.
22 | 
23 | :param lst: List.
24 | :returns: List with duplicate items removed from lst.
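Note that list(set(...)) does not preserve the original order.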
25 | 26 | """ 27 | return list(set(lst)) 28 | 29 | 30 | def listify(x, dedup=True): 31 | """Return x as a list, if it isn't one already. 32 | 33 | :param x: A single item or a list 34 | :param dedup: Default value = True) 35 | :returns: x if x is a list, otherwise [x]. Also flattens the list 36 | and removes Nones. 37 | 38 | """ 39 | if not isinstance(x, list): 40 | x = [x] 41 | res = flatten(x) 42 | res = [x for x in res if x is not None] 43 | if dedup: 44 | return dedup_list(res) 45 | return res 46 | 47 | 48 | def flatten(lst): 49 | """Flatten list. From https://stackoverflow.com/a/2158532/1840471. 50 | 51 | :param lst: List. 52 | :returns: Flattened version. 53 | 54 | """ 55 | for el in lst: 56 | if isinstance(el, collections.abc.Iterable) and not isinstance( 57 | el, (str, bytes) 58 | ): 59 | yield from flatten(el) 60 | else: 61 | yield el 62 | 63 | 64 | def cartesian_product(d): 65 | """Produces a DataFrame as a Cartesian product of dictionary 66 | keys and values. 67 | 68 | :param d: Dictionary where each item's key corresponds to a column 69 | name, and each value is a list of values. 70 | :returns: DataFrame with a Cartesian product of each dictionary item. 71 | 72 | """ 73 | index = pd.MultiIndex.from_product(d.values(), names=d.keys()) 74 | return pd.DataFrame(index=index).reset_index() 75 | -------------------------------------------------------------------------------- /microdf/weighted.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import warnings 4 | 5 | import microdf as mdf 6 | 7 | 8 | def weight(df, col, w=None): 9 | """Calculates the weighted value of a column in a DataFrame. 10 | 11 | :param df: A pandas DataFrame. 12 | :param col: A string indicating the column in the DataFrame to weight. 13 | Can also be a list of column strings. 14 | :param w: Weight column. 15 | :returns: A pandas Series multiplying the column by its weight. 16 | 17 | """ 18 | if w is None: 19 | return df[col] 20 | return df[col].multiply(df[w], axis="index") 21 | 22 | 23 | def weighted_sum(df, col, w=None, groupby=None): 24 | """Calculates the weighted sum of a column in a DataFrame. 25 | 26 | :param df: A pandas DataFrame. 27 | :param col: A string indicating the column in the DataFrame. 28 | Can also be a list of column strings. 29 | :param w: Weight column. 30 | :param groupby: Groupby column. 31 | :returns: The weighted sum of a DataFrame's column. 32 | 33 | """ 34 | 35 | def _weighted_sum(df, col, w): 36 | """ For weighted sum with provided weight. """ 37 | return weight(df, col, w).sum() 38 | 39 | if groupby is None: 40 | if w is None: 41 | return df[col].sum() 42 | return _weighted_sum(df, col, w) 43 | # If grouping. 44 | if w is None: 45 | return df.groupby(groupby)[col].sum() 46 | return df.groupby(groupby).apply(lambda x: _weighted_sum(x, col, w)) 47 | 48 | 49 | def weighted_mean(df, col, w=None, groupby=None): 50 | """Calculates the weighted mean of a column in a DataFrame. 51 | 52 | :param df: A pandas DataFrame. 53 | :param col: A string indicating the column in the DataFrame. 54 | Can also be a list of column strings. 55 | :param w: Weight column. 56 | :param groupby: Groupby column. 57 | :returns: The weighted mean of a DataFrame's column. 58 | 59 | """ 60 | 61 | def _weighted_mean(df, col, w=None): 62 | """ For weighted mean with provided weight. 
""" 63 | return weighted_sum(df, col, w) / df[w].sum() 64 | 65 | if groupby is None: 66 | if w is None: 67 | return df[col].mean() 68 | return _weighted_mean(df, col, w) 69 | # Group. 70 | if w is None: 71 | return df.groupby(groupby)[col].mean() 72 | return df.groupby(groupby).apply(lambda x: _weighted_mean(x, col, w)) 73 | 74 | 75 | def weighted_quantile(df: pd.DataFrame, col: str, w: str, quantiles: np.array): 76 | """Calculates weighted quantiles of a set of values. 77 | 78 | Doesn't exactly match unweighted quantiles of stacked values. 79 | See stackoverflow.com/q/21844024#comment102342137_29677616. 80 | 81 | :param df: DataFrame to calculate weighted quantiles from. 82 | :type df: pd.DataFrame 83 | :param col: Name of numeric column in df to calculate weighted quantiles 84 | from. 85 | :type col: str 86 | :param w: Name of weight column in df. 87 | :type w: str 88 | :param quantiles: Array of quantiles to calculate. 89 | :type quantiles: np.array 90 | :return: Array of weighted quantiles. 91 | :rtype: np.array 92 | """ 93 | values = np.array(df[col]) 94 | quantiles = np.array(quantiles) 95 | if w is None: 96 | sample_weight = np.ones(len(values)) 97 | else: 98 | sample_weight = np.array(df[w]) 99 | assert np.all(quantiles >= 0) and np.all( 100 | quantiles <= 1 101 | ), "quantiles should be in [0, 1]" 102 | sorter = np.argsort(values) 103 | values = values[sorter] 104 | sample_weight = sample_weight[sorter] 105 | weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight 106 | weighted_quantiles /= np.sum(sample_weight) 107 | return np.interp(quantiles, weighted_quantiles, values) 108 | 109 | 110 | def weighted_median(df, col, w=None, groupby=None): 111 | """Calculates the weighted median of a column in a DataFrame. 112 | 113 | :param df: A pandas DataFrame containing Tax-Calculator data. 114 | :param col: A string indicating the column in the DataFrame. 115 | :param w: Weight column. 116 | :returns: The weighted median of a DataFrame's column. 117 | 118 | """ 119 | 120 | def _weighted_median(df, col, w): 121 | """ For weighted median with provided weight. """ 122 | return weighted_quantile(df, col, w, 0.5) 123 | 124 | if groupby is None: 125 | if w is None: 126 | return df[col].median() 127 | return _weighted_median(df, col, w) 128 | # Group. 129 | if w is None: 130 | return df.groupby(groupby)[col].median() 131 | return df.groupby(groupby).apply(lambda x: _weighted_median(x, col, w)) 132 | 133 | 134 | def add_weighted_quantiles(df, col, w): 135 | """Adds weighted quantiles of a column to a DataFrame. 136 | This will be deprecated in the next minor release. Please use 137 | MicroSeries.rank instead. 138 | 139 | Adds columns for each of these types of quantiles to a DataFrame: 140 | * *_percentile_exact: Exact percentile. 141 | * *_percentile: Integer percentile (ceiling). 142 | * *_2percentile: Integer percentile (ceiling, for each two percentiles). 143 | * *_ventile: Integer percentile (ceiling, for each five percentiles). 144 | * *_decile: Integer decile. 145 | * *_quintile: Integer quintile. 146 | * *_quartile: Integer quartile. 147 | 148 | Negative values are assigned -1. 149 | 150 | :param df: A pandas DataFrame. 151 | :param col: A string indicating the column in the DataFrame to calculate. 152 | :param w: Weight column. 153 | :returns: Nothing. Columns are added in place. Also sorts df by col. 154 | """ 155 | warnings.warn( 156 | "This will be deprecated in the next minor release. 
" 157 | "Please use MicroSeries.rank instead.", 158 | DeprecationWarning, 159 | ) 160 | df.sort_values(by=col, inplace=True) 161 | col_pctile = col + "_percentile_exact" 162 | df[col_pctile] = 100 * df[w].cumsum() / df[w].sum() 163 | # "Null out" negatives using -1, since integer arrays can't be NaN. 164 | df[col_pctile] = np.where(df[col] >= 0, df[col_pctile], 0) 165 | # Reduce top record, otherwise it's incorrectly rounded up. 166 | df[col_pctile] = np.where( 167 | df[col_pctile] >= 99.99999, 99.99999, df[col_pctile] 168 | ) 169 | df[col + "_percentile"] = np.ceil(df[col_pctile]).astype(int) 170 | df[col + "_2percentile"] = 2 * np.ceil(df[col_pctile] / 2).astype(int) 171 | df[col + "_ventile"] = 5 * np.ceil(df[col_pctile] / 5).astype(int) 172 | df[col + "_decile"] = np.ceil(df[col_pctile] / 10).astype(int) 173 | df[col + "_quintile"] = np.ceil(df[col_pctile] / 20).astype(int) 174 | df[col + "_quartile"] = np.ceil(df[col_pctile] / 25).astype(int) 175 | 176 | 177 | def quantile_chg(df1, df2, col1, col2, w1=None, w2=None, q=None): 178 | """Create table with two sets of quantiles. 179 | 180 | :param df1: DataFrame with first set of values. 181 | :param df2: DataFrame with second set of values. 182 | :param col1: Name of columns with values in df1. 183 | :param col2: Name of columns with values in df2. 184 | :param w1: Name of weight column in df1. 185 | :param w2: Name of weight column in df2. 186 | :param q: Quantiles. Defaults to decile boundaries. 187 | :returns: DataFrame with two rows and a column for each quantile. 188 | Column labels are "xth percentile" and a label is added 189 | to the median. 190 | 191 | """ 192 | if q is None: 193 | q = np.arange(0.1, 1, 0.1) 194 | q1 = weighted_quantile(df1, col1, w1, q) 195 | q2 = weighted_quantile(df2, col2, w2, q) 196 | qdf = pd.DataFrame([q1, q2]) 197 | # Set decile labels. 198 | q_print = [mdf.ordinal_label((i * 100)) for i in q] 199 | try: # List index throws an error if the value is not found. 200 | median_index = q.tolist().index(0.5) 201 | q_print[median_index] += " (median)" 202 | except ValueError: 203 | pass # Don't assign median to any label. 204 | qdf.columns = q_print 205 | return qdf 206 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="microdf-python", 5 | version="0.4.3", 6 | description="Survey microdata as DataFrames.", 7 | url="http://github.com/PSLmodels/microdf", 8 | author="Max Ghenis", 9 | author_email="max@ubicenter.org", 10 | license="MIT", 11 | packages=["microdf"], 12 | install_requires=[ 13 | "numpy", 14 | "pandas", 15 | ], 16 | extras_require={ 17 | "taxcalc": ["taxcalc"], 18 | "charts": [ 19 | "seaborn", 20 | "matplotlib", 21 | "matplotlib-label-lines" 22 | ] 23 | }, 24 | zip_safe=False, 25 | ) 26 | --------------------------------------------------------------------------------