├── .github └── workflows │ ├── build_and_test.yml │ ├── check_jupyterbook.yml │ ├── codecov.yml │ ├── deploy_jupyterbook.yml │ └── linting.yml ├── .gitignore ├── LICENSE ├── PSL_catalog.json ├── README.md ├── ROADMAP.md ├── codecov.yml ├── docs ├── _config.yml ├── _toc.yml ├── agg.ipynb ├── charts.ipynb ├── custom_taxes.ipynb ├── demo.ipynb ├── examples.md ├── gini.ipynb ├── home.md ├── income_measures.ipynb ├── microdf_logo.png └── weighting.ipynb ├── environment.yml ├── microdf ├── __init__.py ├── _optional.py ├── agg.py ├── chart_utils.py ├── charts.py ├── concat.py ├── constants.py ├── custom_taxes.py ├── generic.py ├── income_measures.py ├── inequality.py ├── io.py ├── poverty.py ├── style.py ├── tax.py ├── taxcalc.py ├── tests │ ├── __pycache__ │ │ └── .vscode │ │ │ └── settings.json │ ├── conftest.py │ ├── test_compare.py │ ├── test_generic.py │ ├── test_inequality.py │ ├── test_io.py │ ├── test_optional_dependency.py │ ├── test_percentile_actual.csv │ ├── test_percentile_expected.csv │ ├── test_poverty.py │ ├── test_quantile_chg.py │ ├── test_tax.py │ ├── test_taxcalc.py │ ├── test_utils.py │ └── test_weighted.py ├── ubi.py ├── utils.py └── weighted.py └── setup.py /.github/workflows/build_and_test.yml: -------------------------------------------------------------------------------- 1 | name: Build and test [Python 3.9, 3.10, 3.11] 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.9", "3.10", "3.11"] 11 | 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v2 15 | with: 16 | persist-credentials: false 17 | 18 | - name: Setup Miniconda using Python ${{ matrix.python-version }} 19 | uses: conda-incubator/setup-miniconda@v2 20 | with: 21 | activate-environment: microdf 22 | environment-file: environment.yml 23 | python-version: ${{ matrix.python-version }} 24 | auto-activate-base: false 25 | 26 | - name: Build 27 | shell: bash -l {0} 28 | run: pip install -e . 29 | 30 | - name: Test 31 | shell: bash -l {0} 32 | run: pytest 33 | -------------------------------------------------------------------------------- /.github/workflows/check_jupyterbook.yml: -------------------------------------------------------------------------------- 1 | name: Test that Jupyter-Book builds 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | if: github.repository == 'PSLmodels/microdf' 6 | runs-on: ubuntu-latest 7 | steps: 8 | - name: Checkout 9 | uses: actions/checkout@v2 10 | with: 11 | persist-credentials: false 12 | 13 | - name: Setup Miniconda 14 | uses: conda-incubator/setup-miniconda@v2 15 | with: 16 | activate-environment: microdf 17 | environment-file: environment.yml 18 | python-version: 3.9 19 | auto-activate-base: false 20 | 21 | - name: Build # Build Jupyter Book 22 | shell: bash -l {0} 23 | run: | 24 | pip install -e . 25 | jb build docs/. 
26 | -------------------------------------------------------------------------------- /.github/workflows/codecov.yml: -------------------------------------------------------------------------------- 1 | name: CodeCov 2 | on: [push, pull_request] 3 | jobs: 4 | run: 5 | runs-on: ubuntu-latest 6 | env: 7 | OS: ubuntu-latest 8 | PYTHON: '3.9' 9 | steps: 10 | - uses: actions/checkout@v2 11 | with: 12 | fetch-depth: 2 13 | 14 | - name: Setup Python 15 | uses: actions/setup-python@master 16 | with: 17 | python-version: 3.9 18 | - name: Generate Report 19 | run: | 20 | pip install coverage 21 | coverage run -m unittest 22 | - name: Upload Coverage to Codecov 23 | uses: codecov/codecov-action@v1 24 | -------------------------------------------------------------------------------- /.github/workflows/deploy_jupyterbook.yml: -------------------------------------------------------------------------------- 1 | name: Build and Deploy Jupyter Book 2 | on: 3 | push: 4 | branches: 5 | - master 6 | jobs: 7 | build-and-deploy: 8 | if: github.repository == 'PSLmodels/microdf' 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout 12 | uses: actions/checkout@v2 13 | with: 14 | persist-credentials: false 15 | 16 | - name: Setup Miniconda 17 | uses: conda-incubator/setup-miniconda@v2 18 | with: 19 | activate-environment: microdf 20 | environment-file: environment.yml 21 | python-version: 3.9 22 | auto-activate-base: false 23 | 24 | - name: Build 25 | shell: bash -l {0} 26 | run: | 27 | pip install -e . 28 | jb build docs/. 29 | 30 | - name: Deploy 31 | uses: JamesIves/github-pages-deploy-action@releases/v3 32 | with: 33 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 34 | BRANCH: gh-pages # The branch the action should deploy to. 35 | FOLDER: docs/_build/html # The folder the action should deploy. 36 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | paths: 6 | - '**.py' 7 | pull_request: 8 | paths: 9 | - '**.py' 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.9] 17 | 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | with: 22 | persist-credentials: false 23 | 24 | - name: Setup Miniconda using Python ${{ matrix.python-version }} 25 | uses: conda-incubator/setup-miniconda@v2 26 | with: 27 | activate-environment: microdf 28 | environment-file: environment.yml 29 | python-version: ${{ matrix.python-version }} 30 | auto-activate-base: false 31 | 32 | - name: Lint 33 | shell: bash -l {0} 34 | run: flake8 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Setuptools distribution folder. 5 | /dist/ 6 | 7 | # Python egg metadata, regenerated from source files by setuptools. 8 | /*.egg-info 9 | 10 | .ipynb_checkpoints 11 | 12 | # Built Jupyter-Book documentation.
13 | docs/_build 14 | 15 | .vscode/settings.json 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Max Ghenis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PSL_catalog.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "microdf", 3 | "img": "https://github.com/PSLmodels/microdf/blob/master/docs/microdf_logo.png?raw=true", 4 | "banner_title": "microdf", 5 | "banner_subtitle": "Analysis tools for working with survey microdata as DataFrames", 6 | "detailed_description": "microdf is a Python package for analyzing economic microdata as pandas DataFrames, with special functions for handling sampling weights.", 7 | "policy_area": "Survey data, data analysis", 8 | "geography": "Not specific", 9 | "language": "Python", 10 | "maintainers": [ 11 | { 12 | "name": "Max Ghenis", 13 | "image": "https://policyengine.org/static/media/max-ghenis.536762d4b2439bf591f5.png", 14 | "link": "mailto:max@policyengine.org" 15 | } 16 | ], 17 | "links": { 18 | "code_repository": "https://github.com/PSLmodels/microdf", 19 | "user_documentation": "http://pslmodels.github.io/microdf/", 20 | "contributor_documentation": "", 21 | "webapp": "", 22 | "recent_changes": "https://github.com/PSLmodels/microdf/releases" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build](https://github.com/PSLmodels/microdf/workflows/Build%20and%20test%20[Python%203.9,%203.10,%203.11]/badge.svg)](https://github.com/PSLmodels/microdf/actions?query=workflow%3A%22Build+and+test+%5BPython+3.9%2C+3.10%2C+3.11%5D%22) 2 | [![Codecov](https://codecov.io/gh/PSLmodels/microdf/branch/master/graph/badge.svg)](https://codecov.io/gh/PSLmodels/microdf) 3 | 4 | # microdf 5 | Analysis tools for working with survey microdata as DataFrames. 6 | 7 | *Disclaimer: `MicroSeries` and `MicroDataFrame` are experimental features and may not consider weights after performing some operations.
See open issues.* 8 | 9 | ## Installation 10 | Install with: 11 | 12 | pip install git+https://github.com/PSLmodels/microdf.git 13 | 14 | ## Questions 15 | Contact the maintainer, Max Ghenis (mghenis@gmail.com). 16 | 17 | ## Citation 18 | You may cite the source of your analysis as "microdf release #.#.#, author's calculations." 19 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | # `microdf` roadmap 2 | 3 | `microdf` currently provides capabilities for analyzing weighted microdata, including statistics, distributional tables, graphs, and special functions for working with PSL Tax-Calculator. In the future, it will provide more functionality, including: 4 | * Charts showing distributional changes between a baseline and reform policy 5 | * Extending these charts to more than one reform 6 | * Presets for working with common datasets, e.g. suggesting the appropriate weight for SCF and CPS 7 | * Standard error calculations for surveys with replicate weight files 8 | 9 | See the [issues page](https://github.com/PSLmodels/microdf/issues) to view and suggest other items. 10 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PolicyEngine/microdf/ccf2e54e559ce7563ca9c19b144ab8d41986e1fb/codecov.yml -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings 2 | title: microdf documentation 3 | author: Max Ghenis 4 | logo: microdf_logo.png 5 | 6 | launch_buttons: 7 | colab_url: "https://colab.research.google.com" 8 | 9 | repository: 10 | url: https://github.com/PSLmodels/microdf 11 | branch: master 12 | path_to_book: docs 13 | 14 | html: 15 | use_edit_page_button : true 16 | use_repository_button : true 17 | use_issues_button : true 18 | -------------------------------------------------------------------------------- /docs/_toc.yml: -------------------------------------------------------------------------------- 1 | format: jb-article 2 | root: home 3 | sections: 4 | - file: examples 5 | sections: 6 | - file: agg 7 | - file: charts 8 | - file: custom_taxes 9 | - file: demo 10 | - file: gini 11 | - file: income_measures 12 | - file: weighting 13 | -------------------------------------------------------------------------------- /docs/agg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The `agg` function\n", 8 | "\n", 9 | "Use `agg` to see the effect of a $10,000 UBI by marital status.\n", 10 | "\n", 11 | "## Setup" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import numpy as np\n", 21 | "import pandas as pd\n", 22 | "\n", 23 | "import taxcalc as tc\n", 24 | "import microdf as mdf" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "'2.3.0'" 36 | ] 37 | }, 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "tc.__version__" 45 | ] 46 | }, 47 | { 48 | "cell_type":
"markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Load data\n", 52 | "\n", 53 | "Start with a standard `DataFrame`, then add a UBI manually in a reform copy." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "base = mdf.calc_df(group_vars=['expanded_income', 'MARS', 'XTOT'],\n", 63 | " metric_vars='aftertax_income')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "reform = base.copy(deep=True)\n", 73 | "UBI_PP = 10000\n", 74 | "reform['ubi'] = reform.XTOT * UBI_PP\n", 75 | "reform['aftertax_income'] = reform.aftertax_income + reform.ubi\n", 76 | "mdf.add_weighted_metrics(reform, 'aftertax_income')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## `agg`\n", 84 | "\n", 85 | "### Change in aftertax income by marital status." 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/html": [ 96 | "
\n", 97 | "\n", 110 | "\n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | "
aftertax_income_m_baseaftertax_income_m_reformaftertax_income_pctchg
MARS
1.03.916351e+064.939093e+060.261147
2.07.692072e+069.577865e+060.245161
4.08.531427e+051.275820e+060.495436
\n", 146 | "
" 147 | ], 148 | "text/plain": [ 149 | " aftertax_income_m_base aftertax_income_m_reform aftertax_income_pctchg\n", 150 | "MARS \n", 151 | "1.0 3.916351e+06 4.939093e+06 0.261147\n", 152 | "2.0 7.692072e+06 9.577865e+06 0.245161\n", 153 | "4.0 8.531427e+05 1.275820e+06 0.495436" 154 | ] 155 | }, 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "mdf.agg(base, reform, 'MARS', 'aftertax_income')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### Also sum baseline `expanded_income`" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 6, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/html": [ 180 | "
\n", 181 | "\n", 194 | "\n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | "
aftertax_income_m_baseexpanded_incomeaftertax_income_m_reformaftertax_income_pctchg
MARS
1.03.916351e+061.593936e+104.939093e+060.261147
2.07.692072e+066.242669e+109.577865e+060.245161
4.08.531427e+052.210208e+091.275820e+060.495436
\n", 235 | "
" 236 | ], 237 | "text/plain": [ 238 | " aftertax_income_m_base expanded_income aftertax_income_m_reform \\\n", 239 | "MARS \n", 240 | "1.0 3.916351e+06 1.593936e+10 4.939093e+06 \n", 241 | "2.0 7.692072e+06 6.242669e+10 9.577865e+06 \n", 242 | "4.0 8.531427e+05 2.210208e+09 1.275820e+06 \n", 243 | "\n", 244 | " aftertax_income_pctchg \n", 245 | "MARS \n", 246 | "1.0 0.261147 \n", 247 | "2.0 0.245161 \n", 248 | "4.0 0.495436 " 249 | ] 250 | }, 251 | "execution_count": 6, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "mdf.agg(base, reform, 'MARS', 'aftertax_income', 'expanded_income')" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "### Also sum UBI amount" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 7, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/html": [ 275 | "
\n", 276 | "\n", 289 | "\n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | "
aftertax_income_m_baseaftertax_income_m_reformubi_maftertax_income_pctchg
MARS
1.03.916351e+064.939093e+061.022742e+060.261147
2.07.692072e+069.577865e+061.885793e+060.245161
4.08.531427e+051.275820e+064.226775e+050.495436
\n", 330 | "
" 331 | ], 332 | "text/plain": [ 333 | " aftertax_income_m_base aftertax_income_m_reform ubi_m \\\n", 334 | "MARS \n", 335 | "1.0 3.916351e+06 4.939093e+06 1.022742e+06 \n", 336 | "2.0 7.692072e+06 9.577865e+06 1.885793e+06 \n", 337 | "4.0 8.531427e+05 1.275820e+06 4.226775e+05 \n", 338 | "\n", 339 | " aftertax_income_pctchg \n", 340 | "MARS \n", 341 | "1.0 0.261147 \n", 342 | "2.0 0.245161 \n", 343 | "4.0 0.495436 " 344 | ] 345 | }, 346 | "execution_count": 7, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | } 350 | ], 351 | "source": [ 352 | "mdf.add_weighted_metrics(reform, 'ubi') # Creates ubi_m = ubi * s006 / 1e6.\n", 353 | "\n", 354 | "mdf.agg(base, reform, 'MARS', 'aftertax_income', reform_metrics='ubi_m')" 355 | ] 356 | } 357 | ], 358 | "metadata": { 359 | "kernelspec": { 360 | "display_name": "Python 3", 361 | "language": "python", 362 | "name": "python3" 363 | }, 364 | "language_info": { 365 | "codemirror_mode": { 366 | "name": "ipython", 367 | "version": 3 368 | }, 369 | "file_extension": ".py", 370 | "mimetype": "text/x-python", 371 | "name": "python", 372 | "nbconvert_exporter": "python", 373 | "pygments_lexer": "ipython3", 374 | "version": "3.7.3" 375 | }, 376 | "toc": { 377 | "base_numbering": 1, 378 | "nav_menu": {}, 379 | "number_sections": true, 380 | "sideBar": true, 381 | "skip_h1_title": false, 382 | "title_cell": "Table of Contents", 383 | "title_sidebar": "Contents", 384 | "toc_cell": false, 385 | "toc_position": {}, 386 | "toc_section_display": true, 387 | "toc_window_display": false 388 | } 389 | }, 390 | "nbformat": 4, 391 | "nbformat_minor": 2 392 | } 393 | -------------------------------------------------------------------------------- /docs/custom_taxes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Custom taxes\n", 8 | "\n", 9 | "## Setup" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "ename": "ModuleNotFoundError", 19 | "evalue": "No module named 'taxcalc'", 20 | "output_type": "error", 21 | "traceback": [ 22 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 23 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 24 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mtaxcalc\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmicrodf\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mmdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 25 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'taxcalc'" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "\n", 33 | "import taxcalc as tc\n", 34 | "import microdf as mdf" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "tc.__version__" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 
50 | "## Load data\n", 51 | "\n", 52 | "Start with a `DataFrame` with `aftertax_income` and necessary ingredients of `tpc_eci`. " 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "df = mdf.calc_df(group_vars=['expanded_income', 'aftertax_income'] +\n", 62 | " mdf.ECI_REMOVE_COLS,\n", 63 | " metric_vars=['XTOT'])\n", 64 | "df.columns" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Calculate Tax Policy Center's Expanded Cash Income measure, used for the analysis." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "df['tpc_eci'] = mdf.tpc_eci(df)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "Incidence of a VAT per Tax Policy Center." 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "mdf.add_vat(df)\n", 97 | "df.columns" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "df.head() # Note these are zero because we block negative tax liability." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "df.sample(5)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "Add carbon tax and financial transaction tax." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "mdf.add_carbon_tax(df)\n", 132 | "mdf.add_ftt(df)\n", 133 | "df.columns" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df.sample(5)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "VAT with a custom amount generated." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "mdf.add_vat(df, total=500e9, name='vat2')\n", 159 | "df.columns" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "mdf.weighted_sum(df, 'vat', 's006') / 1e9" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "mdf.weighted_sum(df, 'vat2', 's006') / 1e9" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Calculate by hand using `add_custom_tax`." 
185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "mdf.add_custom_tax(df, 'tpc_eci', 'XTOT_m', 'aftertax_income', \n", 194 | " mdf.VAT_INCIDENCE, 'vat3', 1e12)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "mdf.weighted_sum(df, 'vat3', 's006') / 1e9" 204 | ] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 209 | "display_name": "Python 3", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.7.9" 224 | }, 225 | "toc": { 226 | "base_numbering": 1, 227 | "nav_menu": {}, 228 | "number_sections": true, 229 | "sideBar": true, 230 | "skip_h1_title": false, 231 | "title_cell": "Table of Contents", 232 | "title_sidebar": "Contents", 233 | "toc_cell": false, 234 | "toc_position": {}, 235 | "toc_section_display": true, 236 | "toc_window_display": false 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /docs/demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `microdf` demo" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Setup" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "\n", 26 | "import taxcalc as tc\n", 27 | "import microdf as mdf\n", 28 | "\n", 29 | "import matplotlib as mpl\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "import seaborn as sns" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "Chart options." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stderr", 48 | "output_type": "stream", 49 | "text": [ 50 | "/home/mghenis/anaconda3/lib/python3.7/site-packages/microdf/style.py:24: MatplotlibDeprecationWarning: \n", 51 | "The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. 
Use FontManager.addfont instead.\n", 52 | " fm.fontManager.ttflist += fm.createFontList([\"Roboto-Regular.ttf\"])\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "mdf.set_plot_style()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Generate data" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "base = mdf.calc_df(group_vars=['expanded_income', 'MARS'],\n", 74 | " metric_vars=['aftertax_income', 'XTOT'])" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "Index(['e02400', 'mcare_ben', 'aftertax_income', 'ssi_ben', 'expanded_income',\n", 86 | " 'snap_ben', 'vet_ben', 'housing_ben', 's006', 'other_ben', 'e02300',\n", 87 | " 'mcaid_ben', 'XTOT', 'tanf_ben', 'MARS', 'wic_ben', 'market_income',\n", 88 | " 'bens', 'tax', 's006_m', 'aftertax_income_m', 'XTOT_m'],\n", 89 | " dtype='object')" 90 | ] 91 | }, 92 | "execution_count": 4, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "base.columns" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "Define a reform that treats capital gains as ordinary income and sets the top marginal rate to 70%." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "CG_REFORM = {\n", 115 | " 'CG_nodiff': {2019: True},\n", 116 | " 'II_rt7': {2019: 0.7}\n", 117 | "}" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "reform = mdf.calc_df(reform=CG_REFORM, group_vars=['MARS'], group_n65=True, \n", 127 | " metric_vars=['aftertax_income', 'XTOT'])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 7, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "Index(['vet_ben', 's006', 'e02300', 'MARS', 'e02400', 'mcare_ben', 'ssi_ben',\n", 139 | " 'snap_ben', 'housing_ben', 'other_ben', 'aftertax_income',\n", 140 | " 'expanded_income', 'mcaid_ben', 'XTOT', 'tanf_ben', 'wic_ben',\n", 141 | " 'market_income', 'bens', 'tax', 'n65', 's006_m', 'aftertax_income_m',\n", 142 | " 'XTOT_m'],\n", 143 | " dtype='object')" 144 | ] 145 | }, 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "reform.columns" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "### Calculate senior UBI.\n", 160 | "\n", 161 | "Start with total revenue ($ billions)." 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 8, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "326.110945495585" 173 | ] 174 | }, 175 | "execution_count": 8, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "new_rev_m = base.aftertax_income_m.sum() - reform.aftertax_income_m.sum()\n", 182 | "new_rev_m / 1e3" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "How many seniors are there?" 
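, "\n", "The next cell uses `add_weighted_metrics` to create `n65_m`. Assuming it follows the same pattern as the `ubi_m` comment in the `agg` notebook (value times the `s006` weight, divided by 1e6), a rough by-hand equivalent would be:\n", "\n", "```python\n", "# Hedged sketch, assuming s006 is the tax-unit weight:\n", "n65_total_m = (reform.n65 * reform.s006).sum() / 1e6  # seniors, in millions\n", "```"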
190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 9, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "59.21619976999999" 201 | ] 202 | }, 203 | "execution_count": 9, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "mdf.add_weighted_metrics(reform, 'n65')\n", 210 | "\n", 211 | "n65_total_m = reform.n65_m.sum()\n", 212 | "n65_total_m" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Divide." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 10, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "5507.123840473106" 231 | ] 232 | }, 233 | "execution_count": 10, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "senior_ubi = new_rev_m / reform.n65_m.sum()\n", 240 | "senior_ubi" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "### Add senior UBI to `aftertax_income` and recalculate" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 11, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "reform['ubi'] = senior_ubi * reform.n65\n", 257 | "reform['aftertax_income'] = reform.aftertax_income + reform.ubi\n", 258 | "mdf.add_weighted_metrics(reform, 'aftertax_income')" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 12, 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "True" 270 | ] 271 | }, 272 | "execution_count": 12, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "np.allclose(base.aftertax_income_m.sum(), reform.aftertax_income_m.sum())" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "## Analyze\n", 286 | "\n", 287 | "Gini, FPL, distributional impact chart\n", 288 | "\n", 289 | "### Change to Gini index" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 13, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "0.5032911973267852" 301 | ] 302 | }, 303 | "execution_count": 13, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "mdf.gini(base, 'aftertax_income', 's006')" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 14, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "0.48752755152259336" 321 | ] 322 | }, 323 | "execution_count": 14, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "mdf.gini(reform, 'aftertax_income', 's006')" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "### Change to poverty rate\n", 337 | "\n", 338 | "Add federal poverty line with `mdf.fpl`." 
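, "\n", "The cells below flag the people in tax units whose after-tax income falls below the unit's poverty line, then compare weighted totals. Once the `fpl_XTOT_m` columns exist, a poverty *rate* under the same assumptions would be a sketch like:\n", "\n", "```python\n", "# Hedged sketch -- run after the flag columns below are created:\n", "poverty_rate_base = base.fpl_XTOT_m.sum() / base.XTOT_m.sum()\n", "poverty_rate_reform = reform.fpl_XTOT_m.sum() / reform.XTOT_m.sum()\n", "```"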
339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 15, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "base['fpl'] = mdf.fpl(base.XTOT)\n", 348 | "reform['fpl'] = mdf.fpl(reform.XTOT)\n", 349 | "\n", 350 | "base['fpl_XTOT_m'] = np.where(base.aftertax_income < base.fpl,\n", 351 | " base.XTOT_m, 0)\n", 352 | "reform['fpl_XTOT_m'] = np.where(reform.aftertax_income < reform.fpl,\n", 353 | " reform.XTOT_m, 0)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 16, 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "data": { 363 | "text/plain": [ 364 | "-0.022307196800575246" 365 | ] 366 | }, 367 | "execution_count": 16, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "reform.fpl_XTOT_m.sum() / base.fpl_XTOT_m.sum() - 1" 374 | ] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.7.9" 394 | }, 395 | "toc": { 396 | "base_numbering": 1, 397 | "nav_menu": {}, 398 | "number_sections": true, 399 | "sideBar": true, 400 | "skip_h1_title": false, 401 | "title_cell": "Table of Contents", 402 | "title_sidebar": "Contents", 403 | "toc_cell": false, 404 | "toc_position": {}, 405 | "toc_section_display": true, 406 | "toc_window_display": false 407 | } 408 | }, 409 | "nbformat": 4, 410 | "nbformat_minor": 2 411 | } -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | Examples 2 | ======== 3 | 4 | See these rendered Jupyter notebooks for examples of `microdf` usage. -------------------------------------------------------------------------------- /docs/gini.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `gini` example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import microdf as mdf\n", 17 | "\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "x = [-10, -1, 0, 5, 100]\n", 28 | "w = [1, 2, 3, 4, 5]\n", 29 | "df = pd.DataFrame({'x': x, 'w': w})" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Simple behavior" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "0.9617021276595745" 48 | ] 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "mdf.gini(df, 'x')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Dealing with negatives" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "This will be equivalent to `mdf.gini(pd.DataFrame({'x': [0, 0, 0, 5, 100]}))`." 
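, "\n", "In other words, `negatives='zero'` appears to clip negative values to zero before computing the Gini; the matching results below support this reading:\n", "\n", "```python\n", "# Hedged reading of negatives='zero', inferred from the equal results below:\n", "mdf.gini(df.assign(x=df.x.clip(lower=0)), 'x')  # == mdf.gini(df, 'x', negatives='zero')\n", "```"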
71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "0.780952380952381" 82 | ] 83 | }, 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "mdf.gini(df, 'x', negatives='zero')" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "0.780952380952381" 102 | ] 103 | }, 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "mdf.gini(pd.DataFrame({'x': [0, 0, 0, 5, 100]}), 'x')" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "This will be equivalent to `mdf.gini(pd.DataFrame({'x': [0, 9, 10, 15, 110]}))`." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "0.6277777777777778" 129 | ] 130 | }, 131 | "execution_count": 6, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "mdf.gini(df, 'x', negatives='shift')" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "0.6277777777777778" 149 | ] 150 | }, 151 | "execution_count": 7, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "mdf.gini(pd.DataFrame({'x': [0, 9, 10, 15, 110]}), 'x')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Dealing with weights" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 8, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "0.6800524934383202" 176 | ] 177 | }, 178 | "execution_count": 8, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "mdf.gini(df, 'x', 'w')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 9, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "0.6800524934383202" 196 | ] 197 | }, 198 | "execution_count": 9, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "mdf.gini(pd.DataFrame({'x': [-10,\n", 205 | " -1, -1,\n", 206 | " 0, 0, 0,\n", 207 | " 5, 5, 5, 5,\n", 208 | " 100, 100, 100, 100, 100]}),\n", 209 | " 'x')" 210 | ] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.7.9" 230 | }, 231 | "toc": { 232 | "base_numbering": 1, 233 | "nav_menu": {}, 234 | "number_sections": true, 235 | "sideBar": true, 236 | "skip_h1_title": false, 237 | "title_cell": "Table of Contents", 238 | "title_sidebar": "Contents", 239 | "toc_cell": false, 240 | "toc_position": {}, 241 | "toc_section_display": true, 242 | "toc_window_display": false 243 | } 244 | }, 245 | "nbformat": 4, 246 | 
"nbformat_minor": 2 247 | } 248 | -------------------------------------------------------------------------------- /docs/home.md: -------------------------------------------------------------------------------- 1 | `microdf` documentation 2 | ======================= 3 | 4 | This includes example notebooks, and in the future will also include function documentation. -------------------------------------------------------------------------------- /docs/income_measures.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Income measures\n", 8 | "\n", 9 | "## Setup" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 6, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "\n", 21 | "import taxcalc as tc\n", 22 | "import microdf as mdf" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 7, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "'2.3.0'" 34 | ] 35 | }, 36 | "execution_count": 7, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "tc.__version__" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Load data\n", 50 | "\n", 51 | "Start with a `DataFrame` with `expanded_income` and the variables in `expanded_income` excluded from `tpc_eci`." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 8, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df = mdf.calc_df(group_vars=['expanded_income', 'wic_ben', 'housing_ben', \n", 61 | " 'vet_ben', 'mcare_ben', 'mcaid_ben'],\n", 62 | " metric_vars=['XTOT'])" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Calculate `tpc_eci`." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 9, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "df['tpc_eci'] = mdf.tpc_eci(df)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 10, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/html": [ 89 | "
\n", 90 | "\n", 103 | "\n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | "
snap_benvet_benmcaid_benmcare_benaftertax_incomee02300ssi_benwic_bens006expanded_income...tanf_benother_bene02400XTOTmarket_incomebenstaxs006_mXTOT_mtpc_eci
RECID
10.000000.00.0000000.00000043371.0125040.00.000000.0250.1453636.919015...0.00.0000000.0000002.053636.9190150.00000010265.9065110.0002500.00050053636.919015
20.000000.00.0000000.00000020937.8865110.00.000000.0211.6318650.034959...0.00.0000000.0000003.018650.0349590.000000-2287.8515530.0002120.00063518650.034959
31734.129390.08211.59362713640.39061252516.1653970.03374.522390.0323.5052516.165397...0.06663.70162313227.0798161.00.00000052516.1653970.0000000.0003240.00032424999.433219
40.000000.08211.5936270.00000036857.7091880.00.000000.0186.3237764.286717...0.03906.5423680.0000002.025646.15072312118.135995906.5775290.0001860.00037329552.693091
50.000000.00.00000027280.78122363941.1582830.00.000000.0343.0863941.158283...0.00.00000035560.5532862.01099.82377462841.3345090.0000000.0003430.00068636660.377060
\n", 277 | "

5 rows × 21 columns

\n", 278 | "
" 279 | ], 280 | "text/plain": [ 281 | " snap_ben vet_ben mcaid_ben mcare_ben aftertax_income \\\n", 282 | "RECID \n", 283 | "1 0.00000 0.0 0.000000 0.000000 43371.012504 \n", 284 | "2 0.00000 0.0 0.000000 0.000000 20937.886511 \n", 285 | "3 1734.12939 0.0 8211.593627 13640.390612 52516.165397 \n", 286 | "4 0.00000 0.0 8211.593627 0.000000 36857.709188 \n", 287 | "5 0.00000 0.0 0.000000 27280.781223 63941.158283 \n", 288 | "\n", 289 | " e02300 ssi_ben wic_ben s006 expanded_income ... tanf_ben \\\n", 290 | "RECID ... \n", 291 | "1 0.0 0.00000 0.0 250.14 53636.919015 ... 0.0 \n", 292 | "2 0.0 0.00000 0.0 211.63 18650.034959 ... 0.0 \n", 293 | "3 0.0 3374.52239 0.0 323.50 52516.165397 ... 0.0 \n", 294 | "4 0.0 0.00000 0.0 186.32 37764.286717 ... 0.0 \n", 295 | "5 0.0 0.00000 0.0 343.08 63941.158283 ... 0.0 \n", 296 | "\n", 297 | " other_ben e02400 XTOT market_income bens \\\n", 298 | "RECID \n", 299 | "1 0.000000 0.000000 2.0 53636.919015 0.000000 \n", 300 | "2 0.000000 0.000000 3.0 18650.034959 0.000000 \n", 301 | "3 6663.701623 13227.079816 1.0 0.000000 52516.165397 \n", 302 | "4 3906.542368 0.000000 2.0 25646.150723 12118.135995 \n", 303 | "5 0.000000 35560.553286 2.0 1099.823774 62841.334509 \n", 304 | "\n", 305 | " tax s006_m XTOT_m tpc_eci \n", 306 | "RECID \n", 307 | "1 10265.906511 0.000250 0.000500 53636.919015 \n", 308 | "2 -2287.851553 0.000212 0.000635 18650.034959 \n", 309 | "3 0.000000 0.000324 0.000324 24999.433219 \n", 310 | "4 906.577529 0.000186 0.000373 29552.693091 \n", 311 | "5 0.000000 0.000343 0.000686 36660.377060 \n", 312 | "\n", 313 | "[5 rows x 21 columns]" 314 | ] 315 | }, 316 | "execution_count": 10, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "df.head()" 323 | ] 324 | } 325 | ], 326 | "metadata": { 327 | "kernelspec": { 328 | "display_name": "Python 3", 329 | "language": "python", 330 | "name": "python3" 331 | }, 332 | "language_info": { 333 | "codemirror_mode": { 334 | "name": "ipython", 335 | "version": 3 336 | }, 337 | "file_extension": ".py", 338 | "mimetype": "text/x-python", 339 | "name": "python", 340 | "nbconvert_exporter": "python", 341 | "pygments_lexer": "ipython3", 342 | "version": "3.7.3" 343 | }, 344 | "toc": { 345 | "base_numbering": 1, 346 | "nav_menu": {}, 347 | "number_sections": true, 348 | "sideBar": true, 349 | "skip_h1_title": false, 350 | "title_cell": "Table of Contents", 351 | "title_sidebar": "Contents", 352 | "toc_cell": false, 353 | "toc_position": {}, 354 | "toc_section_display": true, 355 | "toc_window_display": false 356 | } 357 | }, 358 | "nbformat": 4, 359 | "nbformat_minor": 2 360 | } 361 | -------------------------------------------------------------------------------- /docs/microdf_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PolicyEngine/microdf/ccf2e54e559ce7563ca9c19b144ab8d41986e1fb/docs/microdf_logo.png -------------------------------------------------------------------------------- /docs/weighting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Weighting in taxcalc_helpers\n", 8 | "\n", 9 | "## Setup" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "\n", 21 | "import taxcalc as tc\n", 22 | "import microdf 
as mdf" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "'3.0.0'" 34 | ] 35 | }, 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "tc.__version__" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Load data\n", 50 | "\n", 51 | "Start with a `DataFrame` with `nu18` and `XTOT`, and also calculate `XTOT_m`." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "Index(['s006', 'other_ben', 'snap_ben', 'aftertax_income', 'mcaid_ben',\n", 63 | " 'mcare_ben', 'ssi_ben', 'e02300', 'nu18', 'expanded_income',\n", 64 | " 'housing_ben', 'vet_ben', 'wic_ben', 'e02400', 'tanf_ben', 'XTOT',\n", 65 | " 'market_income', 'bens', 'tax', 's006_m', 'XTOT_m'],\n", 66 | " dtype='object')" 67 | ] 68 | }, 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "df = mdf.calc_df(group_vars=['nu18'], metric_vars=['XTOT'])\n", 76 | "df.columns" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "From this we can calculate the number of people and tax units by the tax unit's number of children." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/html": [ 94 | "
\n", 95 | "\n", 108 | "\n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | "
s006_mXTOT_m
nu18
0.0152.988772209.816367
1.022.68825354.115850
2.018.85994568.880292
3.07.43848134.795527
4.02.37111113.539261
5.00.7442765.015182
6.00.2161581.688063
7.00.0903320.790239
8.00.0265010.258552
9.00.0122380.134320
10.00.0071960.084201
12.00.0002650.003715
\n", 184 | "
" 185 | ], 186 | "text/plain": [ 187 | " s006_m XTOT_m\n", 188 | "nu18 \n", 189 | "0.0 152.988772 209.816367\n", 190 | "1.0 22.688253 54.115850\n", 191 | "2.0 18.859945 68.880292\n", 192 | "3.0 7.438481 34.795527\n", 193 | "4.0 2.371111 13.539261\n", 194 | "5.0 0.744276 5.015182\n", 195 | "6.0 0.216158 1.688063\n", 196 | "7.0 0.090332 0.790239\n", 197 | "8.0 0.026501 0.258552\n", 198 | "9.0 0.012238 0.134320\n", 199 | "10.0 0.007196 0.084201\n", 200 | "12.0 0.000265 0.003715" 201 | ] 202 | }, 203 | "execution_count": 4, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "df.groupby('nu18')[['s006_m', 'XTOT_m']].sum()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "What if we also want to calculate the total number of *children* by the tax unit's number of children?\n", 217 | "\n", 218 | "For this we can use `add_weighted_metrics`, the function called within `calc_df`." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 5, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "mdf.add_weighted_metrics(df, ['nu18'])" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "Now we can do the same thing as before, with the new `nu18_m` column." 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 6, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/html": [ 245 | "
\n", 246 | "\n", 259 | "\n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | "
nu18_m
nu18
0.00.000000
1.022.688253
2.037.719889
3.022.315444
4.09.484444
5.03.721381
6.01.296949
7.00.632325
8.00.212008
9.00.110139
10.00.071958
12.00.003184
\n", 321 | "
" 322 | ], 323 | "text/plain": [ 324 | " nu18_m\n", 325 | "nu18 \n", 326 | "0.0 0.000000\n", 327 | "1.0 22.688253\n", 328 | "2.0 37.719889\n", 329 | "3.0 22.315444\n", 330 | "4.0 9.484444\n", 331 | "5.0 3.721381\n", 332 | "6.0 1.296949\n", 333 | "7.0 0.632325\n", 334 | "8.0 0.212008\n", 335 | "9.0 0.110139\n", 336 | "10.0 0.071958\n", 337 | "12.0 0.003184" 338 | ] 339 | }, 340 | "execution_count": 6, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | } 344 | ], 345 | "source": [ 346 | "df.groupby('nu18')[['nu18_m']].sum()" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "We can also calculate weighted sums without adding the weighted metric." 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 7, 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "data": { 363 | "text/plain": [ 364 | "'Total children: 98M.'" 365 | ] 366 | }, 367 | "execution_count": 7, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "total_children = mdf.weighted_sum(df, 'nu18', 's006')\n", 374 | "# Fix this decimal.\n", 375 | "'Total children: ' + str(round(total_children / 1e6)) + 'M.'" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "We can also calculate the weighted mean and median." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 8, 388 | "metadata": {}, 389 | "outputs": [ 390 | { 391 | "data": { 392 | "text/plain": [ 393 | "0.4782626894263673" 394 | ] 395 | }, 396 | "execution_count": 8, 397 | "metadata": {}, 398 | "output_type": "execute_result" 399 | } 400 | ], 401 | "source": [ 402 | "mdf.weighted_mean(df, 'nu18', 's006')" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 9, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "ename": "TypeError", 412 | "evalue": "weighted_quantile() missing 1 required positional argument: 'quantiles'", 413 | "output_type": "error", 414 | "traceback": [ 415 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 416 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 417 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweighted_median\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nu18'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m's006'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 418 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/microdf/weighted.py\u001b[0m in \u001b[0;36mweighted_median\u001b[0;34m(df, col, w)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \"\"\"\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mweighted_quantile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0.5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 419 | "\u001b[0;31mTypeError\u001b[0m: weighted_quantile() missing 1 required positional argument: 'quantiles'" 
420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "mdf.weighted_median(df, 'nu18', 's006')" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "We can also look at more quantiles.\n", 432 | "\n", 433 | "*Note that weighted quantiles have a different interface.*" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "decile_bounds = np.arange(0, 1.1, 0.1)\n", 443 | "deciles = mdf.weighted_quantile(df, 'nu18', 's006', decile_bounds)\n", 444 | "pd.DataFrame(deciles, index=decile_bounds)" 445 | ] 446 | } 447 | ], 448 | "metadata": { 449 | "kernelspec": { 450 | "display_name": "Python 3", 451 | "language": "python", 452 | "name": "python3" 453 | }, 454 | "language_info": { 455 | "codemirror_mode": { 456 | "name": "ipython", 457 | "version": 3 458 | }, 459 | "file_extension": ".py", 460 | "mimetype": "text/x-python", 461 | "name": "python", 462 | "nbconvert_exporter": "python", 463 | "pygments_lexer": "ipython3", 464 | "version": "3.7.9" 465 | }, 466 | "toc": { 467 | "base_numbering": 1, 468 | "nav_menu": {}, 469 | "number_sections": true, 470 | "sideBar": true, 471 | "skip_h1_title": false, 472 | "title_cell": "Table of Contents", 473 | "title_sidebar": "Contents", 474 | "toc_cell": false, 475 | "toc_position": {}, 476 | "toc_section_display": true, 477 | "toc_window_display": false 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 2 482 | } 483 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: microdf 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - codecov 6 | - flake8 7 | - matplotlib 8 | - numpy 9 | - pandas 10 | - pip 11 | - pytest 12 | - seaborn 13 | - setuptools 14 | - pip: 15 | - jupyter-book 16 | -------------------------------------------------------------------------------- /microdf/__init__.py: -------------------------------------------------------------------------------- 1 | from .agg import agg, combine_base_reform, pctchg_base_reform 2 | from .chart_utils import dollar_format, currency_format 3 | from .charts import quantile_pct_chg_plot 4 | from .concat import concat 5 | from .constants import ( 6 | BENS, 7 | ECI_REMOVE_COLS, 8 | HOUSING_CASH_SHARE, 9 | MCAID_CASH_SHARE, 10 | MCARE_CASH_SHARE, 11 | MED_BENS, 12 | OTHER_CASH_SHARE, 13 | SNAP_CASH_SHARE, 14 | SSI_CASH_SHARE, 15 | TANF_CASH_SHARE, 16 | VET_CASH_SHARE, 17 | WIC_CASH_SHARE, 18 | ) 19 | from .custom_taxes import ( 20 | CARBON_TAX_INCIDENCE, 21 | FTT_INCIDENCE, 22 | VAT_INCIDENCE, 23 | add_carbon_tax, 24 | add_custom_tax, 25 | add_ftt, 26 | add_vat, 27 | ) 28 | from .income_measures import cash_income, market_income, tpc_eci 29 | from .inequality import ( 30 | bottom_50_pct_share, 31 | bottom_x_pct_share, 32 | gini, 33 | t10_b50, 34 | top_0_1_pct_share, 35 | top_10_pct_share, 36 | top_1_pct_share, 37 | top_50_pct_share, 38 | top_x_pct_share, 39 | ) 40 | from .io import read_stata_zip 41 | from .poverty import ( 42 | fpl, 43 | poverty_rate, 44 | deep_poverty_rate, 45 | poverty_gap, 46 | squared_poverty_gap, 47 | deep_poverty_gap, 48 | ) 49 | from .style import AXIS_COLOR, DPI, GRID_COLOR, TITLE_COLOR, set_plot_style 50 | from .tax import mtr, tax_from_mtrs 51 | from .taxcalc import ( 52 | add_weighted_metrics, 53 | calc_df, 54 | n65, 55 | recalculate, 56 | static_baseline_calc, 57 | ) 58 | from .ubi import 
ubi_or_bens
59 | from .utils import (
60 | cartesian_product,
61 | dedup_list,
62 | flatten,
63 | listify,
64 | ordinal_label,
65 | )
66 | from .weighted import (
67 | add_weighted_quantiles,
68 | quantile_chg,
69 | weight,
70 | weighted_mean,
71 | weighted_median,
72 | weighted_quantile,
73 | weighted_sum,
74 | )
75 | from .generic import MicroDataFrame, MicroSeries
76 | 
77 | name = "microdf"
78 | __version__ = "0.1.0"
79 | 
80 | __all__ = [
81 | # agg.py
82 | "combine_base_reform",
83 | "pctchg_base_reform",
84 | "agg",
85 | # chart_utils.py
86 | "dollar_format",
87 | "currency_format",
88 | # charts.py
89 | "quantile_pct_chg_plot",
90 | # concat.py
91 | "concat",
92 | # constants.py
93 | "BENS",
94 | "ECI_REMOVE_COLS",
95 | "HOUSING_CASH_SHARE",
96 | "MCAID_CASH_SHARE",
97 | "MCARE_CASH_SHARE",
98 | "MED_BENS",
99 | "OTHER_CASH_SHARE",
100 | "SNAP_CASH_SHARE",
101 | "SSI_CASH_SHARE",
102 | "TANF_CASH_SHARE",
103 | "VET_CASH_SHARE",
104 | "WIC_CASH_SHARE",
105 | # custom_taxes.py
106 | "CARBON_TAX_INCIDENCE",
107 | "FTT_INCIDENCE",
108 | "VAT_INCIDENCE",
109 | "add_custom_tax",
110 | "add_vat",
111 | "add_carbon_tax",
112 | "add_ftt",
113 | # income_measures.py
114 | "cash_income",
115 | "tpc_eci",
116 | "market_income",
117 | # inequality.py
118 | "gini",
119 | "top_x_pct_share",
120 | "bottom_x_pct_share",
121 | "bottom_50_pct_share",
122 | "top_10_pct_share",
123 | "top_1_pct_share",
124 | "top_0_1_pct_share",
125 | "top_50_pct_share",
126 | "t10_b50",
127 | # io.py
128 | "read_stata_zip",
129 | # poverty.py
130 | "fpl",
131 | "poverty_rate",
132 | "deep_poverty_rate",
133 | "poverty_gap",
134 | "squared_poverty_gap",
135 | "deep_poverty_gap",
136 | # style.py
137 | "AXIS_COLOR",
138 | "DPI",
139 | "GRID_COLOR",
140 | "TITLE_COLOR",
141 | "set_plot_style",
142 | # tax.py
143 | "mtr",
144 | "tax_from_mtrs",
145 | # taxcalc.py
146 | "static_baseline_calc",
147 | "add_weighted_metrics",
148 | "n65",
149 | "calc_df",
150 | "recalculate",
151 | # ubi.py
152 | "ubi_or_bens",
153 | # utils.py
154 | "ordinal_label",
155 | "dedup_list",
156 | "listify",
157 | "flatten",
158 | "cartesian_product",
159 | # weighted.py
160 | "weight",
161 | "weighted_sum",
162 | "weighted_mean",
163 | "weighted_quantile",
164 | "weighted_median",
165 | "add_weighted_quantiles",
166 | "quantile_chg",
167 | # generic.py
168 | "MicroSeries",
169 | "MicroDataFrame",
170 | ]
171 | 
--------------------------------------------------------------------------------
/microdf/_optional.py:
--------------------------------------------------------------------------------
1 | import distutils.version
2 | import importlib
3 | import types
4 | import warnings
5 | 
6 | 
7 | # Adapted from:
8 | # https://github.com/pandas-dev/pandas/blob/master/pandas/compat/_optional.py
9 | 
10 | VERSIONS = {
11 | "taxcalc": "2.0.0",
12 | }
13 | 
14 | 
15 | def _get_version(module: types.ModuleType) -> str:
16 | """Return a module's version string.
17 | 
18 | :param module: Module whose version to look up.
19 | :type module: types.ModuleType
20 | 
21 | """
22 | version = getattr(module, "__version__", None)
23 | if version is None:
24 | # xlrd uses a capitalized attribute name
25 | version = getattr(module, "__VERSION__", None)
26 | 
27 | if version is None:
28 | raise ImportError(f"Can't determine version for {module.__name__}")
29 | return version
30 | 
31 | 
32 | def import_optional_dependency(
33 | name: str,
34 | extra: str = "",
35 | raise_on_missing: bool = True,
36 | on_version: str = "raise",
37 | ):
38 | """Import an optional dependency.
39 | By default, if a dependency is missing an ImportError with a nice
40 | message will be raised. If a dependency is present, but too old,
41 | we raise.
42 | 
43 | :param name: The module name. This should be top-level only, so that the
44 | version may be checked.
45 | :type name: str
46 | :param extra: Additional text to include in the ImportError message.
47 | :type extra: str
48 | :param raise_on_missing: Whether to raise if the optional dependency is
49 | not found. When False and the module is not present, None is returned.
50 | :type raise_on_missing: bool, default True
51 | :param on_version: What to do when a dependency's version is too old.
52 | * raise : Raise an ImportError
53 | * warn : Warn that the version is too old. Returns None
54 | * ignore: Return the module, even if the version is too old.
55 | It's expected that users validate the version locally when using on_version="ignore".
56 | :type on_version: str {'raise', 'warn', 'ignore'}
57 | """
58 | msg = (
59 | f"Missing optional dependency '{name}'. {extra} "
60 | f"Use pip or conda to install {name}."
61 | )
62 | try:
63 | module = importlib.import_module(name)
64 | except ImportError:
65 | if raise_on_missing:
66 | raise ImportError(msg) from None
67 | else:
68 | return None
69 | 
70 | minimum_version = VERSIONS.get(name)
71 | if minimum_version:
72 | version = _get_version(module)
73 | if distutils.version.LooseVersion(version) < minimum_version:
74 | assert on_version in {"warn", "raise", "ignore"}
75 | msg = (
76 | f"microdf requires version '{minimum_version}' or newer of "
77 | f"'{name}' "
78 | f"(version '{version}' currently installed)."
79 | )
80 | if on_version == "warn":
81 | warnings.warn(msg, UserWarning)
82 | return None
83 | elif on_version == "raise":
84 | raise ImportError(msg)
85 | 
86 | return module
87 | 
--------------------------------------------------------------------------------
/microdf/agg.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from typing import Optional
3 | 
4 | import microdf as mdf
5 | 
6 | 
7 | def combine_base_reform(
8 | base: pd.DataFrame,
9 | reform: pd.DataFrame,
10 | base_cols: Optional[list],
11 | cols: Optional[list],
12 | reform_cols: Optional[list],
13 | ) -> pd.DataFrame:
14 | """Combine base and reform with certain columns.
15 | 
16 | :param base: Base DataFrame. Index must match reform.
17 | :type base: pd.DataFrame
18 | :param reform: Reform DataFrame. Index must match base.
19 | :type reform: pd.DataFrame
20 | :param base_cols: Columns in base to keep.
21 | :type base_cols: list, optional
22 | :param cols: Columns to keep from both base and reform.
23 | :type cols: list, optional
24 | :param reform_cols: Columns in reform to keep.
25 | :type reform_cols: list, optional
26 | :returns: DataFrame with columns for base ("_base") and reform ("_reform").
27 | :rtype: pd.DataFrame
28 | 
29 | """
30 | all_base_cols = mdf.listify([base_cols] + [cols])
31 | all_reform_cols = mdf.listify([reform_cols] + [cols])
32 | return base[all_base_cols].join(
33 | reform[all_reform_cols], lsuffix="_base", rsuffix="_reform"
34 | )
35 | 
36 | 
37 | def pctchg_base_reform(combined: pd.DataFrame, metric: str) -> pd.Series:
38 | """Calculates the percentage change in a metric for a combined
39 | dataset.
40 | 
41 | :param combined: Combined DataFrame with _base and _reform columns.
42 | :type combined: pd.DataFrame
43 | :param metric: String of the column to calculate the difference.
44 | Must exist as metric_m_base and metric_m_reform in combined.
45 | :type metric: str
46 | :returns: Series with percentage change.
47 | :rtype: pd.Series
48 | 
49 | """
50 | return combined[metric + "_m_reform"] / combined[metric + "_m_base"] - 1
51 | 
52 | 
53 | def agg(
54 | base: pd.DataFrame,
55 | reform: pd.DataFrame,
56 | groupby: str,
57 | metrics: list,
58 | base_metrics: Optional[list],
59 | reform_metrics: Optional[list],
60 | ) -> pd.DataFrame:
61 | """Aggregates differences between base and reform.
62 | 
63 | :param base: Base DataFrame. Index must match reform.
64 | :type base: pd.DataFrame
65 | :param reform: Reform DataFrame. Index must match base.
66 | :type reform: pd.DataFrame
67 | :param groupby: Variable in base to group on.
68 | :type groupby: str
69 | :param metrics: List of variables to aggregate and calculate the % change of.
70 | These should have associated weighted columns ending in _m in base
71 | and reform.
72 | :type metrics: list
73 | :param base_metrics: List of variables from base to sum.
74 | :type base_metrics: Optional[list]
75 | :param reform_metrics: List of variables from reform to sum.
76 | :type reform_metrics: Optional[list]
77 | :returns: DataFrame with groupby and metrics, and _pctchg metrics.
78 | :rtype: pd.DataFrame
79 | 
80 | """
81 | metrics = mdf.listify(metrics)
82 | metrics_m = [i + "_m" for i in metrics]
83 | combined = combine_base_reform(
84 | base,
85 | reform,
86 | base_cols=mdf.listify([groupby, base_metrics]),
87 | cols=mdf.listify(metrics_m),
88 | reform_cols=mdf.listify(reform_metrics),
89 | )
90 | grouped = combined.groupby(groupby).sum()
91 | for metric in metrics:
92 | grouped[metric + "_pctchg"] = pctchg_base_reform(grouped, metric)
93 | return grouped
94 | 
--------------------------------------------------------------------------------
/microdf/chart_utils.py:
--------------------------------------------------------------------------------
1 | def dollar_format(suffix=""):
2 | """Dollar formatter for matplotlib.
3 | 
4 | :param suffix: Suffix to append, e.g. 'B'. Defaults to ''.
5 | :returns: FuncFormatter.
6 | 
7 | """
8 | return currency_format(currency="USD", suffix=suffix)
9 | 
10 | 
11 | def currency_format(currency="USD", suffix=""):
12 | """Currency formatter for matplotlib.
13 | 
14 | :param currency: Name of the currency, e.g. 'USD', 'GBP'.
15 | :param suffix: Suffix to append, e.g. 'B'. Defaults to ''.
16 | :returns: FuncFormatter.
17 | 
18 | """
19 | try:
20 | import matplotlib as mpl
21 | except ImportError:
22 | raise ImportError(
23 | "The function you've called requires extra dependencies. " +
24 | "Please install microdf with the 'charts' extra by running " +
25 | "'pip install microdf[charts]'"
26 | )
27 | 
28 | prefix = {"USD": "$", "GBP": "£"}[currency]
29 | 
30 | return mpl.ticker.FuncFormatter(
31 | lambda x, _: prefix + format(int(x), ",") + suffix
32 | )
33 | 
--------------------------------------------------------------------------------
/microdf/charts.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | import microdf as mdf
4 | 
5 | 
6 | def quantile_pct_chg_plot(df1, df2, col1, col2, w1=None, w2=None, q=None):
7 | """Create stem plot with percent change in decile boundaries.
8 | 
9 | :param df1: DataFrame with first set of values.
10 | :param df2: DataFrame with second set of values.
11 | :param col1: Name of column with values in df1.
12 | :param col2: Name of column with values in df2.
13 | :param w1: Name of weight column in df1.
14 | :param w2: Name of weight column in df2.
15 | :param q: Quantiles.
Defaults to decile boundaries.
16 | :returns: Axis.
17 | 
18 | """
19 | try:
20 | import seaborn as sns
21 | import matplotlib as mpl
22 | import matplotlib.pyplot as plt
23 | except ImportError:
24 | raise ImportError(
25 | "The function you've called requires extra dependencies. " +
26 | "Please install microdf with the 'charts' extra by running " +
27 | "'pip install microdf[charts]'"
28 | )
29 | 
30 | if q is None:
31 | q = np.arange(0.1, 1, 0.1)
32 | # Calculate weighted quantiles.
33 | df = mdf.quantile_chg(df1, df2, col1, col2, w1, w2, q).transpose()
34 | # Prepare dataset for plotting.
35 | df.columns = ["base", "reform"]
36 | df["pct_chg"] = df.reform / df.base - 1
37 | # Multiply by 100 pending github.com/matplotlib/matplotlib/issues/17113
38 | df.pct_chg *= 100
39 | df["index_newline"] = np.where(
40 | df.index == "50th (median)", "50th\n(median)", df.index
41 | )
42 | # Plot.
43 | fig, ax = plt.subplots()
44 | markerline, stemlines, baseline = ax.stem(
45 | df.index_newline, df.pct_chg
46 | )
47 | plt.setp(baseline, color="gray", linewidth=0)
48 | ax.yaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))
49 | ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(xmax=100))
50 | plt.title("Change to percentiles", loc="left")
51 | plt.ylabel("Change at the percentile boundary")
52 | plt.xlabel("Percentile")
53 | sns.despine(left=True, bottom=True)
54 | ax.grid(color=mdf.GRID_COLOR, axis="y")
55 | plt.xticks(rotation=0)
56 | return ax
57 | 
--------------------------------------------------------------------------------
/microdf/concat.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import inspect
3 | import microdf as mdf
4 | 
5 | 
6 | def concat(*args, **kwargs):
7 | """Concatenates MicroDataFrame objects, preserving weights.
8 | If concatenating horizontally, the first set of weights is used.
9 | All args and kwargs are passed to pd.concat.
10 | 
11 | :return: MicroDataFrame with concatenated weights.
12 | :rtype: mdf.MicroDataFrame
13 | """
14 | # Extract args with respect to pd.concat.
15 | pd_args = inspect.getcallargs(pd.concat, *args, **kwargs)
16 | objs = pd_args["objs"]
17 | axis = pd_args["axis"]
18 | # Create result, starting with pd.concat.
19 | res = mdf.MicroDataFrame(pd.concat(*args, **kwargs))
20 | # Assign weights depending on axis.
21 | if axis == 0:
22 | res.weights = pd.concat([obj.weights for obj in objs])
23 | else:
24 | # If concatenating horizontally, use the first set of weights.
25 | res.weights = objs[0].weights
26 | return res
27 | 
--------------------------------------------------------------------------------
/microdf/constants.py:
--------------------------------------------------------------------------------
1 | # Constants for share of each benefit that is cash.
2 | HOUSING_CASH_SHARE = 0.0
3 | MCAID_CASH_SHARE = 0.0
4 | MCARE_CASH_SHARE = 0.0
5 | # https://github.com/open-source-economics/taxdata/issues/148
6 | # https://docs.google.com/spreadsheets/d/1g_YdFd5idgLL764G0pZBiBnIlnCBGyxBmapXCOZ1OV4
7 | OTHER_CASH_SHARE = 0.35
8 | SNAP_CASH_SHARE = 0.0
9 | SSI_CASH_SHARE = 1.0
10 | TANF_CASH_SHARE = 0.25
11 | # https://github.com/open-source-economics/C-TAM/issues/62.
12 | VET_CASH_SHARE = 0.48
13 | WIC_CASH_SHARE = 0.0
14 | 
15 | # Columns to remove from expanded_income to approximate TPC's Expanded Cash
16 | # Income.
17 | ECI_REMOVE_COLS = [
18 | "wic_ben",
19 | "housing_ben",
20 | "vet_ben",
21 | "mcare_ben",
22 | "mcaid_ben",
23 | ]
24 | 
25 | # Benefits.
26 | BENS = [ 27 | "housing_ben", 28 | "mcaid_ben", 29 | "mcare_ben", 30 | "vet_ben", 31 | "other_ben", 32 | "snap_ben", 33 | "ssi_ben", 34 | "tanf_ben", 35 | "wic_ben", 36 | "e02400", # Social Security (OASDI). 37 | "e02300", # Unemployment insurance. 38 | ] 39 | 40 | MED_BENS = ["mcaid_ben", "mcare_ben", "vet_ben"] 41 | -------------------------------------------------------------------------------- /microdf/custom_taxes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions and data for estimating taxes outside the income tax system. 3 | Examples include value added tax, financial transaction tax, and carbon tax. 4 | """ 5 | 6 | import microdf as mdf 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | 12 | # Source: 13 | # https://www.taxpolicycenter.org/briefing-book/who-would-bear-burden-vat 14 | VAT_INCIDENCE = pd.Series( 15 | index=[-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99, 99.9], 16 | data=[3.9, 3.9, 3.6, 3.6, 3.6, 3.6, 3.6, 3.4, 3.4, 3.2, 2.8, 2.5, 2.5], 17 | ) 18 | VAT_INCIDENCE /= 100 19 | 20 | # Source: Table 5 in 21 | # https://www.treasury.gov/resource-center/tax-policy/tax-analysis/Documents/WP-115.pdf 22 | CARBON_TAX_INCIDENCE = pd.Series( 23 | index=[-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99, 99.9], 24 | data=[0.8, 1.2, 1.4, 1.5, 1.6, 1.7, 1.8, 1.8, 1.8, 1.8, 1.6, 1.4, 0.7], 25 | ) 26 | CARBON_TAX_INCIDENCE /= 100 27 | 28 | # Source: Figure 1 in 29 | # https://www.taxpolicycenter.org/sites/default/files/alfresco/publication-pdfs/2000587-financial-transaction-taxes.pdf 30 | FTT_INCIDENCE = pd.Series( 31 | index=[-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99, 99.9], 32 | data=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.8, 1.0], 33 | ) 34 | FTT_INCIDENCE /= 100 35 | 36 | 37 | def add_custom_tax( 38 | df, 39 | segment_income, 40 | w, 41 | base_income, 42 | incidence, 43 | name, 44 | total=None, 45 | ratio=None, 46 | verbose=True, 47 | ): 48 | """Add a custom tax based on incidence analysis driven by percentiles. 49 | 50 | :param df: DataFrame. 51 | :param segment_income: Income measure used to segment tax units into 52 | quantiles. 53 | :param w: Weight used to segment into quantiles (either s006 or XTOT_m). 54 | :param base_income: Income measure by which incidence is multiplied to 55 | estimate liability. 56 | :param incidence: pandas Series indexed on the floor of an income 57 | percentile, with values for the tax rate. 58 | :param name: Name of the column to add. 59 | :param total: Total amount the tax should generate. If not provided, 60 | liabilities are calculated only based on the incidence schedule. 61 | (Default value = None) 62 | :param ratio: Ratio to adjust the tax by, compared to the original tax. 63 | This acts as a multiplier for the incidence argument. 64 | (Default value = None) 65 | :param verbose: Whether to print the tax adjustment factor if needed. 66 | Defaults to True. 67 | :returns: Nothing. Adds the column name to df representing the tax 68 | liability. df is also sorted by segment_income. 69 | 70 | """ 71 | if ratio is not None: 72 | incidence = incidence * ratio 73 | assert total is None, "ratio and total cannot both be provided." 74 | df.sort_values(segment_income, inplace=True) 75 | income_percentile = 100 * df[w].cumsum() / df[w].sum() 76 | tu_incidence = incidence.iloc[ 77 | pd.cut( 78 | income_percentile, 79 | # Add a right endpoint. Should be 100 but sometimes a decimal 80 | # gets added. 
81 | bins=incidence.index.tolist() + [101],
82 | labels=False,
83 | )
84 | ].values
85 | df[name] = np.maximum(0, tu_incidence * df[base_income])
86 | if total is not None:
87 | initial_total = mdf.weighted_sum(df, name, "s006")
88 | if verbose:
89 | print(
90 | "Multiplying tax by "
91 | + str(round(total / initial_total, 2))
92 | + "."
93 | )
94 | df[name] *= total / initial_total
95 | 
96 | 
97 | def add_vat(
98 | df,
99 | segment_income="tpc_eci",
100 | w="XTOT_m",
101 | base_income="aftertax_income",
102 | incidence=VAT_INCIDENCE,
103 | name="vat",
104 | **kwargs
105 | ):
106 | """Add value added tax based on incidence estimate from Tax Policy Center.
107 | 
108 | :param df: DataFrame with columns for tpc_eci, XTOT_m, and aftertax_income.
109 | Other arguments are passed to add_custom_tax() with VAT defaults.
110 | :param segment_income: (Default value = "tpc_eci")
111 | :param w: (Default value = "XTOT_m")
112 | :param base_income: (Default value = "aftertax_income")
113 | :param incidence: (Default value = VAT_INCIDENCE)
114 | :param name: (Default value = "vat")
115 | :param **kwargs: Other arguments passed to add_custom_tax().
116 | :returns: Nothing. Adds vat to df.
117 | df is also sorted by tpc_eci.
118 | 
119 | """
120 | add_custom_tax(
121 | df, segment_income, w, base_income, incidence, name, **kwargs
122 | )
123 | 
124 | 
125 | def add_carbon_tax(
126 | df,
127 | segment_income="tpc_eci",
128 | w="XTOT_m",
129 | base_income="aftertax_income",
130 | incidence=CARBON_TAX_INCIDENCE,
131 | name="carbon_tax",
132 | **kwargs
133 | ):
134 | """Add carbon tax based on incidence estimate from the US Treasury
135 | Department.
136 | 
137 | :param df: DataFrame with columns for tpc_eci, XTOT_m, and aftertax_income.
138 | Other arguments are passed to add_custom_tax() with carbon tax defaults.
139 | :param segment_income: (Default value = "tpc_eci")
140 | :param w: (Default value = "XTOT_m")
141 | :param base_income: (Default value = "aftertax_income")
142 | :param incidence: (Default value = CARBON_TAX_INCIDENCE)
143 | :param name: (Default value = "carbon_tax")
144 | :param **kwargs: Other arguments passed to add_custom_tax().
145 | :returns: Nothing. Adds carbon_tax to df.
146 | df is also sorted by tpc_eci.
147 | 
148 | """
149 | add_custom_tax(
150 | df, segment_income, w, base_income, incidence, name, **kwargs
151 | )
152 | 
153 | 
154 | def add_ftt(
155 | df,
156 | segment_income="tpc_eci",
157 | w="XTOT_m",
158 | base_income="aftertax_income",
159 | incidence=FTT_INCIDENCE,
160 | name="ftt",
161 | **kwargs
162 | ):
163 | """Add financial transaction tax based on incidence estimate from Tax
164 | Policy Center.
165 | 
166 | :param df: DataFrame with columns for tpc_eci, XTOT_m, and aftertax_income.
167 | Other arguments are passed to add_custom_tax() with FTT defaults.
168 | :param segment_income: (Default value = "tpc_eci")
169 | :param w: (Default value = "XTOT_m")
170 | :param base_income: (Default value = "aftertax_income")
171 | :param incidence: (Default value = FTT_INCIDENCE)
172 | :param name: (Default value = "ftt")
173 | :param **kwargs: Other arguments passed to add_custom_tax().
174 | :returns: Nothing. Adds ftt to df.
175 | df is also sorted by tpc_eci.
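Example (a sketch, not from the package docs; assumes df comes from a
Tax-Calculator run, so the tpc_eci, XTOT_m, and aftertax_income
columns exist):

    mdf.add_ftt(df)              # liability from the TPC incidence schedule
    mdf.add_ftt(df, total=75e9)  # hypothetical: rescale to raise $75B total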
176 | 177 | """ 178 | add_custom_tax( 179 | df, segment_income, w, base_income, incidence, name, **kwargs 180 | ) 181 | -------------------------------------------------------------------------------- /microdf/generic.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Union 2 | from functools import wraps 3 | import warnings 4 | import copy 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | class MicroSeries(pd.Series): 10 | def __init__(self, *args, weights: np.array = None, **kwargs): 11 | """A Series-inheriting class for weighted microdata. 12 | Weights can be provided at initialisation, or using set_weights. 13 | 14 | :param weights: Array of weights. 15 | :type weights: np.array 16 | """ 17 | super().__init__(*args, **kwargs) 18 | self.set_weights(weights) 19 | 20 | def weighted_function(fn: Callable) -> Callable: 21 | @wraps(fn) 22 | def safe_fn(*args, **kwargs): 23 | try: 24 | return fn(*args, **kwargs) 25 | except ZeroDivisionError: 26 | return np.NaN 27 | 28 | return safe_fn 29 | 30 | @weighted_function 31 | def scalar_function(fn: Callable) -> Callable: 32 | fn._rtype = float 33 | return fn 34 | 35 | @weighted_function 36 | def vector_function(fn: Callable) -> Callable: 37 | fn._rtype = pd.Series 38 | return fn 39 | 40 | def set_weights(self, weights: np.array) -> None: 41 | """Sets the weight values. 42 | 43 | :param weights: Array of weights. 44 | :type weights: np.array. 45 | """ 46 | if weights is None: 47 | self.weights = pd.Series(np.ones_like(self.values), dtype=float) 48 | else: 49 | self.weights = pd.Series(weights, dtype=float) 50 | 51 | @vector_function 52 | def weight(self) -> pd.Series: 53 | """Calculates the weighted value of the MicroSeries. 54 | 55 | :returns: A Series multiplying the MicroSeries by its weight. 56 | :rtype: pd.Series 57 | """ 58 | return self.multiply(self.weights) 59 | 60 | @scalar_function 61 | def sum(self) -> float: 62 | """Calculates the weighted sum of the MicroSeries. 63 | 64 | :returns: The weighted sum. 65 | :rtype: float 66 | """ 67 | return self.multiply(self.weights).sum() 68 | 69 | @scalar_function 70 | def count(self) -> float: 71 | """Calculates the weighted count of the MicroSeries. 72 | 73 | :returns: The weighted count. 74 | """ 75 | return self.weights.sum() 76 | 77 | @scalar_function 78 | def mean(self) -> float: 79 | """Calculates the weighted mean of the MicroSeries 80 | 81 | :returns: The weighted mean. 82 | :rtype: float 83 | """ 84 | return np.average(self.values, weights=self.weights) 85 | 86 | def quantile(self, q: np.array) -> pd.Series: 87 | """Calculates weighted quantiles of the MicroSeries. 88 | 89 | Doesn't exactly match unweighted quantiles of stacked values. 90 | See stackoverflow.com/q/21844024#comment102342137_29677616. 91 | 92 | :param q: Array of quantiles to calculate. 93 | :type q: np.array 94 | 95 | :return: Array of weighted quantiles. 
96 | :rtype: pd.Series
97 | """
98 | values = np.array(self.values)
99 | quantiles = np.array(q)
100 | sample_weight = np.array(self.weights)
101 | assert np.all(quantiles >= 0) and np.all(
102 | quantiles <= 1
103 | ), "quantiles should be in [0, 1]"
104 | sorter = np.argsort(values)
105 | values = values[sorter]
106 | sample_weight = sample_weight[sorter]
107 | weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
108 | weighted_quantiles /= np.sum(sample_weight)
109 | result = np.interp(quantiles, weighted_quantiles, values)
110 | if quantiles.shape == ():
111 | return result
112 | return pd.Series(result, index=quantiles)
113 | 
114 | @scalar_function
115 | def median(self) -> float:
116 | """Calculates the weighted median of the MicroSeries.
117 | 
118 | :returns: The weighted median of the MicroSeries.
119 | :rtype: float
120 | """
121 | return self.quantile(0.5)
122 | 
123 | @scalar_function
124 | def gini(self, negatives: str = None) -> float:
125 | """Calculates Gini index.
126 | 
127 | :param negatives: An optional string indicating how to treat negative
128 | values of x:
129 | 'zero' replaces negative values with zeroes.
130 | 'shift' subtracts the minimum value from all values of x,
131 | when this minimum is negative. That is, it adds the absolute
132 | minimum value.
133 | Defaults to None, which leaves negative values as they are.
134 | :type negatives: str
135 | :returns: Gini index.
136 | :rtype: float
137 | """
138 | x = np.array(self).astype("float")
139 | if negatives == "zero":
140 | x[x < 0] = 0
141 | if negatives == "shift" and np.amin(x) < 0:
142 | x -= np.amin(x)
143 | if (self.weights != np.ones(len(self))).any():  # Varying weights.
144 | sorted_indices = np.argsort(self)
145 | sorted_x = np.array(self[sorted_indices])
146 | sorted_w = np.array(self.weights[sorted_indices])
147 | cumw = np.cumsum(sorted_w)
148 | cumxw = np.cumsum(sorted_x * sorted_w)
149 | return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / (
150 | cumxw[-1] * cumw[-1]
151 | )
152 | else:
153 | sorted_x = np.sort(self)
154 | n = len(x)
155 | cumxw = np.cumsum(sorted_x)
156 | # The above formula, with all weights equal to 1 simplifies to:
157 | return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n
158 | 
159 | @scalar_function
160 | def top_x_pct_share(self, top_x_pct: float) -> float:
161 | """Calculates top x% share.
162 | 
163 | :param top_x_pct: Decimal between 0 and 1 of the top %, e.g. 0.1,
164 | 0.001.
165 | :type top_x_pct: float
166 | :returns: The weighted share held by the top x%.
167 | :rtype: float
168 | """
169 | threshold = self.quantile(1 - top_x_pct)
170 | top_x_pct_sum = self[self >= threshold].sum()
171 | total_sum = self.sum()
172 | return top_x_pct_sum / total_sum
173 | 
174 | @scalar_function
175 | def bottom_x_pct_share(self, bottom_x_pct) -> float:
176 | """Calculates bottom x% share.
177 | 
178 | :param bottom_x_pct: Decimal between 0 and 1 of the bottom %, e.g. 0.1,
179 | 0.001.
180 | :type bottom_x_pct: float
181 | :returns: The weighted share held by the bottom x%.
182 | :rtype: float
183 | """
184 | return 1 - self.top_x_pct_share(1 - bottom_x_pct)
185 | 
186 | @scalar_function
187 | def bottom_50_pct_share(self) -> float:
188 | """Calculates bottom 50% share.
189 | 
190 | :returns: The weighted share held by the bottom 50%.
191 | :rtype: float
192 | """
193 | return self.bottom_x_pct_share(0.5)
194 | 
195 | @scalar_function
196 | def top_50_pct_share(self) -> float:
197 | """Calculates top 50% share.
198 | 
199 | :returns: The weighted share held by the top 50%.
200 | :rtype: float
201 | """
202 | return self.top_x_pct_share(0.5)
203 | 
204 | @scalar_function
205 | def top_10_pct_share(self) -> float:
206 | """Calculates top 10% share.
207 | 
208 | :returns: The weighted share held by the top 10%.
209 | :rtype: float
210 | """
211 | return self.top_x_pct_share(0.1)
212 | 
213 | @scalar_function
214 | def top_1_pct_share(self) -> float:
215 | """Calculates top 1% share.
216 | 
217 | :returns: The weighted share held by the top 1%.
218 | :rtype: float
219 | """
220 | return self.top_x_pct_share(0.01)
221 | 
222 | @scalar_function
223 | def top_0_1_pct_share(self) -> float:
224 | """Calculates top 0.1% share.
225 | 
226 | :returns: The weighted share held by the top 0.1%.
227 | :rtype: float
228 | """
229 | return self.top_x_pct_share(0.001)
230 | 
231 | @scalar_function
232 | def t10_b50(self) -> float:
233 | """Calculates ratio between the top 10% and bottom 50% shares.
234 | 
235 | :returns: The weighted share held by the top 10% divided by
236 | the weighted share held by the bottom 50%.
237 | 
238 | """
239 | t10 = self.top_10_pct_share()
240 | b50 = self.bottom_50_pct_share()
241 | return t10 / b50
242 | 
243 | @vector_function
244 | def cumsum(self) -> pd.Series:
245 | return pd.Series(self * self.weights).cumsum()
246 | 
247 | @vector_function
248 | def rank(self, pct=False) -> pd.Series:
249 | order = np.argsort(self.values)
250 | inverse_order = np.argsort(order)
251 | ranks = np.array(self.weights.values)[order].cumsum()[inverse_order]
252 | if pct:
253 | ranks /= self.weights.values.sum()
254 | # Clip ranks above 1; the original bare np.where call had no effect.
ranks = np.where(ranks > 1.0, 1.0, ranks)
255 | return pd.Series(ranks, index=self.index)
256 | 
257 | @vector_function
258 | def decile_rank(self):
259 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 10), 10))
260 | 
261 | @vector_function
262 | def quintile_rank(self):
263 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 5), 5))
264 | 
265 | @vector_function
266 | def quartile_rank(self):
267 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 4), 4))
268 | 
269 | @vector_function
270 | def percentile_rank(self):
271 | return MicroSeries(np.minimum(np.ceil(self.rank(pct=True) * 100), 100))
272 | 
273 | def groupby(self, *args, **kwargs):
274 | gb = super().groupby(*args, **kwargs)
275 | gb.__class__ = MicroSeriesGroupBy
276 | gb._init()
277 | gb.weights = pd.Series(self.weights).groupby(*args, **kwargs)
278 | return gb
279 | 
280 | def copy(self, deep=True):
281 | res = super().copy(deep)
282 | res = MicroSeries(res, weights=self.weights.copy(deep))
283 | return res
284 | 
285 | def equals(self, other) -> bool:
286 | equal_values = super().equals(other)
287 | equal_weights = self.weights.equals(other.weights)
288 | return equal_values and equal_weights
289 | 
290 | def __getitem__(self, key):
291 | result = super().__getitem__(key)
292 | if isinstance(result, pd.Series):
293 | weights = self.weights.__getitem__(key)
294 | return MicroSeries(result, weights=weights)
295 | return result
296 | 
297 | def __getattr__(self, name):
298 | return MicroSeries(super().__getattr__(name), weights=self.weights)
299 | 
300 | # operators
301 | 
302 | def __add__(self, other):
303 | return MicroSeries(super().__add__(other), weights=self.weights)
304 | 
305 | def __sub__(self, other):
306 | return MicroSeries(super().__sub__(other), weights=self.weights)
307 | 
308 | def __mul__(self, other):
309 | return MicroSeries(super().__mul__(other), weights=self.weights)
310 | 
311 | def __floordiv__(self, other):
312 | return
MicroSeries(super().__floordiv__(other), weights=self.weights)
313 | 
314 | def __truediv__(self, other):
315 | return MicroSeries(super().__truediv__(other), weights=self.weights)
316 | 
317 | def __mod__(self, other):
318 | return MicroSeries(super().__mod__(other), weights=self.weights)
319 | 
320 | def __pow__(self, other):
321 | return MicroSeries(super().__pow__(other), weights=self.weights)
322 | 
323 | # comparators
324 | 
325 | def __lt__(self, other):
326 | return MicroSeries(super().__lt__(other), weights=self.weights)
327 | 
328 | def __le__(self, other):
329 | return MicroSeries(super().__le__(other), weights=self.weights)
330 | 
331 | def __eq__(self, other):
332 | return MicroSeries(super().__eq__(other), weights=self.weights)
333 | 
334 | def __ne__(self, other):
335 | return MicroSeries(super().__ne__(other), weights=self.weights)
336 | 
337 | def __ge__(self, other):
338 | return MicroSeries(super().__ge__(other), weights=self.weights)
339 | 
340 | def __gt__(self, other):
341 | return MicroSeries(super().__gt__(other), weights=self.weights)
342 | 
343 | # assignment operators
344 | 
345 | def __iadd__(self, other):
346 | return MicroSeries(super().__iadd__(other), weights=self.weights)
347 | 
348 | def __isub__(self, other):
349 | return MicroSeries(super().__isub__(other), weights=self.weights)
350 | 
351 | def __imul__(self, other):
352 | return MicroSeries(super().__imul__(other), weights=self.weights)
353 | 
354 | def __ifloordiv__(self, other):
355 | return MicroSeries(super().__ifloordiv__(other), weights=self.weights)
356 | 
357 | def __idiv__(self, other):
358 | return MicroSeries(super().__idiv__(other), weights=self.weights)
359 | 
360 | def __itruediv__(self, other):
361 | return MicroSeries(super().__itruediv__(other), weights=self.weights)
362 | 
363 | def __imod__(self, other):
364 | return MicroSeries(super().__imod__(other), weights=self.weights)
365 | 
366 | def __ipow__(self, other):
367 | return MicroSeries(super().__ipow__(other), weights=self.weights)
368 | 
369 | # other
370 | 
371 | def __neg__(self):  # Unary operator: takes no operand.
372 | return MicroSeries(super().__neg__(), weights=self.weights)
373 | 
374 | def __pos__(self):  # Unary operator: takes no operand.
375 | return MicroSeries(super().__pos__(), weights=self.weights)
376 | 
377 | def __repr__(self):
378 | return pd.DataFrame(
379 | dict(value=self.values, weight=self.weights.values)
380 | ).__repr__()
381 | 
382 | 
383 | MicroSeries.SCALAR_FUNCTIONS = [
384 | fn
385 | for fn in dir(MicroSeries)
386 | if "_rtype" in dir(getattr(MicroSeries, fn))
387 | and getattr(getattr(MicroSeries, fn), "_rtype") == float
388 | ]
389 | MicroSeries.VECTOR_FUNCTIONS = [
390 | fn
391 | for fn in dir(MicroSeries)
392 | if "_rtype" in dir(getattr(MicroSeries, fn))
393 | and getattr(getattr(MicroSeries, fn), "_rtype") == pd.Series
394 | ]
395 | MicroSeries.AGNOSTIC_FUNCTIONS = ["quantile"]
396 | MicroSeries.FUNCTIONS = sum(
397 | [
398 | MicroSeries.SCALAR_FUNCTIONS,
399 | MicroSeries.VECTOR_FUNCTIONS,
400 | MicroSeries.AGNOSTIC_FUNCTIONS,
401 | ],
402 | [],
403 | )
404 | 
405 | 
406 | class MicroSeriesGroupBy(pd.core.groupby.generic.SeriesGroupBy):
407 | def _init(self):
408 | def _weighted_agg(name) -> Callable:
409 | def via_micro_series(row, *args, **kwargs):
410 | return getattr(MicroSeries(row.a, weights=row.w), name)(
411 | *args, **kwargs
412 | )
413 | 
414 | fn = getattr(MicroSeries, name)
415 | 
416 | @wraps(fn)
417 | def _weighted_agg_fn(*args, **kwargs):
418 | arrays = self.apply(np.array)
419 | weights = self.weights.apply(np.array)
420 | df =
pd.DataFrame(dict(a=arrays, w=weights)) 421 | is_array = len(args) > 0 and hasattr(args[0], "__len__") 422 | if ( 423 | name in MicroSeries.SCALAR_FUNCTIONS 424 | or name in MicroSeries.AGNOSTIC_FUNCTIONS 425 | and not is_array 426 | ): 427 | result = df.agg( 428 | lambda row: via_micro_series(row, *args, **kwargs), 429 | axis=1, 430 | ) 431 | elif ( 432 | name in MicroSeries.VECTOR_FUNCTIONS 433 | or name in MicroSeries.AGNOSTIC_FUNCTIONS 434 | and is_array 435 | ): 436 | result = df.apply( 437 | lambda row: via_micro_series(row, *args, **kwargs), 438 | axis=1, 439 | ) 440 | return result.stack() 441 | return result 442 | 443 | return _weighted_agg_fn 444 | 445 | for fn_name in MicroSeries.FUNCTIONS: 446 | setattr(self, fn_name, _weighted_agg(fn_name)) 447 | 448 | 449 | class MicroDataFrameGroupBy(pd.core.groupby.generic.DataFrameGroupBy): 450 | def _init(self, by: Union[str, list]): 451 | self.columns = list(self.obj.columns) 452 | if isinstance(by, list): 453 | for column in by: 454 | self.columns.remove(column) 455 | elif isinstance(by, str): 456 | self.columns.remove(by) 457 | self.columns.remove("__tmp_weights") 458 | for fn_name in MicroSeries.SCALAR_FUNCTIONS: 459 | 460 | def get_fn(name): 461 | def fn(*args, **kwargs): 462 | return MicroDataFrame( 463 | { 464 | col: getattr(getattr(self, col), name)( 465 | *args, **kwargs 466 | ) 467 | for col in self.columns 468 | } 469 | ) 470 | 471 | return fn 472 | 473 | setattr(self, fn_name, get_fn(fn_name)) 474 | for fn_name in MicroSeries.VECTOR_FUNCTIONS: 475 | 476 | def get_fn(name): 477 | def fn(*args, **kwargs): 478 | return MicroDataFrame( 479 | { 480 | col: getattr(getattr(self, col), name)( 481 | *args, **kwargs 482 | ) 483 | for col in self.columns 484 | } 485 | ) 486 | 487 | return fn 488 | 489 | setattr(self, fn_name, get_fn(fn_name)) 490 | 491 | 492 | class MicroDataFrame(pd.DataFrame): 493 | def __init__(self, *args, weights=None, **kwargs): 494 | """A DataFrame-inheriting class for weighted microdata. 495 | Weights can be provided at initialisation, or using set_weights or 496 | set_weight_col. 497 | 498 | :param weights: Array of weights. 499 | :type weights: np.array 500 | """ 501 | super().__init__(*args, **kwargs) 502 | self.weights = None 503 | self.set_weights(weights) 504 | self._link_all_weights() 505 | self.override_df_functions() 506 | 507 | def override_df_functions(self): 508 | for name in MicroSeries.FUNCTIONS: 509 | 510 | def get_fn(name): 511 | def fn(*args, **kwargs): 512 | is_array = len(args) > 0 and hasattr(args[0], "__len__") 513 | if ( 514 | name in MicroSeries.SCALAR_FUNCTIONS 515 | or name in MicroSeries.AGNOSTIC_FUNCTIONS 516 | and not is_array 517 | ): 518 | results = pd.Series( 519 | [ 520 | getattr(self[col], name)(*args, **kwargs) 521 | for col in self.columns 522 | ] 523 | ) 524 | results.index = self.columns 525 | return results 526 | elif ( 527 | name in MicroSeries.VECTOR_FUNCTIONS 528 | or name in MicroSeries.AGNOSTIC_FUNCTIONS 529 | and is_array 530 | ): 531 | results = pd.DataFrame( 532 | [ 533 | getattr(self[col], name)(*args, **kwargs) 534 | for col in self.columns 535 | ] 536 | ) 537 | results.index = self.columns 538 | return results 539 | 540 | return fn 541 | 542 | setattr(self, name, get_fn(name)) 543 | 544 | def get_args_as_micro_series(*kwarg_names: tuple) -> Callable: 545 | """Decorator for auto-parsing column names into MicroSeries objects. 546 | If given, kwarg_names limits arguments checked to keyword arguments 547 | specified. 
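For example (a sketch of the intended behavior, with hypothetical
column names): in a call like df.poverty_rate("income", "threshold"),
each string is replaced by the matching MicroSeries df["income"] or
df["threshold"] before the wrapped function runs.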
548 | 
549 | :param kwarg_names: Keyword argument names to restrict to.
550 | :type kwarg_names: str
551 | """
552 | 
553 | def arg_series_decorator(fn):
554 | @wraps(fn)
555 | def series_function(self, *args, **kwargs):
556 | new_args = []
557 | new_kwargs = {}
558 | if len(kwarg_names) == 0:
559 | for value in args:
560 | if isinstance(value, str):
561 | if value not in self.columns:
562 | raise Exception("Column not found")
563 | new_args += [self[value]]
564 | else:
565 | new_args += [value]
566 | for name, value in kwargs.items():
567 | if isinstance(value, str) and (
568 | len(kwarg_names) == 0 or name in kwarg_names
569 | ):
570 | if value not in self.columns:
571 | raise Exception("Column not found")
572 | new_kwargs[name] = self[value]
573 | else:
574 | new_kwargs[name] = value
575 | return fn(self, *new_args, **new_kwargs)
576 | 
577 | return series_function
578 | 
579 | return arg_series_decorator
580 | 
581 | def __setitem__(self, *args, **kwargs):
582 | super().__setitem__(*args, **kwargs)
583 | self._link_all_weights()
584 | 
585 | def _link_weights(self, column):
586 | # self[column] = ... triggers __setitem__, which forces pd.Series
587 | # this workaround avoids that
588 | self[column].__class__ = MicroSeries
589 | self[column].set_weights(self.weights)
590 | 
591 | def _link_all_weights(self):
592 | if self.weights is None:
593 | self.set_weights(np.ones((len(self))))
594 | for column in self.columns:
595 | if column != self.weights_col:
596 | self._link_weights(column)
597 | 
598 | def set_weights(self, weights) -> None:
599 | """Sets the weights for the MicroDataFrame. If a string is received,
600 | it will be assumed to be the column name of the weight column.
601 | 
602 | :param weights: Array of weights.
603 | :type weights: np.array
604 | """
605 | if isinstance(weights, str):
606 | self.weights_col = weights
607 | self.weights = pd.Series(self[weights], dtype=float)
608 | elif weights is not None:
609 | self.weights_col = None
610 | with warnings.catch_warnings():
611 | warnings.filterwarnings("ignore", category=UserWarning)
612 | self.weights = pd.Series(weights, dtype=float)
613 | self._link_all_weights()
614 | 
615 | def set_weight_col(self, column: str) -> None:
616 | """Sets the weights for the MicroDataFrame by specifying the name of
617 | the weight column.
618 | 
619 | :param column: Name of the weight column.
620 | :type column: str
621 | """
622 | self.weights = np.array(self[column])
623 | self.weights_col = column  # Matches the attribute read by _link_all_weights().
624 | self._link_all_weights()
625 | 
626 | def __getitem__(self, key):
627 | result = super().__getitem__(key)
628 | if isinstance(result, pd.DataFrame):
629 | try:
630 | weights = self.weights[key]
631 | except Exception:
632 | weights = self.weights
633 | return MicroDataFrame(result, weights=weights)
634 | return result
635 | 
636 | def catch_series_relapse(self):
637 | for col in self.columns:
638 | if self[col].__class__ == pd.Series:
639 | self._link_weights(col)
640 | 
641 | def __setattr__(self, key, value):
642 | super().__setattr__(key, value)
643 | self.catch_series_relapse()
644 | 
645 | def reset_index(self):
646 | res = super().reset_index()
647 | res = MicroDataFrame(res, weights=self.weights)
648 | return res
649 | 
650 | def copy(self, deep=True):
651 | res = super().copy(deep)
652 | # This changes the original columns to Series.
Undo it:
653 | for col in self.columns:
654 | self[col] = MicroSeries(self[col])
655 | res = MicroDataFrame(res, weights=self.weights.copy(deep))
656 | return res
657 | 
658 | def equals(self, other) -> bool:
659 | equal_values = super().equals(other)
660 | equal_weights = self.weights.equals(other.weights)
661 | return equal_values and equal_weights
662 | 
663 | @get_args_as_micro_series()
664 | def groupby(self, by: Union[str, list], *args, **kwargs):
665 | """
666 | Returns a GroupBy object with MicroSeriesGroupBy objects for
667 | each column
668 | 
669 | :param by: column to group by
670 | :type by: Union[str, list]
671 | 
672 | :return: DataFrameGroupBy object with columns using weights
673 | :rtype: DataFrameGroupBy
674 | """
675 | self["__tmp_weights"] = self.weights
676 | gb = super().groupby(by, *args, **kwargs)
677 | weights = copy.deepcopy(gb["__tmp_weights"])
678 | for col in self.columns:  # df.groupby(...)[col]s use weights
679 | res = gb[col]
680 | res.__class__ = MicroSeriesGroupBy
681 | res._init()
682 | res.weights = weights
683 | setattr(gb, col, res)
684 | gb.__class__ = MicroDataFrameGroupBy
685 | gb._init(by)
686 | return gb
687 | 
688 | @get_args_as_micro_series()
689 | def poverty_rate(self, income: str, threshold: str) -> float:
690 | """Calculate poverty rate, i.e., the population share with income
691 | below their poverty threshold.
692 | 
693 | :param income: Column indicating income.
694 | :type income: str
695 | :param threshold: Column indicating threshold.
696 | :type threshold: str
697 | :return: Poverty rate between zero and one.
698 | :rtype: float
699 | """
700 | pov = income < threshold
701 | return pov.sum() / pov.count()
702 | 
703 | @get_args_as_micro_series()
704 | def deep_poverty_rate(self, income: str, threshold: str) -> float:
705 | """Calculate deep poverty rate, i.e., the population share with income
706 | below half their poverty threshold.
707 | 
708 | :param income: Column indicating income.
709 | :type income: str
710 | :param threshold: Column indicating threshold.
711 | :type threshold: str
712 | :return: Deep poverty rate between zero and one.
713 | :rtype: float
714 | """
715 | pov = income < (threshold / 2)
716 | return pov.sum() / pov.count()
717 | 
718 | @get_args_as_micro_series()
719 | def poverty_gap(self, income: str, threshold: str) -> float:
720 | """Calculate poverty gap, i.e., the total gap between income and
721 | poverty thresholds for all people in poverty.
722 | 
723 | :param income: Column indicating income.
724 | :type income: str
725 | :param threshold: Column indicating threshold.
726 | :type threshold: str
727 | :return: Poverty gap.
728 | :rtype: float
729 | """
730 | gaps = (threshold - income)[threshold > income]
731 | return gaps.sum()
732 | 
733 | @get_args_as_micro_series()
734 | def deep_poverty_gap(self, income: str, threshold: str) -> float:
735 | """Calculate deep poverty gap, i.e., the total gap between income and
736 | half of poverty thresholds for all people in deep poverty.
737 | 
738 | :param income: Column indicating income.
739 | :type income: str
740 | :param threshold: Column indicating threshold.
741 | :type threshold: str
742 | :return: Deep poverty gap.
743 | :rtype: float
744 | """
745 | deep_threshold = threshold / 2
746 | gaps = (deep_threshold - income)[deep_threshold > income]
747 | return gaps.sum()
748 | 
749 | @get_args_as_micro_series()
750 | def squared_poverty_gap(self, income: str, threshold: str) -> float:
751 | """Calculate squared poverty gap, i.e., the total squared gap between
752 | income and poverty thresholds for all people in poverty.
753 | Also known as the poverty severity index.
754 | 
755 | :param income: Column indicating income.
756 | :type income: str
757 | :param threshold: Column indicating threshold.
758 | :type threshold: str
759 | :return: Squared poverty gap.
760 | :rtype: float
761 | """
762 | gaps = (threshold - income)[threshold > income]
763 | squared_gaps = gaps ** 2
764 | return squared_gaps.sum()
765 | 
766 | @get_args_as_micro_series()
767 | def poverty_count(
768 | self,
769 | income: Union[MicroSeries, str],
770 | threshold: Union[MicroSeries, str],
771 | ) -> int:
772 | """
773 | Calculates the number of entities with income below a poverty
774 | threshold.
775 | 
776 | :param income: income array or column name
777 | :type income: Union[MicroSeries, str]
778 | 
779 | :param threshold: threshold array or column name
780 | :type threshold: Union[MicroSeries, str]
781 | 
782 | :return: number of entities in poverty
783 | :rtype: int
784 | """
785 | in_poverty = income < threshold
786 | return in_poverty.sum()
787 | 
788 | def __repr__(self):
789 | df = pd.DataFrame(self)
790 | df["weight"] = self.weights
791 | return df[[df.columns[-1]] + list(df.columns[:-1])].__repr__()
792 | 
--------------------------------------------------------------------------------
/microdf/income_measures.py:
--------------------------------------------------------------------------------
1 | import microdf as mdf
2 | 
3 | # See
4 | # https://docs.google.com/spreadsheets/d/1I-Qe8uD58bLnPkimc9eaPgs4AE7x5FZYmTZwVX_WyT8
5 | # for a comparison of income measures used here.
6 | 
7 | 
8 | def cash_income(df):
9 | """Calculates income after taxes and cash transfers.
10 | 
11 | Defined as aftertax_income minus non-cash benefits.
12 | 
13 | :param df: A Tax-Calculator pandas DataFrame with columns for
14 | * aftertax_income
15 | * housing_ben
16 | * mcaid_ben
17 | * mcare_ben
18 | * other_ben
19 | * snap_ben
20 | * ssi_ben
21 | * tanf_ben
22 | * vet_ben
23 | * wic_ben
24 | :returns: A pandas Series with the cash income for each row in df.
25 | 
26 | """
27 | return (
28 | df.aftertax_income
29 | - (1 - mdf.HOUSING_CASH_SHARE) * df.housing_ben
30 | - (1 - mdf.MCAID_CASH_SHARE) * df.mcaid_ben
31 | - (1 - mdf.MCARE_CASH_SHARE) * df.mcare_ben
32 | - (1 - mdf.OTHER_CASH_SHARE) * df.other_ben
33 | - (1 - mdf.SNAP_CASH_SHARE) * df.snap_ben
34 | - (1 - mdf.SSI_CASH_SHARE) * df.ssi_ben
35 | - (1 - mdf.TANF_CASH_SHARE) * df.tanf_ben
36 | - (1 - mdf.VET_CASH_SHARE) * df.vet_ben
37 | - (1 - mdf.WIC_CASH_SHARE) * df.wic_ben
38 | )
39 | 
40 | 
41 | def tpc_eci(df):
42 | """Approximates Tax Policy Center's Expanded Cash Income measure.
43 | 
44 | Subtracts WIC, housing assistance, veteran's benefits, Medicare, and
45 | Medicaid from expanded_income. ECI adds income measures not modeled in
46 | Tax-Calculator, so these are ignored and will create a discrepancy
47 | compared to TPC's ECI.
48 | 
49 | :param df: DataFrame with columns from Tax-Calculator.
50 | :returns: pandas Series with TPC's ECI.
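A minimal usage sketch (column names assume Tax-Calculator output):

    df["tpc_eci"] = mdf.tpc_eci(df)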
51 | 52 | """ 53 | return df.expanded_income - df[mdf.ECI_REMOVE_COLS].sum(axis=1) 54 | 55 | 56 | def market_income(df): 57 | """Approximates CBO's market income concept, which is income 58 | before social insurance, means-tested transfers, and taxes. 59 | 60 | :param df: DataFrame with expanded_income and benefits. 61 | :returns: pandas Series of the same length as df. 62 | 63 | """ 64 | return df.expanded_income - df[mdf.BENS].sum(axis=1) 65 | -------------------------------------------------------------------------------- /microdf/inequality.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import microdf as mdf 4 | 5 | 6 | def gini(df, col, w=None, negatives=None, groupby=None): 7 | """Calculates Gini index. 8 | 9 | :param df: DataFrame. 10 | :param col: Name of column in df representing value. 11 | :param w: Column representing weight in df. 12 | :param negatives: An optional string indicating how to treat negative 13 | values of x: 14 | 'zero' replaces negative values with zeroes. 15 | 'shift' subtracts the minimum value from all values of x, 16 | when this minimum is negative. That is, it adds the absolute 17 | minimum value. 18 | Defaults to None, which leaves negative values as they are. 19 | :param groupby: Column, or list of columns, to group by. 20 | :returns: A float, the Gini index. 21 | 22 | """ 23 | 24 | def _gini(df, col, w=None, negatives=None): 25 | # Requires float numpy arrays (not pandas Series or lists) to work. 26 | x = np.array(df[col]).astype("float") 27 | if negatives == "zero": 28 | x[x < 0] = 0 29 | if negatives == "shift" and np.amin(x) < 0: 30 | x -= np.amin(x) 31 | if w is not None: 32 | w = np.array(df[w]).astype("float") 33 | sorted_indices = np.argsort(x) 34 | sorted_x = x[sorted_indices] 35 | sorted_w = w[sorted_indices] 36 | cumw = np.cumsum(sorted_w) 37 | cumxw = np.cumsum(sorted_x * sorted_w) 38 | return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / ( 39 | cumxw[-1] * cumw[-1] 40 | ) 41 | else: 42 | sorted_x = np.sort(x) 43 | n = len(x) 44 | cumxw = np.cumsum(sorted_x) 45 | # The above formula, with all weights equal to 1 simplifies to: 46 | return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n 47 | 48 | if groupby is None: 49 | return _gini(df, col, w, negatives) 50 | return df.groupby(groupby).apply(lambda x: _gini(x, col, w, negatives)) 51 | 52 | 53 | def top_x_pct_share(df, col, top_x_pct, w=None, groupby=None): 54 | """Calculates top x% share. 55 | 56 | :param df: DataFrame. 57 | :param col: Name of column in df representing value. 58 | :param top_x_pct: Decimal between 0 and 1 of the top %, e.g. 0.1, 0.001. 59 | :param w: Column representing weight in df. 60 | :param groupby: Column, or list of columns, to group by. 61 | :returns: The share of w-weighted val held by the top x%. 62 | 63 | """ 64 | 65 | def _top_x_pct_share(df, col, top_x_pct, w=None): 66 | threshold = mdf.weighted_quantile(df, col, w, 1 - top_x_pct) 67 | top_x_pct_sum = mdf.weighted_sum(df[df[col] >= threshold], col, w) 68 | total_sum = mdf.weighted_sum(df, col, w) 69 | return top_x_pct_sum / total_sum 70 | 71 | if groupby is None: 72 | return _top_x_pct_share(df, col, top_x_pct, w) 73 | return df.groupby(groupby).apply( 74 | lambda x: _top_x_pct_share(x, col, top_x_pct, w) 75 | ) 76 | 77 | 78 | def bottom_x_pct_share(df, col, bottom_x_pct, w=None, groupby=None): 79 | """Calculates bottom x% share. 80 | 81 | :param df: DataFrame. 82 | :param col: Name of column in df representing value. 
83 | :param bottom_x_pct: Decimal between 0 and 1 of the bottom %, e.g. 0.1, 0.001.
84 | :param w: Column representing weight in df.
85 | :param groupby: Column, or list of columns, to group by.
86 | :returns: The share of w-weighted val held by the bottom x%.
87 | 
88 | """
89 | return 1 - top_x_pct_share(df, col, 1 - bottom_x_pct, w, groupby)
90 | 
91 | 
92 | def bottom_50_pct_share(df, col, w=None, groupby=None):
93 | """Calculates bottom 50% share.
94 | 
95 | :param df: DataFrame.
96 | :param col: Name of column in df representing value.
97 | :param w: Column representing weight in df.
98 | :param groupby: Column, or list of columns, to group by.
99 | :returns: The share of w-weighted val held by the bottom 50%.
100 | 
101 | """
102 | return bottom_x_pct_share(df, col, 0.5, w, groupby)
103 | 
104 | 
105 | def top_50_pct_share(df, col, w=None, groupby=None):
106 | """Calculates top 50% share.
107 | 
108 | :param df: DataFrame.
109 | :param col: Name of column in df representing value.
110 | :param w: Column representing weight in df.
111 | :param groupby: Column, or list of columns, to group by.
112 | :returns: The share of w-weighted val held by the top 50%.
113 | 
114 | """
115 | return top_x_pct_share(df, col, 0.5, w, groupby)
116 | 
117 | 
118 | def top_10_pct_share(df, col, w=None, groupby=None):
119 | """Calculates top 10% share.
120 | 
121 | :param df: DataFrame.
122 | :param col: Name of column in df representing value.
123 | :param w: Column representing weight in df.
124 | :param groupby: Column, or list of columns, to group by.
125 | :returns: The share of w-weighted val held by the top 10%.
126 | 
127 | """
128 | return top_x_pct_share(df, col, 0.1, w, groupby)
129 | 
130 | 
131 | def top_1_pct_share(df, col, w=None, groupby=None):
132 | """Calculates top 1% share.
133 | 
134 | :param df: DataFrame.
135 | :param col: Name of column in df representing value.
136 | :param w: Column representing weight in df.
137 | :param groupby: Column, or list of columns, to group by.
138 | :returns: The share of w-weighted val held by the top 1%.
139 | 
140 | """
141 | return top_x_pct_share(df, col, 0.01, w, groupby)
142 | 
143 | 
144 | def top_0_1_pct_share(df, col, w=None, groupby=None):
145 | """Calculates top 0.1% share.
146 | 
147 | :param df: DataFrame.
148 | :param col: Name of column in df representing value.
149 | :param w: Column representing weight in df.
150 | :param groupby: Column, or list of columns, to group by.
151 | :returns: The share of w-weighted val held by the top 0.1%.
152 | 
153 | """
154 | return top_x_pct_share(df, col, 0.001, w, groupby)
155 | 
156 | 
157 | def t10_b50(df, col, w=None, groupby=None):
158 | """Calculates ratio between the top 10% and bottom 50% shares.
159 | 
160 | :param df: DataFrame.
161 | :param col: Name of column in df representing value.
162 | :param w: Column representing weight in df.
163 | :param groupby: Column, or list of columns, to group by.
164 | :returns: The share of w-weighted val held by the top 10% divided by
165 | the share of w-weighted val held by the bottom 50%.
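Example (a sketch; "income" and "s006" are assumed column names):

    mdf.t10_b50(df, "income", w="s006")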
166 | 167 | """ 168 | t10 = top_10_pct_share(df, col, w, groupby) 169 | b50 = bottom_50_pct_share(df, col, w, groupby) 170 | return t10 / b50 171 | -------------------------------------------------------------------------------- /microdf/io.py: -------------------------------------------------------------------------------- 1 | import io 2 | import zipfile 3 | import requests 4 | import pandas as pd 5 | 6 | HEADER = { 7 | "User-Agent": 8 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) " + 9 | "AppleWebKit/537.36 (KHTML, like Gecko) " + 10 | "Chrome/50.0.2661.102 Safari/537.36" 11 | } 12 | 13 | 14 | def read_stata_zip(url: str, **kwargs) -> pd.DataFrame: 15 | """Reads zipped Stata file by URL. 16 | 17 | From https://stackoverflow.com/a/59122689/1840471 18 | 19 | Pending native support in 20 | https://github.com/pandas-dev/pandas/issues/26599. 21 | 22 | :param url: URL string of .zip file containing a single 23 | .dta file. 24 | :param **kwargs: Arguments passed to pandas.read_stata(). 25 | :returns: DataFrame. 26 | 27 | """ 28 | r = requests.get(url, headers=HEADER) 29 | data = io.BytesIO(r.content) 30 | with zipfile.ZipFile(data) as archive: 31 | with archive.open(archive.namelist()[0]) as stata: 32 | return pd.read_stata(stata, **kwargs) 33 | -------------------------------------------------------------------------------- /microdf/poverty.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def fpl(people: int): 6 | """Calculates the federal poverty guideline for a household of a certain 7 | size. 8 | 9 | :param XTOT: The number of people in the household. 10 | :param people: returns: The federal poverty guideline for the contiguous 11 | 48 states. 12 | :returns: The federal poverty guideline for the contiguous 48 states. 13 | 14 | """ 15 | return 7820 + 4320 * people 16 | 17 | 18 | def poverty_rate( 19 | df: pd.DataFrame, income: str, threshold: str, w: str = None 20 | ) -> float: 21 | """Calculate poverty rate, i.e., the population share with income 22 | below their poverty threshold. 23 | 24 | :param df: DataFrame with income, threshold, and possibly weight columns 25 | for each person/household. 26 | :type df: pd.DataFrame 27 | :param income: Column indicating income. 28 | :type income: str 29 | :param threshold: Column indicating threshold. 30 | :type threshold: str 31 | :param w: Column indicating weight, defaults to None (unweighted). 32 | :type w: str, optional 33 | :return: Poverty rate between zero and one. 34 | :rtype: float 35 | """ 36 | pov = df[income] < df[threshold] 37 | if w is None: 38 | return pov.mean() 39 | return (pov * df[w]).sum() / df[w].sum() 40 | 41 | 42 | def deep_poverty_rate( 43 | df: pd.DataFrame, income: str, threshold: str, w: str = None 44 | ) -> float: 45 | """Calculate deep poverty rate, i.e., the population share with income 46 | below half their poverty threshold. 47 | 48 | :param df: DataFrame with income, threshold, and possibly weight columns 49 | for each person/household. 50 | :type df: pd.DataFrame 51 | :param income: Column indicating income. 52 | :type income: str 53 | :param threshold: Column indicating threshold. 54 | :type threshold: str 55 | :param w: Column indicating weight, defaults to None (unweighted). 56 | :type w: str, optional 57 | :return: Deep poverty rate between zero and one. 
58 | :rtype: float 59 | """ 60 | pov = df[income] < df[threshold] / 2 61 | if w is None: 62 | return pov.mean() 63 | return (pov * df[w]).sum() / df[w].sum() 64 | 65 | 66 | def poverty_gap( 67 | df: pd.DataFrame, income: str, threshold: str, w: str = None 68 | ) -> float: 69 | """Calculate poverty gap, i.e., the total gap between income and poverty 70 | thresholds for all people in poverty. 71 | 72 | :param df: DataFrame with income, threshold, and possibly weight columns 73 | for each household (data should represent households, not persons). 74 | :type df: pd.DataFrame 75 | :param income: Column indicating income. 76 | :type income: str 77 | :param threshold: Column indicating threshold. 78 | :type threshold: str 79 | :param w: Column indicating weight, defaults to None (unweighted). 80 | :type w: str, optional 81 | :return: Poverty gap. 82 | :rtype: float 83 | """ 84 | gap = np.maximum(df[threshold] - df[income], 0) 85 | if w is None: 86 | return gap.sum() 87 | return (gap * df[w]).sum() 88 | 89 | 90 | def squared_poverty_gap( 91 | df: pd.DataFrame, income: str, threshold: str, w: str = None 92 | ) -> float: 93 | """Calculate squared poverty gap, i.e., the total squared gap between 94 | income and poverty thresholds for all people in poverty. 95 | Also known as poverty severity index. 96 | 97 | :param df: DataFrame with income, threshold, and possibly weight columns 98 | for each household (data should represent households, not persons). 99 | :type df: pd.DataFrame 100 | :param income: Column indicating income. 101 | :type income: str 102 | :param threshold: Column indicating threshold. 103 | :type threshold: str 104 | :param w: Column indicating weight, defaults to None (unweighted). 105 | :type w: str, optional 106 | :return: Squared poverty gap. 107 | :rtype: float 108 | """ 109 | gap = np.maximum(df[threshold] - df[income], 0) 110 | sq_gap = np.power(gap, 2) 111 | if w is None: 112 | return sq_gap.sum() 113 | return (sq_gap * df[w]).sum() 114 | 115 | 116 | def deep_poverty_gap( 117 | df: pd.DataFrame, income: str, threshold: str, w: str = None 118 | ) -> float: 119 | """Calculate deep poverty gap, i.e., the total gap between income and 120 | halved poverty thresholds for all people in deep poverty. 121 | 122 | :param df: DataFrame with income, threshold, and possibly weight columns 123 | for each household (data should represent households, not persons). 124 | :type df: pd.DataFrame 125 | :param income: Column indicating income. 126 | :type income: str 127 | :param threshold: Column indicating threshold. 128 | :type threshold: str 129 | :param w: Column indicating weight, defaults to None (unweighted). 130 | :type w: str, optional 131 | :return: Deep poverty gap. 132 | :rtype: float 133 | """ 134 | gap = np.maximum((df[threshold] / 2) - df[income], 0) 135 | if w is None: 136 | return gap.sum() 137 | return (gap * df[w]).sum() 138 | -------------------------------------------------------------------------------- /microdf/style.py: -------------------------------------------------------------------------------- 1 | TITLE_COLOR = "#212121" 2 | AXIS_COLOR = "#757575" 3 | GRID_COLOR = "#eeeeee" # Previously lighter #f5f5f5. 4 | DPI = 200 5 | 6 | 7 | def set_plot_style(dpi: int = DPI): 8 | """Set plot style. 9 | 10 | :param dpi: DPI for saving and displaying figures, defaults to microdf.DPI 11 | (200). 
12 | :type dpi: int, optional
13 | """
14 | try:
15 | import seaborn as sns
16 | import matplotlib as mpl
17 | import matplotlib.font_manager as fm
18 | except ImportError:
19 | raise ImportError(
20 | "The function you've called requires extra dependencies. " +
21 | "Please install microdf with the 'charts' extra by running " +
22 | "'pip install microdf[charts]'"
23 | )
24 | 
25 | sns.set_style("white")
26 | 
27 | # Set up Roboto. Must be downloaded in the current directory.
28 | # See https://stackoverflow.com/a/51844978/1840471. (createFontList is removed in matplotlib >= 3.5; fm.fontManager.addfont is its replacement there.)
29 | fm.fontManager.ttflist += fm.createFontList(["Roboto-Regular.ttf"])
30 | 
31 | STYLE = {
32 | "savefig.dpi": dpi,
33 | "figure.dpi": dpi,
34 | "figure.figsize": (6.4, 4.8), # Default.
35 | "font.sans-serif": "Roboto",
36 | "font.family": "sans-serif",
37 | # Set title text color to dark gray (https://material.io/color) not
38 | # black.
39 | "text.color": TITLE_COLOR,
40 | # Axis titles and tick marks are medium gray.
41 | "axes.labelcolor": AXIS_COLOR,
42 | "xtick.color": AXIS_COLOR,
43 | "ytick.color": AXIS_COLOR,
44 | # Grid is light gray.
45 | "axes.grid": True,
46 | "grid.color": GRID_COLOR,
47 | # Equivalent to seaborn.despine(left=True, bottom=True).
48 | "axes.spines.left": False,
49 | "axes.spines.right": False,
50 | "axes.spines.top": False,
51 | "axes.spines.bottom": False,
52 | }
53 | 
54 | mpl.rcParams.update(STYLE)
55 | 
--------------------------------------------------------------------------------
/microdf/tax.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | 
4 | 
5 | def mtr(val, brackets, rates):
6 | """Calculates the marginal tax rate applied to a value depending on a
7 | tax schedule.
8 | 
9 | :param val: Value to assess tax on, e.g. wealth or income (list or Series).
10 | :param brackets: Left side of each bracket (list or Series).
11 | :param rates: Rate corresponding to each bracket.
12 | :returns: Series of the size of val representing the marginal tax rate.
13 | 
14 | """
15 | df_tax = pd.DataFrame({"brackets": brackets, "rates": rates})
16 | df_tax["base_tax"] = (
17 | df_tax.brackets.sub(df_tax.brackets.shift(fill_value=0))
18 | .mul(df_tax.rates.shift(fill_value=0))
19 | .cumsum()
20 | )
21 | rows = df_tax.brackets.searchsorted(val, side="right") - 1
22 | income_bracket_df = df_tax.loc[rows].reset_index(drop=True)
23 | return income_bracket_df.rates
24 | 
25 | 
26 | def tax_from_mtrs(
27 | val,
28 | brackets,
29 | rates,
30 | avoidance_rate=0,
31 | avoidance_elasticity=0,
32 | avoidance_elasticity_flat=0,
33 | ):
34 | """Calculates tax liability based on a marginal tax rate schedule.
35 | 
36 | :param val: Value to assess tax on, e.g. wealth or income (list or Series).
37 | :param brackets: Left side of each bracket (list or Series).
38 | :param rates: Rate corresponding to each bracket.
39 | :param avoidance_rate: Constant avoidance/evasion rate as a decimal
40 | fraction (e.g., 0.1 for 10%). Defaults to zero.
41 | :param avoidance_elasticity: Avoidance/evasion elasticity.
42 | Response of log taxable value with respect
43 | to tax rate.
44 | Defaults to zero. Should be positive.
45 | :param avoidance_elasticity_flat: Response of taxable value with respect
46 | to tax rate.
47 | Use avoidance_elasticity in most cases.
48 | Defaults to zero. Should be positive.
49 | :returns: Series of tax liabilities with the same size as val.
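Example (values mirror this package's test suite): with brackets
[0, 10e3] and rates [0, 0.1], an income of 20e3 owes
(20e3 - 10e3) * 0.1 = 1e3, and an income of 5e3 owes nothing.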
50 | 51 | """ 52 | assert ( 53 | avoidance_rate == 0 54 | or avoidance_elasticity == 0 55 | or avoidance_elasticity_flat == 0 56 | ), "Cannot supply multiple avoidance parameters." 57 | assert ( 58 | avoidance_elasticity >= 0 59 | ), "Provide nonnegative avoidance_elasticity." 60 | df_tax = pd.DataFrame({"brackets": brackets, "rates": rates}) 61 | df_tax["base_tax"] = ( 62 | df_tax.brackets.sub(df_tax.brackets.shift(fill_value=0)) 63 | .mul(df_tax.rates.shift(fill_value=0)) 64 | .cumsum() 65 | ) 66 | if avoidance_rate == 0: # Only need MTRs if elasticity is supplied. 67 | mtrs = mtr(val, brackets, rates) 68 | if avoidance_elasticity > 0: 69 | avoidance_rate = 1 - np.exp(-avoidance_elasticity * mtrs) 70 | if avoidance_elasticity_flat > 0: 71 | avoidance_rate = avoidance_elasticity_flat * mtrs 72 | taxable = pd.Series(val) * (1 - avoidance_rate) 73 | rows = df_tax.brackets.searchsorted(taxable, side="right") - 1 74 | income_bracket_df = df_tax.loc[rows].reset_index(drop=True) 75 | return ( 76 | pd.Series(taxable) 77 | .sub(income_bracket_df.brackets) 78 | .mul(income_bracket_df.rates) 79 | .add(income_bracket_df.base_tax) 80 | ) 81 | -------------------------------------------------------------------------------- /microdf/taxcalc.py: -------------------------------------------------------------------------------- 1 | import microdf as mdf 2 | from microdf._optional import import_optional_dependency 3 | 4 | 5 | def static_baseline_calc(recs, year): 6 | """Creates a static Calculator object. 7 | 8 | :param recs: Records object. 9 | :param year: Year to advance to. 10 | :returns: Calculator object. 11 | 12 | """ 13 | tc = import_optional_dependency("taxcalc") 14 | calc = tc.Calculator(records=recs, policy=tc.Policy()) 15 | calc.advance_to_year(year) 16 | calc.calc_all() 17 | return calc 18 | 19 | 20 | def add_weighted_metrics(df, metric_vars, w="s006", divisor=1e6, suffix="_m"): 21 | """Adds weighted metrics in millions to a Tax-Calculator pandas DataFrame. 22 | 23 | Columns are renamed to *_m. 24 | 25 | :param df: A pandas DataFrame containing Tax-Calculator data. 26 | :param metric_vars: A list of column names to weight, or a single column 27 | name. 28 | :param w: Weight column. Defaults to s006. 29 | :param divisor: Number by which the product is divided. Defaults to 1e6. 30 | :param suffix: Suffix to add to each weighted total. Defaults to '_m' 31 | to match divisor default of 1e6. 32 | :returns: Nothing. Weighted columns are added in place. 33 | 34 | """ 35 | df[w + suffix] = df[w] / divisor 36 | metric_vars = mdf.listify(metric_vars) 37 | for metric_var in metric_vars: 38 | df[metric_var + suffix] = df[metric_var] * df[w + suffix] 39 | 40 | 41 | def n65(age_head, age_spouse, elderly_dependents): 42 | """Calculates number of people in the tax unit age 65 or older. 43 | 44 | :param age_head: Series representing age_head from taxcalc data. 45 | :param age_spouse: Series representing age_spouse from taxcalc data. 46 | :param elderly_dependents: Series representing elderly_dependents from 47 | taxcalc data. 48 | :returns: Series representing the number of people age 65 or older. 49 | 50 | """ 51 | return ( 52 | (age_head >= 65).astype(int) 53 | + (age_spouse >= 65).astype(int) 54 | + elderly_dependents 55 | ) 56 | 57 | 58 | def calc_df( 59 | records=None, 60 | policy=None, 61 | year=2020, 62 | reform=None, 63 | group_vars=None, 64 | metric_vars=None, 65 | group_n65=False, 66 | ): 67 | """Creates a pandas DataFrame for given Tax-Calculator data. 
68 | 
69 | s006 is always included, and RECID is used as an index.
70 | 
71 | :param records: An optional Records object. If not provided, uses CPS
72 | records. (Default value = None)
73 | :param policy: An optional Policy object. If not provided, uses default
74 | Policy.
75 | :param year: An optional year to advance to. If not provided, defaults to
76 | 2020.
77 | :param reform: An optional reform to implement for the Policy object.
78 | (Default value = None)
79 | :param group_vars: An optional list of column names to include in the
80 | DataFrame. (Default value = None)
81 | :param metric_vars: An optional list of column names to include and
82 | calculate weighted sums of (in millions named as *_m) in the DataFrame.
83 | (Default value = None)
84 | :param group_n65: Whether to calculate and group by n65. Defaults to False.
85 | :returns: A pandas DataFrame. market_income is also always calculated.
86 | 
87 | """
88 | tc = import_optional_dependency("taxcalc")
89 | # Assign defaults.
90 | if records is None:
91 | records = tc.Records.cps_constructor()
92 | if policy is None:
93 | policy = tc.Policy()
94 | if reform is not None:
95 | policy.implement_reform(reform)
96 | # Calculate.
97 | calc = tc.Calculator(records=records, policy=policy, verbose=False)
98 | calc.advance_to_year(year)
99 | calc.calc_all()
100 | # Get a deduplicated list of all columns.
101 | if group_n65:
102 | group_vars = (group_vars or []) + [
103 | "age_head",
104 | "age_spouse",
105 | "elderly_dependents",
106 | ]
107 | # Include expanded_income and benefits to produce market_income.
108 | all_cols = mdf.listify(
109 | [
110 | "RECID",
111 | "s006",
112 | "expanded_income",
113 | "aftertax_income",
114 | mdf.BENS,
115 | group_vars,
116 | metric_vars,
117 | ]
118 | )
119 | df = calc.dataframe(all_cols)
120 | # Create core elements.
121 | df["market_income"] = mdf.market_income(df)
122 | df["bens"] = df[mdf.BENS].sum(axis=1)
123 | df["tax"] = df.expanded_income - df.aftertax_income
124 | if group_n65:
125 | df["n65"] = n65(df.age_head, df.age_spouse, df.elderly_dependents)
126 | df.drop(
127 | ["age_head", "age_spouse", "elderly_dependents"],
128 | axis=1,
129 | inplace=True,
130 | )
131 | # Add calculated columns for metrics.
132 | mdf.add_weighted_metrics(df, metric_vars)
133 | # Set RECID to int and set it as index before returning.
134 | df["RECID"] = df.RECID.map(int)
135 | return df.set_index("RECID")
136 | 
137 | 
138 | def recalculate(df):
139 | """Recalculates fields in the DataFrame after components have changed.
140 | 
141 | :param df: DataFrame for use in microdf.
142 | :returns: Nothing. Updates the DataFrame in place.
143 | 
144 | """
145 | # Recalculate TPC's Expanded Cash Income measure.
146 | cols = df.columns
147 | if "tpc_eci" in cols:
148 | df.tpc_eci = mdf.tpc_eci(df)
149 | # Recalculate weighted metrics (anything ending in _m).
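# (Each *_m column is rebuilt from its unsuffixed source column, so
# e.g. aftertax_income_m is recomputed from aftertax_income; s006_m is
# excluded because add_weighted_metrics refreshes it from s006 itself.)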
150 | mcols = [c[:-2] for c in cols if c.endswith("_m") and c != "s006_m"]
151 | mdf.add_weighted_metrics(df, mcols)
152 | 
--------------------------------------------------------------------------------
/microdf/tests/__pycache__/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "/home/mghenis/anaconda3/bin/python3"
3 | }
--------------------------------------------------------------------------------
/microdf/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import pytest
4 | 
5 | 
6 | @pytest.fixture(scope="session")
7 | def tests_path():
8 | """ """
9 | return os.path.abspath(os.path.dirname(__file__))
10 | 
--------------------------------------------------------------------------------
/microdf/tests/test_compare.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | 
6 | import microdf as mdf
7 | 
8 | 
9 | def differences(actual, expected, f_actual, f_expected):
10 | """Check for differences between actual and expected DataFrames,
11 | pointing to their CSV files on failure.
12 | 
13 | :param actual: Actual DataFrame.
14 | :param expected: Expected DataFrame.
15 | :param f_actual: Filename of the actual CSV.
16 | :param f_expected: Filename of the expected CSV.
17 | """
18 | if not np.allclose(actual, expected):
19 | msg = "COMPARE RESULTS DIFFER\n"
20 | msg += "-------------------------------------------------\n"
21 | msg += "--- NEW RESULTS IN {} FILE ---\n"
22 | msg += "--- if new OK, copy {} to ---\n"
23 | msg += "--- {} ---\n"
24 | msg += "--- and rerun test. ---\n"
25 | msg += "-------------------------------------------------\n"
26 | raise ValueError(msg.format(f_actual, f_actual, f_expected))
27 | 
28 | 
29 | def test_percentile_agg_compare(tests_path):
30 | """
31 | :param tests_path: Folder path to write test results.
32 | """
33 | N = 1000
34 | np.random.seed(0)
35 | df = pd.DataFrame({"val": np.random.rand(N), "w": np.random.rand(N)})
36 | mdf.add_weighted_quantiles(df, "val", "w")
37 | percentile_sum = df.groupby("val_percentile")[["val", "w"]].sum()
38 | F_ACTUAL = "test_percentile_actual.csv"
39 | F_EXPECTED = "test_percentile_expected.csv"
40 | percentile_sum.to_csv(os.path.join(tests_path, F_ACTUAL))
41 | # Re-read as CSV to remove index and ensure CSVs are equal.
41 | actual = pd.read_csv(os.path.join(tests_path, F_ACTUAL)) 42 | expected = pd.read_csv(os.path.join(tests_path, F_EXPECTED)) 43 | differences(actual, expected, F_ACTUAL, F_EXPECTED) 44 | -------------------------------------------------------------------------------- /microdf/tests/test_generic.py: -------------------------------------------------------------------------------- 1 | from microdf.generic import MicroDataFrame, MicroSeries 2 | import numpy as np 3 | import microdf as mdf 4 | import pandas as pd 5 | 6 | 7 | def test_df_init(): 8 | arr = np.array([0, 1, 1]) 9 | w = np.array([3, 0, 9]) 10 | df = mdf.MicroDataFrame({"a": arr}, weights=w) 11 | assert df.a.mean() == np.average(arr, weights=w) 12 | 13 | df = mdf.MicroDataFrame() 14 | df["a"] = arr 15 | df.set_weights(w) 16 | assert df.a.mean() == np.average(arr, weights=w) 17 | 18 | df = mdf.MicroDataFrame() 19 | df["a"] = arr 20 | df["w"] = w 21 | df.set_weight_col("w") 22 | assert df.a.mean() == np.average(arr, weights=w) 23 | 24 | 25 | def test_series_getitem(): 26 | arr = np.array([0, 1, 1]) 27 | w = np.array([3, 0, 9]) 28 | s = mdf.MicroSeries(arr, weights=w) 29 | assert s[[1, 2]].sum() == np.sum(arr[[1, 2]] * w[[1, 2]]) 30 | 31 | assert s[1:3].sum() == np.sum(arr[1:3] * w[1:3]) 32 | 33 | 34 | def test_sum(): 35 | arr = np.array([0, 1, 1]) 36 | w = np.array([3, 0, 9]) 37 | series = mdf.MicroSeries(arr, weights=w) 38 | assert series.sum() == (arr * w).sum() 39 | 40 | arr = np.linspace(-20, 100, 100) 41 | w = np.linspace(1, 3, 100) 42 | series = mdf.MicroSeries(arr) 43 | series.set_weights(w) 44 | assert series.sum() == (arr * w).sum() 45 | 46 | # Verify that an error is thrown when passing weights of different size 47 | # from the values. 48 | w = np.linspace(1, 3, 101) 49 | series = mdf.MicroSeries(arr) 50 | try: 51 | series.set_weights(w) 52 | assert False 53 | except Exception: 54 | pass 55 | 56 | 57 | def test_mean(): 58 | arr = np.array([3, 0, 2]) 59 | w = np.array([4, 1, 1]) 60 | series = mdf.MicroSeries(arr, weights=w) 61 | assert series.mean() == np.average(arr, weights=w) 62 | 63 | arr = np.linspace(-20, 100, 100) 64 | w = np.linspace(1, 3, 100) 65 | series = mdf.MicroSeries(arr) 66 | series.set_weights(w) 67 | assert series.mean() == np.average(arr, weights=w) 68 | 69 | w = np.linspace(1, 3, 101) 70 | series = mdf.MicroSeries(arr) 71 | try: 72 | series.set_weights(w) 73 | assert False 74 | except Exception: 75 | pass 76 | 77 | 78 | def test_poverty_count(): 79 | arr = np.array([10000, 20000, 50000]) 80 | w = np.array([1123, 1144, 2211]) 81 | df = MicroDataFrame(weights=w) 82 | df["income"] = arr 83 | df["threshold"] = 16000 84 | assert df.poverty_count("income", "threshold") == w[0] 85 | 86 | 87 | def test_median(): 88 | # 1, 2, 3, 4, *4*, 4, 5, 5, 5 89 | arr = np.array([1, 2, 3, 4, 5]) 90 | w = np.array([1, 1, 1, 3, 3]) 91 | series = mdf.MicroSeries(arr, weights=w) 92 | assert series.median() == 4 93 | 94 | 95 | def test_unweighted_groupby(): 96 | df = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]}) 97 | assert (df.groupby("x").z.sum().values == np.array([5.0, 6.0])).all() 98 | 99 | 100 | def test_multiple_groupby(): 101 | df = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]}) 102 | assert (df.groupby(["x", "y"]).z.sum() == np.array([5, 6])).all() 103 | 104 | 105 | def test_concat(): 106 | df1 = mdf.MicroDataFrame({"x": [1, 2]}, weights=[3, 4]) 107 | df2 = mdf.MicroDataFrame({"y": [5, 6]}, weights=[7, 8]) 108 | # Verify that pd.concat returns DataFrame (probably no way to fix this). 
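# (pd.concat dispatches through pandas' own constructors, which don't
# carry the weights metadata, so the result degrades to a plain
# DataFrame; mdf.concat below is the weight-preserving alternative.)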
109 | pd_long = pd.concat([df1, df2])
110 | assert isinstance(pd_long, pd.DataFrame)
111 | assert not isinstance(pd_long, mdf.MicroDataFrame)
112 | # Verify that mdf.concat works.
113 | mdf_long = mdf.concat([df1, df2])
114 | assert isinstance(mdf_long, mdf.MicroDataFrame)
115 | # Weights should be preserved.
116 | assert mdf_long.weights.equals(pd.concat([df1.weights, df2.weights]))
117 | # Verify it works horizontally too (take the first set of weights).
118 | mdf_wide = mdf.concat([df1, df2], axis=1)
119 | assert isinstance(mdf_wide, mdf.MicroDataFrame)
120 | assert mdf_wide.weights.equals(df1.weights)
121 | 
122 | 
123 | def test_set_index():
124 | d = mdf.MicroDataFrame(dict(x=[1, 2, 3]), weights=[4, 5, 6])
125 | assert d.x.__class__ == MicroSeries
126 | d.index = [1, 2, 3]
127 | assert d.x.__class__ == MicroSeries
128 | 
129 | 
130 | def test_reset_index():
131 | d = mdf.MicroDataFrame(dict(x=[1, 2, 3]), weights=[4, 5, 6])
132 | assert d.reset_index().__class__ == MicroDataFrame
133 | 
134 | 
135 | def test_cumsum():
136 | s = mdf.MicroSeries([1, 2, 3], weights=[4, 5, 6])
137 | assert np.array_equal(s.cumsum().values, [4, 14, 32])
138 | 
139 | s = mdf.MicroSeries([2, 1, 3], weights=[5, 4, 6])
140 | assert np.array_equal(s.cumsum().values, [10, 14, 32])
141 | 
142 | s = mdf.MicroSeries([3, 1, 2], weights=[6, 4, 5])
143 | assert np.array_equal(s.cumsum().values, [18, 22, 32])
144 | 
145 | 
146 | def test_rank():
147 | s = mdf.MicroSeries([1, 2, 3], weights=[4, 5, 6])
148 | assert np.array_equal(s.rank().values, [4, 9, 15])
149 | 
150 | s = mdf.MicroSeries([3, 1, 2], weights=[6, 4, 5])
151 | assert np.array_equal(s.rank().values, [15, 4, 9])
152 | 
153 | s = mdf.MicroSeries([2, 1, 3], weights=[5, 4, 6])
154 | assert np.array_equal(s.rank().values, [9, 4, 15])
155 | 
156 | 
157 | def test_percentile_rank():
158 | s = mdf.MicroSeries([4, 2, 3, 1], weights=[20, 40, 20, 20])
159 | assert np.array_equal(s.percentile_rank().values, [100, 60, 80, 20])
160 | 
161 | 
162 | def test_quartile_rank():
163 | s = mdf.MicroSeries([4, 2, 3], weights=[25, 50, 25])
164 | assert np.array_equal(s.quartile_rank().values, [4, 2, 3])
165 | 
166 | 
167 | def test_quintile_rank():
168 | s = mdf.MicroSeries([4, 2, 3], weights=[20, 60, 20])
169 | assert np.array_equal(s.quintile_rank().values, [5, 3, 4])
170 | 
171 | 
172 | def test_decile_rank():
173 | s = mdf.MicroSeries(
174 | [5, 4, 3, 2, 1, 6, 7, 8, 9],
175 | weights=[10, 20, 10, 10, 10, 10, 10, 10, 10],
176 | )
177 | assert np.array_equal(s.decile_rank().values, [6, 5, 3, 2, 1, 7, 8, 9, 10])
178 | 
179 | 
180 | def test_copy_equals():
181 | d = mdf.MicroDataFrame(
182 | {"x": [1, 2], "y": [3, 4], "z": [5, 6]}, weights=[7, 8]
183 | )
184 | d_copy = d.copy()
185 | d_copy_diff_weights = d_copy.copy()
186 | d_copy_diff_weights.weights *= 2
187 | assert d.equals(d_copy)
188 | assert not d.equals(d_copy_diff_weights)
189 | # Same for a MicroSeries.
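# (d_copy_diff_weights matches d in values but not weights, so these
# asserts confirm that equals() compares weights as well as values.)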
190 | assert d.x.equals(d_copy.x) 191 | assert not d.x.equals(d_copy_diff_weights.x) 192 | 193 | 194 | def test_subset(): 195 | df = mdf.MicroDataFrame( 196 | {"x": [1, 2], "y": [3, 4], "z": [5, 6]}, weights=[7, 8] 197 | ) 198 | df_no_z = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4]}, weights=[7, 8]) 199 | assert df[["x", "y"]].equals(df_no_z) 200 | df_no_z_diff_weights = df_no_z.copy() 201 | df_no_z_diff_weights.weights += 1 202 | assert not df[["x", "y"]].equals(df_no_z_diff_weights) 203 | 204 | 205 | def test_value_subset(): 206 | d = mdf.MicroDataFrame({"x": [1, 2, 3], "y": [1, 2, 2]}, weights=[4, 5, 6]) 207 | d2 = d[d.y > 1] 208 | assert d2.y.shape == d2.weights.shape 209 | -------------------------------------------------------------------------------- /microdf/tests/test_inequality.py: -------------------------------------------------------------------------------- 1 | import microdf as mdf 2 | 3 | import pandas as pd 4 | 5 | 6 | def test_top_pct(): 7 | x = list(range(1, 11)) # 1 to 10. Sum = 10 * 11 / 2 = 55. 8 | df = pd.DataFrame({"x": x}) 9 | ms = mdf.MicroSeries(x) 10 | RES = 10 / 55 11 | assert mdf.top_10_pct_share(df, "x") == RES 12 | assert ms.top_10_pct_share() == RES 13 | x = list(range(1, 4)) 14 | df = pd.DataFrame({"x": x, "w": x}) 15 | ms = mdf.MicroSeries(x, weights=x) 16 | # This is equivalent to [1, 2, 2, 3, 3, 3] 17 | # Sum = 14, top half is 9. 18 | RES = 9 / 14 19 | assert mdf.top_50_pct_share(df, "x", "w") == RES 20 | assert ms.top_50_pct_share() == RES 21 | -------------------------------------------------------------------------------- /microdf/tests/test_io.py: -------------------------------------------------------------------------------- 1 | import microdf as mdf 2 | 3 | 4 | def test_read_stata_zip(): 5 | """ """ 6 | SCF2016 = "https://www.federalreserve.gov/econres/files/scfp2016s.zip" 7 | COLS = ["wgt", "networth"] 8 | df = mdf.read_stata_zip(SCF2016, columns=COLS) 9 | assert df.columns.tolist() == COLS 10 | assert df.shape[0] > 0 11 | -------------------------------------------------------------------------------- /microdf/tests/test_optional_dependency.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import types 3 | 4 | import pytest 5 | 6 | from microdf._optional import VERSIONS, import_optional_dependency 7 | 8 | 9 | def test_import_optional(): 10 | """ """ 11 | match = "Missing .*notapackage.* pip .* conda .* notapackage" 12 | with pytest.raises(ImportError, match=match): 13 | import_optional_dependency("notapackage") 14 | 15 | result = import_optional_dependency("notapackage", raise_on_missing=False) 16 | assert result is None 17 | 18 | 19 | def test_xlrd_version_fallback(): 20 | """ """ 21 | pytest.importorskip("xlrd") 22 | import_optional_dependency("xlrd") 23 | 24 | 25 | def test_bad_version(): 26 | """ """ 27 | name = "fakemodule" 28 | module = types.ModuleType(name) 29 | module.__version__ = "0.9.0" 30 | sys.modules[name] = module 31 | VERSIONS[name] = "1.0.0" 32 | 33 | match = "microdf requires .*1.0.0.* of .fakemodule.*'0.9.0'" 34 | with pytest.raises(ImportError, match=match): 35 | import_optional_dependency("fakemodule") 36 | 37 | with pytest.warns(UserWarning): 38 | result = import_optional_dependency("fakemodule", on_version="warn") 39 | assert result is None 40 | 41 | module.__version__ = "1.0.0" # exact match is OK 42 | result = import_optional_dependency("fakemodule") 43 | assert result is module 44 | 45 | 46 | def test_no_version_raises(): 47 | """ """ 48 | name = "fakemodule" 49 | 
module = types.ModuleType(name) 50 | sys.modules[name] = module 51 | VERSIONS[name] = "1.0.0" 52 | 53 | with pytest.raises(ImportError, match="Can't determine .* fakemodule"): 54 | import_optional_dependency(name) 55 | -------------------------------------------------------------------------------- /microdf/tests/test_percentile_actual.csv: -------------------------------------------------------------------------------- 1 | val_percentile,val,w 2 | 1,0.04936696707980226,5.0030480466019815 3 | 2,0.17830779834685495,5.114684559704431 4 | 3,0.2857988855674821,5.003202737366776 5 | 4,0.33805302460864795,4.733066370107501 6 | 5,0.5703591960673162,5.239285359053462 7 | 6,0.6096331244255117,5.429420284539545 8 | 7,0.5985039643349068,4.348564066721563 9 | 8,0.7510884464659845,5.440450569600465 10 | 9,1.0876105767407074,5.499343558417599 11 | 10,0.8899045403172029,5.1538348751709435 12 | 11,1.394683443411786,5.23502927227519 13 | 12,0.9305982112361821,4.417722946149524 14 | 13,0.9650569353812546,5.348799899633306 15 | 14,1.3214822270450117,5.3325704209630524 16 | 15,1.2715100776802908,5.061289605022857 17 | 16,1.6720424048893725,5.305386843295495 18 | 17,1.9424737512447825,5.255981017771979 19 | 18,1.0348899262653468,4.336843836714941 20 | 19,1.7975116284606523,5.1773144696335915 21 | 20,2.1006171498258643,5.589603555523628 22 | 21,1.8709513510005413,5.335531722226071 23 | 22,1.7207120623536305,4.695751363757509 24 | 23,2.00187368264572,5.054306807303892 25 | 24,2.313857097420784,4.884140680676319 26 | 25,2.896700637851371,5.5236380227234845 27 | 26,2.489820947315488,4.791953107597212 28 | 27,2.8411561635459712,5.5661465432344635 29 | 28,2.4261357418555654,4.446073644558097 30 | 29,3.357591023039818,5.291769801304535 31 | 30,2.314506073869091,5.149625279108836 32 | 31,3.551962771845865,5.402587912747316 33 | 32,4.002233897939344,5.093801654006394 34 | 33,3.527022677246467,5.10624451980372 35 | 34,2.654138701549258,5.043397568469286 36 | 35,2.7068858476732176,4.636131024815089 37 | 36,4.511798986257631,5.713008105710415 38 | 37,3.583675870814159,4.894200392814213 39 | 38,3.669836624431939,5.033490962922036 40 | 39,3.363028108896736,5.715252956264232 41 | 40,2.269128705743366,4.687099540814201 42 | 41,3.0962247363582,5.0205442523389685 43 | 42,5.586938042279173,5.2375613191718084 44 | 43,4.062902694908744,5.482945818496652 45 | 44,3.3160969156123503,4.938943405325701 46 | 45,4.241531465516574,5.242988142385608 47 | 46,5.632990791952207,4.604524871392653 48 | 47,4.003109598155357,5.449983106760843 49 | 48,3.627194217198549,4.5252648036434655 50 | 49,4.574489241426403,5.827965751590956 51 | 50,3.26114249800974,5.07472388942889 52 | 51,5.770693937000146,4.924561771632174 53 | 52,6.928803378451052,5.390871201576556 54 | 53,5.605614198095166,4.867506840251817 55 | 54,5.712794397822728,4.802840821886715 56 | 55,6.330048423625907,5.043215026134606 57 | 56,6.49917632389808,5.549363062730124 58 | 57,4.455249905003633,5.388617265562158 59 | 58,3.9747504049202647,4.804030090560459 60 | 59,4.591734071798827,4.79723029172527 61 | 60,9.365137265130915,5.674437752031215 62 | 61,7.18714959246965,4.968027795024001 63 | 62,5.514056987390571,5.21537151235826 64 | 63,6.199762618472243,5.007944951253934 65 | 64,4.396249378665468,5.013683226930631 66 | 65,7.678394207429429,5.4324805777516385 67 | 66,8.535490990338737,4.983177889646899 68 | 67,7.380492997015747,5.160276660633111 69 | 68,7.495870406959786,5.166303035743688 70 | 69,6.23459148472051,4.800326067433167 71 | 70,4.8795947527845165,5.144612976891027 72 | 
71,6.331517082169763,4.923377667981699 73 | 72,10.028408222423991,4.7524585511425315 74 | 73,7.267010600317143,5.928033173228412 75 | 74,5.856718614341236,5.153535683492981 76 | 75,6.6513086587256645,4.4540875011243095 77 | 76,6.009374270984309,5.563696325737205 78 | 77,6.903278042059779,4.9610776312587195 79 | 78,9.33170057141403,5.442361429739662 80 | 79,4.741840898628048,4.385127095815581 81 | 80,10.441998036333615,5.681838547736346 82 | 81,8.15549502111278,5.3122709245136726 83 | 82,8.264309038308452,4.777889650212625 84 | 83,5.887391058178539,4.86549347601554 85 | 84,7.6746735994204345,5.1933876664286505 86 | 85,8.617091500906668,5.330823383917326 87 | 86,5.205242021670235,5.116263366906492 88 | 87,6.110525427774986,5.0877466081133615 89 | 88,10.555886286183668,4.846433766268649 90 | 89,9.79517076482145,5.551267106013298 91 | 90,8.999880920529858,4.8964913320490835 92 | 91,7.269834366073169,5.094073872052731 93 | 92,8.288552906946475,5.482297252107353 94 | 93,9.264743724824317,4.4854074855836235 95 | 94,8.42071960033443,5.3698127342621085 96 | 95,10.393061379249968,5.397446566018766 97 | 96,9.545057271504412,5.083199029497632 98 | 97,9.61390230771744,5.147118076577214 99 | 98,8.719700395822183,4.537389380232294 100 | 99,14.670850665733171,5.177534031309767 101 | 100,12.905505941170981,5.722129244713919 102 | -------------------------------------------------------------------------------- /microdf/tests/test_percentile_expected.csv: -------------------------------------------------------------------------------- 1 | val_percentile,val,w 2 | 1,0.04936696707980226,5.0030480466019815 3 | 2,0.17830779834685495,5.114684559704431 4 | 3,0.2857988855674821,5.003202737366776 5 | 4,0.33805302460864795,4.733066370107501 6 | 5,0.5703591960673162,5.239285359053462 7 | 6,0.6096331244255117,5.429420284539545 8 | 7,0.5985039643349068,4.348564066721563 9 | 8,0.7510884464659845,5.440450569600465 10 | 9,1.0876105767407074,5.499343558417599 11 | 10,0.8899045403172029,5.1538348751709435 12 | 11,1.394683443411786,5.23502927227519 13 | 12,0.9305982112361821,4.417722946149524 14 | 13,0.9650569353812546,5.348799899633306 15 | 14,1.3214822270450117,5.3325704209630524 16 | 15,1.2715100776802908,5.061289605022857 17 | 16,1.6720424048893725,5.305386843295495 18 | 17,1.9424737512447825,5.255981017771979 19 | 18,1.0348899262653468,4.336843836714941 20 | 19,1.7975116284606523,5.1773144696335915 21 | 20,2.1006171498258643,5.589603555523628 22 | 21,1.8709513510005413,5.335531722226071 23 | 22,1.7207120623536305,4.695751363757509 24 | 23,2.00187368264572,5.054306807303892 25 | 24,2.313857097420784,4.884140680676319 26 | 25,2.896700637851371,5.5236380227234845 27 | 26,2.489820947315488,4.791953107597212 28 | 27,2.8411561635459712,5.5661465432344635 29 | 28,2.4261357418555654,4.446073644558097 30 | 29,3.357591023039818,5.291769801304535 31 | 30,2.314506073869091,5.149625279108836 32 | 31,3.551962771845865,5.402587912747316 33 | 32,4.002233897939344,5.093801654006394 34 | 33,3.527022677246467,5.10624451980372 35 | 34,2.654138701549258,5.043397568469286 36 | 35,2.7068858476732176,4.636131024815089 37 | 36,4.511798986257631,5.713008105710415 38 | 37,3.583675870814159,4.894200392814213 39 | 38,3.669836624431939,5.033490962922036 40 | 39,3.363028108896736,5.715252956264232 41 | 40,2.269128705743366,4.687099540814201 42 | 41,3.0962247363582,5.0205442523389685 43 | 42,5.586938042279173,5.2375613191718084 44 | 43,4.062902694908744,5.482945818496652 45 | 44,3.3160969156123503,4.938943405325701 46 | 
45,4.241531465516574,5.242988142385608 47 | 46,5.632990791952207,4.604524871392653 48 | 47,4.003109598155357,5.449983106760843 49 | 48,3.627194217198549,4.5252648036434655 50 | 49,4.574489241426403,5.827965751590956 51 | 50,3.26114249800974,5.07472388942889 52 | 51,5.770693937000146,4.924561771632174 53 | 52,6.928803378451052,5.390871201576556 54 | 53,5.605614198095166,4.867506840251817 55 | 54,5.712794397822728,4.802840821886715 56 | 55,6.330048423625907,5.043215026134606 57 | 56,6.49917632389808,5.549363062730124 58 | 57,4.455249905003633,5.388617265562158 59 | 58,3.9747504049202647,4.804030090560459 60 | 59,4.591734071798827,4.79723029172527 61 | 60,9.365137265130915,5.674437752031215 62 | 61,7.18714959246965,4.968027795024001 63 | 62,5.514056987390571,5.21537151235826 64 | 63,6.199762618472243,5.007944951253934 65 | 64,4.396249378665468,5.013683226930631 66 | 65,7.678394207429429,5.4324805777516385 67 | 66,8.535490990338737,4.983177889646899 68 | 67,7.380492997015747,5.160276660633111 69 | 68,7.495870406959786,5.166303035743688 70 | 69,6.23459148472051,4.800326067433167 71 | 70,4.8795947527845165,5.144612976891027 72 | 71,6.331517082169763,4.923377667981699 73 | 72,10.028408222423991,4.7524585511425315 74 | 73,7.267010600317143,5.928033173228412 75 | 74,5.856718614341236,5.153535683492981 76 | 75,6.6513086587256645,4.4540875011243095 77 | 76,6.009374270984309,5.563696325737205 78 | 77,6.903278042059779,4.9610776312587195 79 | 78,9.33170057141403,5.442361429739662 80 | 79,4.741840898628048,4.385127095815581 81 | 80,10.441998036333615,5.681838547736346 82 | 81,8.15549502111278,5.3122709245136726 83 | 82,8.264309038308452,4.777889650212625 84 | 83,5.887391058178539,4.86549347601554 85 | 84,7.6746735994204345,5.1933876664286505 86 | 85,8.617091500906668,5.330823383917326 87 | 86,5.205242021670235,5.116263366906492 88 | 87,6.110525427774986,5.0877466081133615 89 | 88,10.555886286183668,4.846433766268649 90 | 89,9.79517076482145,5.551267106013298 91 | 90,8.999880920529858,4.8964913320490835 92 | 91,7.269834366073169,5.094073872052731 93 | 92,8.288552906946475,5.482297252107353 94 | 93,9.264743724824317,4.4854074855836235 95 | 94,8.42071960033443,5.3698127342621085 96 | 95,10.393061379249968,5.397446566018766 97 | 96,9.545057271504412,5.083199029497632 98 | 97,9.61390230771744,5.147118076577214 99 | 98,8.719700395822183,4.537389380232294 100 | 99,14.670850665733171,5.177534031309767 101 | 100,12.905505941170981,5.722129244713919 102 | -------------------------------------------------------------------------------- /microdf/tests/test_poverty.py: -------------------------------------------------------------------------------- 1 | import microdf as mdf 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | df = pd.DataFrame( 7 | { 8 | "income": [-10, 0, 10, 20], 9 | "threshold": [15, 10, 15, 10], 10 | "weight": [1, 2, 3, 4], 11 | } 12 | ) 13 | md = mdf.MicroDataFrame(df[["income", "threshold"]], weights=df.weight) 14 | 15 | 16 | def test_poverty_rate(): 17 | # Unweighted 18 | assert np.allclose(mdf.poverty_rate(df, "income", "threshold"), 3 / 4) 19 | # Weighted 20 | assert np.allclose( 21 | mdf.poverty_rate(df, "income", "threshold", "weight"), 6 / 10 22 | ) 23 | assert np.allclose(md.poverty_rate("income", "threshold"), 6 / 10) 24 | 25 | 26 | def test_deep_poverty_rate(): 27 | # Unweighted 28 | assert np.allclose(mdf.deep_poverty_rate(df, "income", "threshold"), 2 / 4) 29 | # Weighted 30 | assert np.allclose( 31 | mdf.deep_poverty_rate(df, "income", "threshold", "weight"), 3 / 10 32 | ) 33 
| assert np.allclose(md.deep_poverty_rate("income", "threshold"), 3 / 10)
34 | 
35 | 
36 | def test_poverty_gap():
37 | # Unweighted
38 | assert np.allclose(mdf.poverty_gap(df, "income", "threshold"), 25 + 10 + 5)
39 | # Weighted
40 | RES = 25 * 1 + 10 * 2 + 5 * 3
41 | assert np.allclose(
42 | mdf.poverty_gap(df, "income", "threshold", "weight"), RES
43 | )
44 | assert np.allclose(md.poverty_gap("income", "threshold"), RES)
45 | 
46 | 
47 | def test_squared_poverty_gap():
48 | # Unweighted
49 | assert np.allclose(
50 | mdf.squared_poverty_gap(df, "income", "threshold"),
51 | 25 ** 2 + 10 ** 2 + 5 ** 2,
52 | )
53 | # Weighted
54 | RES = 1 * (25 ** 2) + 2 * (10 ** 2) + 3 * (5 ** 2)
55 | assert np.allclose(
56 | mdf.squared_poverty_gap(df, "income", "threshold", "weight"), RES,
57 | )
58 | assert np.allclose(md.squared_poverty_gap("income", "threshold"), RES)
59 | 
60 | 
61 | def test_deep_poverty_gap():
62 | # Unweighted
63 | assert np.allclose(
64 | mdf.deep_poverty_gap(df, "income", "threshold"), 17.5 + 5 + 0 + 0
65 | )
66 | # Weighted
67 | RES = 17.5 * 1 + 5 * 2 + 0 * 3 + 0 * 4
68 | assert np.allclose(
69 | mdf.deep_poverty_gap(df, "income", "threshold", "weight"), RES
70 | )
71 | # Same in MicroDataFrame.
72 | assert np.allclose(md.deep_poverty_gap("income", "threshold"), RES)
73 | 
--------------------------------------------------------------------------------
/microdf/tests/test_quantile_chg.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | import microdf as mdf
4 | 
5 | 
6 | V1 = [1, 2, 3]
7 | V2 = [4, 5, 6]
8 | W1 = [7, 8, 9]
9 | W2 = [10, 11, 12]
10 | DF1 = pd.DataFrame({"v": V1, "w": W1})
11 | DF2 = pd.DataFrame({"v": V2, "w": W2})
12 | 
13 | 
14 | def test_quantile_chg():
15 | mdf.quantile_chg(DF1, DF2, "v", "w", "v", "w")
16 | 
17 | 
18 | def test_quantile_pct_chg_plot():
19 | mdf.quantile_pct_chg_plot(DF1, DF2, "v", "w", "v", "w")
20 | 
--------------------------------------------------------------------------------
/microdf/tests/test_tax.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | 
5 | import microdf as mdf
6 | 
7 | 
8 | def test_tax():
9 | """ """
10 | # Consider a MTR schedule of 0% up to 10,000, then 10% after that.
11 | BRACKETS = [0, 10e3]
12 | RATES = [0, 0.1]
13 | INCOME = [0, 5e3, 10e3, 10e3 + 1, 20e3]
14 | EXPECTED = [0, 0, 0, 0.1, 1e3]
15 | res = mdf.tax_from_mtrs(INCOME, BRACKETS, RATES)
16 | pd.testing.assert_series_equal(res, pd.Series(EXPECTED))
17 | # Try with 10% avoidance.
18 | EXPECTED_10PCT_AVOIDANCE = [0, 0, 0, 0, 800.0]
19 | res_10pct_avoidance = mdf.tax_from_mtrs(INCOME, BRACKETS, RATES, 0.1)
20 | pd.testing.assert_series_equal(
21 | res_10pct_avoidance, pd.Series(EXPECTED_10PCT_AVOIDANCE)
22 | )
23 | # Try with avoidance elasticity of 2.
24 | EXPECTED_E2_AVOIDANCE = [
25 | 0,
26 | 0,
27 | 0,
28 | 0, # Taxable base becomes (10e3 + 1) * exp(-2 * 0.1), below 10e3.
29 | # Taxable base becomes 20e3 * (exp(-2 * 0.1)).
30 | 0.1 * (20e3 * np.exp(-0.2) - 10e3),
31 | ]
32 | res_e2_avoidance = mdf.tax_from_mtrs(
33 | INCOME, BRACKETS, RATES, avoidance_elasticity=2
34 | )
35 | pd.testing.assert_series_equal(
36 | res_e2_avoidance, pd.Series(EXPECTED_E2_AVOIDANCE)
37 | )
38 | # Try with flat avoidance elasticity of 2.
39 | EXPECTED_E2_AVOIDANCE_FLAT = [
40 | 0,
41 | 0,
42 | 0,
43 | 0, # Taxable base becomes (10e3 + 1) * (1 - 2 * 0.1)
44 | 600.0,
45 | ] # Taxable base becomes 20e3 * (1 - 2 * 0.1) = 16e3.
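# (Tax on the 16e3 base: (16e3 - 10e3) * 0.1 = 600, matching above.)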
46 | res_e2_avoidance_flat = mdf.tax_from_mtrs(
47 | INCOME, BRACKETS, RATES, avoidance_elasticity_flat=2
48 | )
49 | pd.testing.assert_series_equal(
50 | res_e2_avoidance_flat, pd.Series(EXPECTED_E2_AVOIDANCE_FLAT)
51 | )
52 | # Ensure error when passing both rate and elasticity.
53 | with pytest.raises(Exception):
54 | mdf.tax_from_mtrs(INCOME, BRACKETS, RATES, 0.1, 2)
55 | 
--------------------------------------------------------------------------------
/microdf/tests/test_taxcalc.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | import microdf as mdf
4 | 
5 | 
6 | try:
7 | import taxcalc as tc
8 | 
9 | _HAVE_TAXCALC = True
10 | except ImportError:
11 | _HAVE_TAXCALC = False
12 | 
13 | 
14 | def test_calc_df():
15 | """ """
16 | if not _HAVE_TAXCALC:
17 | pytest.skip("taxcalc is not installed")
18 | mdf.calc_df()
19 | 
20 | 
21 | def test_static_baseline_calc():
22 | """ """
23 | if not _HAVE_TAXCALC:
24 | pytest.skip("taxcalc is not installed")
25 | recs = tc.Records.cps_constructor()
26 | mdf.static_baseline_calc(recs, 2020)
27 | 
--------------------------------------------------------------------------------
/microdf/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | import microdf as mdf
4 | 
5 | 
6 | def test_cartesian_product():
7 | """ """
8 | res = mdf.cartesian_product(
9 | {"a": [1, 2, 3], "b": ["val1", "val2"], "c": [100, 101]}
10 | )
11 | EXPECTED = pd.DataFrame(
12 | {
13 | "a": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
14 | "b": [
15 | "val1",
16 | "val1",
17 | "val2",
18 | "val2",
19 | "val1",
20 | "val1",
21 | "val2",
22 | "val2",
23 | "val1",
24 | "val1",
25 | "val2",
26 | "val2",
27 | ],
28 | "c": [100, 101, 100, 101, 100, 101, 100, 101, 100, 101, 100, 101],
29 | }
30 | )
31 | pd.testing.assert_frame_equal(res, EXPECTED)
32 | 
33 | 
34 | def test_flatten():
35 | """ """
36 | L = [[[1, 2, 3], [4, 5]], 6]
37 | res = list(mdf.flatten(L))
38 | EXPECTED = [1, 2, 3, 4, 5, 6]
39 | assert res == EXPECTED
40 | 
--------------------------------------------------------------------------------
/microdf/tests/test_weighted.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 | 
4 | import microdf as mdf
5 | 
6 | 
7 | X = [1, 5, 2]
8 | Y = [0, -6, 3]
9 | W = [4, 1, 1]
10 | df = pd.DataFrame({"x": X, "y": Y, "w": W})
11 | ms = mdf.MicroSeries(X, weights=W)
12 | md = mdf.MicroDataFrame(df[["x", "y"]], weights=W)
13 | # Also make a version with groups.
14 | df2 = df.copy(deep=True)
15 | df2.x *= 2
16 | df2.y *= 1.5
17 | dfg = pd.concat([df, df2])
18 | dfg["g"] = ["a"] * 3 + ["b"] * 3
19 | mdg = mdf.MicroDataFrame(dfg[["x", "y", "g"]], weights=W)
20 | 
21 | 
22 | def test_weighted_quantile():
23 | Q = [0, 0.5, 1]
24 | mdf.weighted_quantile(df, "x", "w", Q).tolist()
25 | 
26 | 
27 | def test_weighted_median():
28 | assert mdf.weighted_median(df, "x") == 2
29 | mdf.weighted_median(df, "x", "w")
30 | # Test with groups.
31 | mdf.weighted_median(dfg, "x", "w", "g")
32 | 
33 | 
34 | def test_weighted_mean():
35 | # Test unweighted.
36 | assert mdf.weighted_mean(df, "x") == 8 / 3
37 | # Test weighted.
38 | assert mdf.weighted_mean(df, "x", "w") == 11 / 6
39 | # Test weighted with multiple columns.
40 | assert mdf.weighted_mean(df, ["x", "y"], "w").tolist() == [11 / 6, -3 / 6]
41 | # Test grouped.
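# (No asserts here; group "a" keeps the original values, so its
# weighted mean of x should be 11/6, while group "b" doubles x,
# giving 22/6. These calls just verify the grouped path runs.)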
42 | mdf.weighted_mean(dfg, "x", "w", "g")
43 | mdf.weighted_mean(dfg, ["x", "y"], "w", "g")
44 | 
45 | 
46 | def test_weighted_sum():
47 | # Test unweighted.
48 | assert mdf.weighted_sum(df, "x") == 8
49 | # Test weighted.
50 | assert mdf.weighted_sum(df, "x", "w") == 11
51 | # Test weighted with multiple columns.
52 | assert mdf.weighted_sum(df, ["x", "y"], "w").tolist() == [11, -3]
53 | # Test grouped.
54 | mdf.weighted_sum(dfg, "x", "w", "g")
55 | mdf.weighted_sum(dfg, ["x", "y"], "w", "g")
56 | 
57 | 
58 | def test_gini():
59 | # Test nothing breaks.
60 | ms.gini()
61 | # Unweighted.
62 | mdf.gini(df, "x")
63 | # Weighted
64 | mdf.gini(df, "x", "w")
65 | # Unweighted, grouped
66 | mdf.gini(dfg, "x", groupby="g")
67 | # Weighted, grouped
68 | mdf.gini(dfg, "x", "w", groupby="g")
69 | # Test old and new match.
70 | assert ms.gini() == mdf.gini(df, "x", "w")
71 | 
72 | 
73 | def test_add_weighted_quantiles():
74 | with pytest.deprecated_call():
75 | mdf.add_weighted_quantiles(df, "x", "w")
76 | 
--------------------------------------------------------------------------------
/microdf/ubi.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | import microdf as mdf
4 | 
5 | 
6 | def ubi_or_bens(
7 | df,
8 | ben_cols,
9 | max_ubi="max_ubi",
10 | ubi="ubi",
11 | bens="bens",
12 | update_income_measures=None,
13 | ):
14 | """Calculates whether a tax unit will take UBI or benefits,
15 | and adjusts values accordingly.
16 | 
17 | :param df: DataFrame.
18 | :param ben_cols: List of columns for benefits.
19 | :param max_ubi: Column name of the maximum UBI, before accounting
20 | for benefits. Defaults to 'max_ubi'.
21 | :param ubi: Column name to add representing the UBI. Defaults to 'ubi'.
22 | :param bens: Column name to add representing total benefits (after
23 | adjustment). Defaults to 'bens'.
24 | :param update_income_measures: List of income measures to update.
25 | Defaults to ['expanded_income', 'aftertax_income'].
26 | :returns: Nothing. Benefits in ben_cols are adjusted, ubi and bens columns
27 | are added, and expanded_income and aftertax_income are updated
28 | according to the net difference.
29 | 
30 | """
31 | if update_income_measures is None:
32 | update_income_measures = ["expanded_income", "aftertax_income"]
33 | # Prep list args.
34 | update_income_measures = mdf.listify(update_income_measures)
35 | total_bens = df[ben_cols].sum(axis=1)
36 | take_ubi = df[max_ubi] > total_bens
37 | df[ubi] = np.where(take_ubi, df[max_ubi], 0)
38 | for ben in ben_cols:
39 | df[ben] *= np.where(take_ubi, 0, 1)
40 | df[bens] = df[ben_cols].sum(axis=1)
41 | # Update expanded and aftertax income.
42 | diff = df[ubi] + df[bens] - total_bens
43 | for i in update_income_measures:
44 | df[i] += diff
45 | 
--------------------------------------------------------------------------------
/microdf/utils.py:
--------------------------------------------------------------------------------
1 | import collections
2 | 
3 | import pandas as pd
4 | 
5 | 
6 | def ordinal_label(n):
7 | """Creates ordinal label from number.
8 | 
9 | Adapted from https://stackoverflow.com/a/20007730/1840471.
10 | 
11 | :param n: Number.
12 | :returns: Ordinal label, e.g., 1st, 3rd, 24th, etc.
13 | 
14 | """
15 | n = int(n)
16 | ix = (n // 10 % 10 != 1) * (n % 10 < 4) * n % 10 # // so 11-13 map to "th".
17 | return "%d%s" % (n, "tsnrhtdd"[ix::4])
18 | 
19 | 
20 | def dedup_list(lst):
21 | """Remove duplicate items from a list.
22 | 
23 | :param lst: List.
24 | :returns: List with duplicate items removed from lst.
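Note that list(set(...)) does not preserve the original order.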
25 | 26 | """ 27 | return list(set(lst)) 28 | 29 | 30 | def listify(x, dedup=True): 31 | """Return x as a list, if it isn't one already. 32 | 33 | :param x: A single item or a list 34 | :param dedup: Default value = True) 35 | :returns: x if x is a list, otherwise [x]. Also flattens the list 36 | and removes Nones. 37 | 38 | """ 39 | if not isinstance(x, list): 40 | x = [x] 41 | res = flatten(x) 42 | res = [x for x in res if x is not None] 43 | if dedup: 44 | return dedup_list(res) 45 | return res 46 | 47 | 48 | def flatten(lst): 49 | """Flatten list. From https://stackoverflow.com/a/2158532/1840471. 50 | 51 | :param lst: List. 52 | :returns: Flattened version. 53 | 54 | """ 55 | for el in lst: 56 | if isinstance(el, collections.abc.Iterable) and not isinstance( 57 | el, (str, bytes) 58 | ): 59 | yield from flatten(el) 60 | else: 61 | yield el 62 | 63 | 64 | def cartesian_product(d): 65 | """Produces a DataFrame as a Cartesian product of dictionary 66 | keys and values. 67 | 68 | :param d: Dictionary where each item's key corresponds to a column 69 | name, and each value is a list of values. 70 | :returns: DataFrame with a Cartesian product of each dictionary item. 71 | 72 | """ 73 | index = pd.MultiIndex.from_product(d.values(), names=d.keys()) 74 | return pd.DataFrame(index=index).reset_index() 75 | -------------------------------------------------------------------------------- /microdf/weighted.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import warnings 4 | 5 | import microdf as mdf 6 | 7 | 8 | def weight(df, col, w=None): 9 | """Calculates the weighted value of a column in a DataFrame. 10 | 11 | :param df: A pandas DataFrame. 12 | :param col: A string indicating the column in the DataFrame to weight. 13 | Can also be a list of column strings. 14 | :param w: Weight column. 15 | :returns: A pandas Series multiplying the column by its weight. 16 | 17 | """ 18 | if w is None: 19 | return df[col] 20 | return df[col].multiply(df[w], axis="index") 21 | 22 | 23 | def weighted_sum(df, col, w=None, groupby=None): 24 | """Calculates the weighted sum of a column in a DataFrame. 25 | 26 | :param df: A pandas DataFrame. 27 | :param col: A string indicating the column in the DataFrame. 28 | Can also be a list of column strings. 29 | :param w: Weight column. 30 | :param groupby: Groupby column. 31 | :returns: The weighted sum of a DataFrame's column. 32 | 33 | """ 34 | 35 | def _weighted_sum(df, col, w): 36 | """ For weighted sum with provided weight. """ 37 | return weight(df, col, w).sum() 38 | 39 | if groupby is None: 40 | if w is None: 41 | return df[col].sum() 42 | return _weighted_sum(df, col, w) 43 | # If grouping. 44 | if w is None: 45 | return df.groupby(groupby)[col].sum() 46 | return df.groupby(groupby).apply(lambda x: _weighted_sum(x, col, w)) 47 | 48 | 49 | def weighted_mean(df, col, w=None, groupby=None): 50 | """Calculates the weighted mean of a column in a DataFrame. 51 | 52 | :param df: A pandas DataFrame. 53 | :param col: A string indicating the column in the DataFrame. 54 | Can also be a list of column strings. 55 | :param w: Weight column. 56 | :param groupby: Groupby column. 57 | :returns: The weighted mean of a DataFrame's column. 58 | 59 | """ 60 | 61 | def _weighted_mean(df, col, w=None): 62 | """ For weighted mean with provided weight. 
""" 63 | return weighted_sum(df, col, w) / df[w].sum() 64 | 65 | if groupby is None: 66 | if w is None: 67 | return df[col].mean() 68 | return _weighted_mean(df, col, w) 69 | # Group. 70 | if w is None: 71 | return df.groupby(groupby)[col].mean() 72 | return df.groupby(groupby).apply(lambda x: _weighted_mean(x, col, w)) 73 | 74 | 75 | def weighted_quantile(df: pd.DataFrame, col: str, w: str, quantiles: np.array): 76 | """Calculates weighted quantiles of a set of values. 77 | 78 | Doesn't exactly match unweighted quantiles of stacked values. 79 | See stackoverflow.com/q/21844024#comment102342137_29677616. 80 | 81 | :param df: DataFrame to calculate weighted quantiles from. 82 | :type df: pd.DataFrame 83 | :param col: Name of numeric column in df to calculate weighted quantiles 84 | from. 85 | :type col: str 86 | :param w: Name of weight column in df. 87 | :type w: str 88 | :param quantiles: Array of quantiles to calculate. 89 | :type quantiles: np.array 90 | :return: Array of weighted quantiles. 91 | :rtype: np.array 92 | """ 93 | values = np.array(df[col]) 94 | quantiles = np.array(quantiles) 95 | if w is None: 96 | sample_weight = np.ones(len(values)) 97 | else: 98 | sample_weight = np.array(df[w]) 99 | assert np.all(quantiles >= 0) and np.all( 100 | quantiles <= 1 101 | ), "quantiles should be in [0, 1]" 102 | sorter = np.argsort(values) 103 | values = values[sorter] 104 | sample_weight = sample_weight[sorter] 105 | weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight 106 | weighted_quantiles /= np.sum(sample_weight) 107 | return np.interp(quantiles, weighted_quantiles, values) 108 | 109 | 110 | def weighted_median(df, col, w=None, groupby=None): 111 | """Calculates the weighted median of a column in a DataFrame. 112 | 113 | :param df: A pandas DataFrame containing Tax-Calculator data. 114 | :param col: A string indicating the column in the DataFrame. 115 | :param w: Weight column. 116 | :returns: The weighted median of a DataFrame's column. 117 | 118 | """ 119 | 120 | def _weighted_median(df, col, w): 121 | """ For weighted median with provided weight. """ 122 | return weighted_quantile(df, col, w, 0.5) 123 | 124 | if groupby is None: 125 | if w is None: 126 | return df[col].median() 127 | return _weighted_median(df, col, w) 128 | # Group. 129 | if w is None: 130 | return df.groupby(groupby)[col].median() 131 | return df.groupby(groupby).apply(lambda x: _weighted_median(x, col, w)) 132 | 133 | 134 | def add_weighted_quantiles(df, col, w): 135 | """Adds weighted quantiles of a column to a DataFrame. 136 | This will be deprecated in the next minor release. Please use 137 | MicroSeries.rank instead. 138 | 139 | Adds columns for each of these types of quantiles to a DataFrame: 140 | * *_percentile_exact: Exact percentile. 141 | * *_percentile: Integer percentile (ceiling). 142 | * *_2percentile: Integer percentile (ceiling, for each two percentiles). 143 | * *_ventile: Integer percentile (ceiling, for each five percentiles). 144 | * *_decile: Integer decile. 145 | * *_quintile: Integer quintile. 146 | * *_quartile: Integer quartile. 147 | 148 | Negative values are assigned -1. 149 | 150 | :param df: A pandas DataFrame. 151 | :param col: A string indicating the column in the DataFrame to calculate. 152 | :param w: Weight column. 153 | :returns: Nothing. Columns are added in place. Also sorts df by col. 154 | """ 155 | warnings.warn( 156 | "This will be deprecated in the next minor release. 
" 157 | "Please use MicroSeries.rank instead.", 158 | DeprecationWarning, 159 | ) 160 | df.sort_values(by=col, inplace=True) 161 | col_pctile = col + "_percentile_exact" 162 | df[col_pctile] = 100 * df[w].cumsum() / df[w].sum() 163 | # "Null out" negatives using -1, since integer arrays can't be NaN. 164 | df[col_pctile] = np.where(df[col] >= 0, df[col_pctile], 0) 165 | # Reduce top record, otherwise it's incorrectly rounded up. 166 | df[col_pctile] = np.where( 167 | df[col_pctile] >= 99.99999, 99.99999, df[col_pctile] 168 | ) 169 | df[col + "_percentile"] = np.ceil(df[col_pctile]).astype(int) 170 | df[col + "_2percentile"] = 2 * np.ceil(df[col_pctile] / 2).astype(int) 171 | df[col + "_ventile"] = 5 * np.ceil(df[col_pctile] / 5).astype(int) 172 | df[col + "_decile"] = np.ceil(df[col_pctile] / 10).astype(int) 173 | df[col + "_quintile"] = np.ceil(df[col_pctile] / 20).astype(int) 174 | df[col + "_quartile"] = np.ceil(df[col_pctile] / 25).astype(int) 175 | 176 | 177 | def quantile_chg(df1, df2, col1, col2, w1=None, w2=None, q=None): 178 | """Create table with two sets of quantiles. 179 | 180 | :param df1: DataFrame with first set of values. 181 | :param df2: DataFrame with second set of values. 182 | :param col1: Name of columns with values in df1. 183 | :param col2: Name of columns with values in df2. 184 | :param w1: Name of weight column in df1. 185 | :param w2: Name of weight column in df2. 186 | :param q: Quantiles. Defaults to decile boundaries. 187 | :returns: DataFrame with two rows and a column for each quantile. 188 | Column labels are "xth percentile" and a label is added 189 | to the median. 190 | 191 | """ 192 | if q is None: 193 | q = np.arange(0.1, 1, 0.1) 194 | q1 = weighted_quantile(df1, col1, w1, q) 195 | q2 = weighted_quantile(df2, col2, w2, q) 196 | qdf = pd.DataFrame([q1, q2]) 197 | # Set decile labels. 198 | q_print = [mdf.ordinal_label((i * 100)) for i in q] 199 | try: # List index throws an error if the value is not found. 200 | median_index = q.tolist().index(0.5) 201 | q_print[median_index] += " (median)" 202 | except ValueError: 203 | pass # Don't assign median to any label. 204 | qdf.columns = q_print 205 | return qdf 206 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="microdf-python", 5 | version="0.4.3", 6 | description="Survey microdata as DataFrames.", 7 | url="http://github.com/PSLmodels/microdf", 8 | author="Max Ghenis", 9 | author_email="max@ubicenter.org", 10 | license="MIT", 11 | packages=["microdf"], 12 | install_requires=[ 13 | "numpy", 14 | "pandas", 15 | ], 16 | extras_require={ 17 | "taxcalc": ["taxcalc"], 18 | "charts": [ 19 | "seaborn", 20 | "matplotlib", 21 | "matplotlib-label-lines" 22 | ] 23 | }, 24 | zip_safe=False, 25 | ) 26 | --------------------------------------------------------------------------------