├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── examples ├── data │ ├── acs-2015-pums-wy-simple.csv │ ├── acs-2015-pums-wy.csv │ ├── gss-extract.csv │ ├── ipsos-ssm-and-abortion-survey.csv │ └── simple.csv ├── notebooks │ └── example-usage.ipynb └── scripts │ └── process-acs.py ├── setup.py ├── test └── core_tests.py ├── tox.ini └── weightedcalcs ├── __init__.py ├── __version__.py └── core.py /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | #### joe made this: http://goel.io/joe 3 | 4 | #####=== Python ===##### 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.4" 5 | - "3.5" 6 | - "3.6" 7 | install: 8 | - pip install . 
9 | - pip install nose 10 | - pip install coveralls 11 | script: nosetests --with-coverage --cover-package weightedcalcs 12 | after_success: coveralls 13 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/). 6 | 7 | ## [0.1.3] — 2024-11-10 8 | ### Fixed 9 | - Fix deprecation of `pandas.np`, h/t @simon-smart88. ([#9](https://github.com/jsvine/weightedcalcs/issues/9)) 10 | - Fix deprecation of `DataFrameGroupBy.apply operated on the grouping columns`. 11 | 12 | ### Changed 13 | - Change minimum `pandas` version to `2.0`. 14 | 15 | ## [0.1.2] — 2017-06-17 16 | ### Fixed 17 | - Fix incompatibility with pandas 0.20.1 18 | 19 | ## [0.1.1] — 2017-04-08 20 | ### Added 21 | - MANIFEST.in 22 | 23 | ## [0.1.0] — 2017-03-30 24 | ### Added 25 | - Support for Python 2.7 26 | - Support for non-pandas input 27 | - Full test coverage 28 | 29 | ## [0.0.0] — 2016-12-23 30 | 31 | Initial release 32 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016, Jeremy Singer-Vine 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | 
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt *.md *.rst 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Version](https://img.shields.io/pypi/v/weightedcalcs.svg)](https://pypi.python.org/pypi/weightedcalcs) [![Build status](https://travis-ci.org/jsvine/weightedcalcs.png)](https://travis-ci.org/jsvine/weightedcalcs) [![Code coverage](https://img.shields.io/coveralls/jsvine/weightedcalcs.svg)](https://coveralls.io/github/jsvine/weightedcalcs) [![Support Python versions](https://img.shields.io/pypi/pyversions/weightedcalcs.svg)](https://pypi.python.org/pypi/weightedcalcs) 2 | 3 | # weightedcalcs 4 | 5 | `weightedcalcs` is a `pandas`-based Python library for calculating weighted means, medians, standard deviations, and more. 6 | 7 | ## Features 8 | 9 | - Plays well with `pandas`. 10 | - Support for weighted means, medians, quantiles, standard deviations, and distributions. 11 | - Support for grouped calculations, using `DataFrameGroupBy` objects. 12 | - Raises an error when your data contains null-values. 13 | - Full test coverage. 
14 | 15 | ## Installation 16 | 17 | ```sh 18 | pip install weightedcalcs 19 | ``` 20 | 21 | ## Usage 22 | 23 | ### Getting started 24 | 25 | Every weighted calculation in `weightedcalcs` begins with an instance of the `weightedcalcs.Calculator` class. `Calculator` takes one argument: the name of your weighting variable. So if you're analyzing a survey where the weighting variable is called `"resp_weight"`, you'd do this: 26 | 27 | ```python 28 | import weightedcalcs as wc 29 | calc = wc.Calculator("resp_weight") 30 | ``` 31 | 32 | ### Types of calculations 33 | 34 | Currently, `weightedcalcs.Calculator` supports the following calculations: 35 | 36 | - `calc.mean(my_data, value_var)`: The weighted arithmetic average of `value_var`. 37 | - `calc.quantile(my_data, value_var, q)`: The weighted quantile of `value_var`, where `q` is between 0 and 1. 38 | - `calc.median(my_data, value_var)`: The weighted median of `value_var`, equivalent to `.quantile(...)` where `q=0.5`. 39 | - `calc.std(my_data, value_var)`: The weighted standard deviation of `value_var`. 40 | - `calc.distribution(my_data, value_var)`: The weighted proportions of `value_var`, interpreting `value_var` as categories. 41 | - `calc.count(my_data)`: The weighted count of all observations, i.e., the total weight. 42 | - `calc.sum(my_data, value_var)`: The weighted sum of `value_var`. 43 | 44 | The `my_data` parameter above should be one of the following: 45 | 46 | - A `pandas` `DataFrame` object 47 | - A `pandas` `DataFrame.groupby` object 48 | - A plain Python dictionary where the keys are column names and the values are equal-length lists. 
49 | 50 | ### Basic example 51 | 52 | Below is a basic example of using `weightedcalcs` to find what percentage of Wyoming residents are married, divorced, et cetera: 53 | 54 | ```python 55 | import pandas as pd 56 | import weightedcalcs as wc 57 | 58 | # Load the 2015 American Community Survey person-level responses for Wyoming 59 | responses = pd.read_csv("examples/data/acs-2015-pums-wy-simple.csv") 60 | 61 | # `PWGTP` is the weighting variable used in the ACS's person-level data 62 | calc = wc.Calculator("PWGTP") 63 | 64 | # Get the distribution of marriage-status responses 65 | calc.distribution(responses, "marriage_status").round(3).sort_values(ascending=False) 66 | 67 | # -- Output -- 68 | # marriage_status 69 | # Married 0.425 70 | # Never married or under 15 years old 0.421 71 | # Divorced 0.097 72 | # Widowed 0.046 73 | # Separated 0.012 74 | # Name: PWGTP, dtype: float64 75 | ``` 76 | 77 | ### More examples 78 | 79 | [See this notebook to see examples of other calculations, including grouped calculations.](examples/notebooks/example-usage.ipynb) 80 | 81 | [Max Ghenis](https://github.com/MaxGhenis) has created [a version of the example notebook that can be run directly in your browser](https://colab.research.google.com/gist/MaxGhenis/4c96163eacebc1005419c9533a568c7e/weightedcalcs-example-usage-scf.ipynb), via Google Colab. 
82 | 83 | ### Weightedcalcs in the wild 84 | 85 | - "[Procesando los microdatos de la Encuesta Permanente de Hogares](http://blog.jazzido.com/2017/01/09/procesando-microdatos-eph)," by Manuel Aristarán 86 | - [BuzzFeedNews/2017-01-media-platform-and-news-trust-survey](https://github.com/BuzzFeedNews/2017-01-media-platform-and-news-trust-survey/blob/master/notebooks/platform-trust-additional-analysis.ipynb) 87 | - [BuzzFeedNews/2016-12-transgender-rights-survey](https://github.com/BuzzFeedNews/2016-12-transgender-rights-survey/blob/master/notebooks/additional-analysis.ipynb) 88 | 89 | ## Other Python weighted-calculation libraries 90 | 91 | - [`tinybike/weightedstats`](https://github.com/tinybike/weightedstats) 92 | - [`nudomarinero/wquantiles`](https://github.com/nudomarinero/wquantiles/) 93 | 94 | -------------------------------------------------------------------------------- /examples/data/ipsos-ssm-and-abortion-survey.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsvine/weightedcalcs/cbd2818e6f7ad82c29714f842228bfd4f65c008f/examples/data/ipsos-ssm-and-abortion-survey.csv -------------------------------------------------------------------------------- /examples/data/simple.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsvine/weightedcalcs/cbd2818e6f7ad82c29714f842228bfd4f65c008f/examples/data/simple.csv -------------------------------------------------------------------------------- /examples/notebooks/example-usage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example usage for `weightedcalcs`\n", 8 | "\n", 9 | "The example below uses `weightedcalcs` to analyze a slice of the [American Community Survey's 2015 
data](https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html) for Wyoming." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import weightedcalcs as wc\n", 21 | "import pandas as pd" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Load the ACS data into a `pandas.DataFrame`" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "responses = pd.read_csv(\"../data/acs-2015-pums-wy-simple.csv\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 64 | "\n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | "
SERIALNOPWGTPagegendermarriage_statusincome
0199014867MaleNever married or under 15 years old27000.0
1225337193FemaleWidowed0.0
2286128846FemaleDivorced44000.0
345375859MaleDivorced35000.0
4479713070MaleMarried0.0
\n", 124 | "
" 125 | ], 126 | "text/plain": [ 127 | " SERIALNO PWGTP age gender marriage_status income\n", 128 | "0 1990 148 67 Male Never married or under 15 years old 27000.0\n", 129 | "1 2253 371 93 Female Widowed 0.0\n", 130 | "2 2861 288 46 Female Divorced 44000.0\n", 131 | "3 4537 58 59 Male Divorced 35000.0\n", 132 | "4 4797 130 70 Male Married 0.0" 133 | ] 134 | }, 135 | "execution_count": 3, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "responses.head()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "In addition to the full list of responses, let's create a subset including only adult respondents, since we'll be focusing on income later." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 4, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "adults = responses[responses[\"age\"] >= 18]" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 5, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/html": [ 170 | "
\n", 171 | "\n", 184 | "\n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | "
SERIALNOPWGTPagegendermarriage_statusincome
0199014867MaleNever married or under 15 years old27000.0
1225337193FemaleWidowed0.0
2286128846FemaleDivorced44000.0
345375859MaleDivorced35000.0
4479713070MaleMarried0.0
\n", 244 | "
" 245 | ], 246 | "text/plain": [ 247 | " SERIALNO PWGTP age gender marriage_status income\n", 248 | "0 1990 148 67 Male Never married or under 15 years old 27000.0\n", 249 | "1 2253 371 93 Female Widowed 0.0\n", 250 | "2 2861 288 46 Female Divorced 44000.0\n", 251 | "3 4537 58 59 Male Divorced 35000.0\n", 252 | "4 4797 130 70 Male Married 0.0" 253 | ] 254 | }, 255 | "execution_count": 5, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "adults.head()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "## Create an instance of `weightedcalcs.Calculator`\n", 269 | "\n", 270 | "The ACS' `PWGTP` variable is respondents the Census-assigned survey weight. All our weighted calculations will use this variable." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 6, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "calc = wc.Calculator(\"PWGTP\")" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "## Basic weighted calculations" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "### Weighted mean income" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 7, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "30709.0" 307 | ] 308 | }, 309 | "execution_count": 7, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "calc.mean(adults, \"income\").round()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "### Weighted standard deviation of income" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 8, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/plain": [ 333 | "46093.0" 334 | ] 
335 | }, 336 | "execution_count": 8, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | } 340 | ], 341 | "source": [ 342 | "calc.std(adults, \"income\").round()" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "### Weighted median income" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 9, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "18000.0" 361 | ] 362 | }, 363 | "execution_count": 9, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "calc.median(adults, \"income\")" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "### Weighted 75th percentile of income" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 10, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "data": { 386 | "text/plain": [ 387 | "45000.0" 388 | ] 389 | }, 390 | "execution_count": 10, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "calc.quantile(adults, \"income\", 0.75)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "### Weighted distribution of marriage statuses" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "~43% of Wyoming residents are married:" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 11, 416 | "metadata": {}, 417 | "outputs": [ 418 | { 419 | "data": { 420 | "text/plain": [ 421 | "marriage_status\n", 422 | "Married 0.425\n", 423 | "Never married or under 15 years old 0.421\n", 424 | "Divorced 0.097\n", 425 | "Widowed 0.046\n", 426 | "Separated 0.012\n", 427 | "Name: PWGTP, dtype: float64" 428 | ] 429 | }, 430 | "execution_count": 11, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | 
"source": [ 436 | "calc.distribution(responses, \"marriage_status\").round(3).sort_values(ascending=False)" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "~56% of *adult* Wyoming residents are married:" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 12, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "data": { 453 | "text/plain": [ 454 | "marriage_status\n", 455 | "Married 0.557\n", 456 | "Never married or under 15 years old 0.240\n", 457 | "Divorced 0.127\n", 458 | "Widowed 0.060\n", 459 | "Separated 0.016\n", 460 | "Name: PWGTP, dtype: float64" 461 | ] 462 | }, 463 | "execution_count": 12, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "calc.distribution(adults, \"marriage_status\").round(3).sort_values(ascending=False)" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "## Grouped weighted calculations\n", 477 | "\n", 478 | "Below, we perform similar calculations as above, but now take advantage of the fact that `weightedcalcs` can handle `DataFrameGroupBy` objects. In the examples below, we group by the ACS's marriage status categories and gender." 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 13, 484 | "metadata": { 485 | "collapsed": true 486 | }, 487 | "outputs": [], 488 | "source": [ 489 | "grp_marriage_sex = adults.groupby([\"marriage_status\", \"gender\"])" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "For reference, here's how many responses fall into each category:" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 14, 502 | "metadata": { 503 | "scrolled": true 504 | }, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/html": [ 509 | "
\n", 510 | "\n", 523 | "\n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | "
genderFemaleMale
marriage_status
Divorced292279
Married13371337
Never married or under 15 years old382535
Separated2518
Widowed23275
\n", 564 | "
" 565 | ], 566 | "text/plain": [ 567 | "gender Female Male\n", 568 | "marriage_status \n", 569 | "Divorced 292 279\n", 570 | "Married 1337 1337\n", 571 | "Never married or under 15 years old 382 535\n", 572 | "Separated 25 18\n", 573 | "Widowed 232 75" 574 | ] 575 | }, 576 | "execution_count": 14, 577 | "metadata": {}, 578 | "output_type": "execute_result" 579 | } 580 | ], 581 | "source": [ 582 | "grp_marriage_sex.size().unstack()" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": {}, 588 | "source": [ 589 | "### Weighted mean income\n" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 15, 595 | "metadata": {}, 596 | "outputs": [ 597 | { 598 | "data": { 599 | "text/html": [ 600 | "
\n", 601 | "\n", 614 | "\n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | "
genderFemaleMale
marriage_status
Divorced2780338884
Married2259250263
Never married or under 15 years old1562527531
Separated1544318553
Widowed589015421
\n", 655 | "
" 656 | ], 657 | "text/plain": [ 658 | "gender Female Male\n", 659 | "marriage_status \n", 660 | "Divorced 27803 38884\n", 661 | "Married 22592 50263\n", 662 | "Never married or under 15 years old 15625 27531\n", 663 | "Separated 15443 18553\n", 664 | "Widowed 5890 15421" 665 | ] 666 | }, 667 | "execution_count": 15, 668 | "metadata": {}, 669 | "output_type": "execute_result" 670 | } 671 | ], 672 | "source": [ 673 | "calc.mean(grp_marriage_sex, \"income\").round().astype(int)" 674 | ] 675 | }, 676 | { 677 | "cell_type": "markdown", 678 | "metadata": {}, 679 | "source": [ 680 | "### Weighted standard deviation of income" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 16, 686 | "metadata": {}, 687 | "outputs": [ 688 | { 689 | "data": { 690 | "text/html": [ 691 | "
\n", 692 | "\n", 705 | "\n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | "
genderFemaleMale
marriage_status
Divorced40039.040916.0
Married33602.063959.0
Never married or under 15 years old19885.034576.0
Separated14822.025867.0
Widowed17113.055463.0
\n", 746 | "
" 747 | ], 748 | "text/plain": [ 749 | "gender Female Male\n", 750 | "marriage_status \n", 751 | "Divorced 40039.0 40916.0\n", 752 | "Married 33602.0 63959.0\n", 753 | "Never married or under 15 years old 19885.0 34576.0\n", 754 | "Separated 14822.0 25867.0\n", 755 | "Widowed 17113.0 55463.0" 756 | ] 757 | }, 758 | "execution_count": 16, 759 | "metadata": {}, 760 | "output_type": "execute_result" 761 | } 762 | ], 763 | "source": [ 764 | "calc.std(grp_marriage_sex, \"income\").round()" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "### Weighted median income" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 17, 777 | "metadata": {}, 778 | "outputs": [ 779 | { 780 | "data": { 781 | "text/html": [ 782 | "
\n", 783 | "\n", 796 | "\n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | "
genderFemaleMale
marriage_status
Divorced21000.029000.0
Married11000.040200.0
Never married or under 15 years old8300.016000.0
Separated10000.00.0
Widowed0.00.0
\n", 837 | "
" 838 | ], 839 | "text/plain": [ 840 | "gender Female Male\n", 841 | "marriage_status \n", 842 | "Divorced 21000.0 29000.0\n", 843 | "Married 11000.0 40200.0\n", 844 | "Never married or under 15 years old 8300.0 16000.0\n", 845 | "Separated 10000.0 0.0\n", 846 | "Widowed 0.0 0.0" 847 | ] 848 | }, 849 | "execution_count": 17, 850 | "metadata": {}, 851 | "output_type": "execute_result" 852 | } 853 | ], 854 | "source": [ 855 | "calc.median(grp_marriage_sex, \"income\")" 856 | ] 857 | }, 858 | { 859 | "cell_type": "markdown", 860 | "metadata": {}, 861 | "source": [ 862 | "## Weighted 75th percentile of income" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": 18, 868 | "metadata": {}, 869 | "outputs": [ 870 | { 871 | "data": { 872 | "text/html": [ 873 | "
\n", 874 | "\n", 887 | "\n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | "
genderFemaleMale
marriage_status
Divorced39000.065000.0
Married35000.070000.0
Never married or under 15 years old25000.038000.0
Separated32400.030000.0
Widowed0.00.0
\n", 928 | "
" 929 | ], 930 | "text/plain": [ 931 | "gender Female Male\n", 932 | "marriage_status \n", 933 | "Divorced 39000.0 65000.0\n", 934 | "Married 35000.0 70000.0\n", 935 | "Never married or under 15 years old 25000.0 38000.0\n", 936 | "Separated 32400.0 30000.0\n", 937 | "Widowed 0.0 0.0" 938 | ] 939 | }, 940 | "execution_count": 18, 941 | "metadata": {}, 942 | "output_type": "execute_result" 943 | } 944 | ], 945 | "source": [ 946 | "calc.quantile(grp_marriage_sex, \"income\", 0.75)" 947 | ] 948 | }, 949 | { 950 | "cell_type": "markdown", 951 | "metadata": {}, 952 | "source": [ 953 | "---\n", 954 | "\n", 955 | "---\n", 956 | "\n", 957 | "---" 958 | ] 959 | } 960 | ], 961 | "metadata": { 962 | "kernelspec": { 963 | "display_name": "Python 3", 964 | "language": "python", 965 | "name": "python3" 966 | }, 967 | "language_info": { 968 | "codemirror_mode": { 969 | "name": "ipython", 970 | "version": 3 971 | }, 972 | "file_extension": ".py", 973 | "mimetype": "text/x-python", 974 | "name": "python", 975 | "nbconvert_exporter": "python", 976 | "pygments_lexer": "ipython3", 977 | "version": "3.4.3" 978 | } 979 | }, 980 | "nbformat": 4, 981 | "nbformat_minor": 1 982 | } 983 | -------------------------------------------------------------------------------- /examples/scripts/process-acs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 2 | import sys, os 3 | import pandas as pd 4 | 5 | responses = pd.read_csv(sys.stdin).rename(columns={ 6 | "AGEP": "age", 7 | "WAGP": "income" 8 | }) 9 | 10 | responses["marriage_status"] = responses["MAR"].apply({ 11 | 1: "Married", 12 | 2: "Widowed", 13 | 3: "Divorced", 14 | 4: "Separated", 15 | 5: "Never married or under 15 years old" 16 | }.get) 17 | 18 | responses["gender"] = responses["SEX"].apply({ 19 | 1: "Male", 20 | 2: "Female" 21 | }.get) 22 | 23 | responses[[ 24 | "SERIALNO", 25 | "PWGTP", 26 | "age", 27 | "gender", 28 | "marriage_status", 29 | "income" 30 | ]].to_csv(sys.stdout, 
"""Unit tests for the weighted calculations exposed by ``weightedcalcs``."""
import unittest
import weightedcalcs as wc
import pandas as pd
import sys
import os

# Every test shares one calculator that reads weights from the "weights" column.
calc = wc.Calculator("weights")


class WCTest(unittest.TestCase):
    """Exercise each ``Calculator`` method on small, hand-checkable inputs.

    ``unittest`` assertion methods are used instead of bare ``assert`` so the
    checks still run under ``python -O`` and report both operands on failure.
    """

    def test_mean(self):
        # Example via https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
        df = pd.DataFrame({
            "values": [80, 90],
            "weights": [20, 30],
        })
        self.assertEqual(calc.mean(df, "values"), 86)

    def test_mean_non_pandas(self):
        # Example via https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
        # A plain dict must be accepted and coerced to a DataFrame.
        data = {
            "values": [80, 90],
            "weights": [20, 30],
        }
        self.assertEqual(calc.mean(data, "values"), 86)

    def test_quantile(self):
        # Example via https://en.wikipedia.org/wiki/Weighted_median
        df = pd.DataFrame({
            "values": [0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2],
            "weights": [0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2],
        })
        # The unweighted median differs from the weighted one on this data.
        self.assertEqual(df["values"].median(), 0.1)
        self.assertEqual(calc.quantile(df, "values", 0.5), 0.2)
        self.assertEqual(calc.median(df, "values"), 0.2)

    def test_quantile_split(self):
        # The cumulative weight hits q exactly, so the two adjacent values
        # are averaged.
        df = pd.DataFrame({
            "values": [0, 1, 2, 3],
            "weights": [1, 1, 1, 1],
        })
        self.assertEqual(calc.quantile(df, "values", 0.5), 1.5)

    def test_bad_quantile(self):
        df = pd.DataFrame({
            "values": [0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2],
            "weights": [0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2],
        })
        # q outside [0, 1] is rejected with ValueError specifically.
        with self.assertRaises(ValueError):
            calc.quantile(df, "values", -1)

    def test_std(self):
        # Example via http://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weightsd.pdf
        df = pd.DataFrame({
            "values": [2, 3, 5, 7, 11, 13, 17, 19, 23],
            "weights": [1, 1, 0, 0, 4, 1, 2, 1, 0],
        })
        self.assertEqual(calc.std(df, "values").round(2), 5.82)

    def test_distribution(self):
        dist = calc.distribution(pd.DataFrame({
            "values": ["a", "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        }), "values")
        self.assertEqual(dist["a"], 0.375)
        self.assertEqual(dist["b"], 0.375)
        self.assertEqual(dist["c"], 0.250)

    def test_count(self):
        count = calc.count(pd.DataFrame({
            "values": ["a", "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        }))
        self.assertEqual(count, 8)

    def test_sum(self):
        _sum = calc.sum(pd.DataFrame({
            "values": [1, 2, 3, 4, 5],
            "weights": [3, 2, 0, 1, 2],
        }), "values")
        self.assertEqual(_sum, 21)

    def test_grouped(self):
        grouped = pd.DataFrame({
            "group": ["x", "x", "x", "x", "x"],
            "values": ["a", "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        }).groupby("group")
        dist = calc.distribution(grouped, "values")
        self.assertEqual(dist.loc["x"]["a"], 0.375)
        self.assertEqual(dist.loc["x"]["b"], 0.375)
        self.assertEqual(dist.loc["x"]["c"], 0.250)

    def test_multi_grouped(self):
        grouped = pd.DataFrame({
            "group_a": ["x", "x", "x", "x", "x"],
            "group_b": ["x", "x", "x", "x", "x"],
            "values": ["a", "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        }).groupby(["group_a", "group_b"])
        dist = calc.distribution(grouped, "values")
        self.assertEqual(dist.loc[("x", "x")]["a"], 0.375)

    def test_multi_grouped_two(self):
        grouped = pd.DataFrame({
            "group_a": ["x", "x", "x", "y", "y"],
            "group_b": ["x", "x", "x", "y", "y"],
            "values": ["a", "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        }).groupby(["group_a", "group_b"])
        dist = calc.distribution(grouped, "values")
        self.assertEqual(dist.loc[("x", "x")]["a"], 0.6)
        self.assertEqual(dist.loc[("x", "x")]["b"], 0.4)
        # "c" never occurs in group (x, x); the gap is reported as 0.
        self.assertEqual(dist.loc[("x", "x")]["c"], 0)
        # Second group: b carries weight 1 and c carries weight 2 of 3.
        self.assertEqual(dist.loc[("y", "y")]["a"], 0)
        self.assertAlmostEqual(dist.loc[("y", "y")]["b"], 1 / 3)
        self.assertAlmostEqual(dist.loc[("y", "y")]["c"], 2 / 3)

    def test_null_values(self):
        df = pd.DataFrame({
            "values": [None, "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        })
        # Null entries in the value column are rejected with ValueError.
        with self.assertRaises(ValueError):
            calc.distribution(df, "values")


if __name__ == '__main__':
    unittest.main()
"""Weighted descriptive statistics for pandas data.

The public entry point is :class:`Calculator`. Each of its methods accepts a
``DataFrame``, a ``DataFrameGroupBy`` (the statistic is then computed per
group), or any object ``pandas.DataFrame(...)`` can be built from.
"""
import functools

import pandas as pd

PANDAS_TYPES = (
    pd.DataFrame,
    pd.core.groupby.DataFrameGroupBy,
)

# Absolute tolerance for comparing a cumulative weight proportion (a value in
# [0, 1]) with a requested quantile. It absorbs the rounding error that
# cumsum()/sum() introduce for non-integer weights.
_PROPORTION_TOLERANCE = 1e-12


def pandas_deco(func):
    """Let ``func`` accept non-pandas input by coercing it to a DataFrame.

    DataFrames and DataFrameGroupBy objects are passed through untouched;
    anything else is handed to ``pd.DataFrame(...)`` first.
    """
    @functools.wraps(func)
    def func_wrapper(self, thing, *args, **kwargs):
        if not isinstance(thing, PANDAS_TYPES):
            thing = pd.DataFrame(thing)
        return func(self, thing, *args, **kwargs)
    return func_wrapper


def groupby_deco(func):
    """Let ``func`` accept a DataFrameGroupBy by applying it to each group.

    When the per-group results come back as a Series with a MultiIndex, the
    innermost level is unstacked into columns. Non-grouped input is forwarded
    to ``func`` unchanged.

    NOTE: the ``include_groups`` keyword of ``DataFrameGroupBy.apply`` was
    added in pandas 2.2; older pandas versions do not accept it.
    """
    @functools.wraps(func)
    def func_wrapper(self, thing, *args, **kwargs):
        if not isinstance(thing, pd.core.groupby.DataFrameGroupBy):
            return func(self, thing, *args, **kwargs)
        agg = thing.apply(
            lambda group: func(self, group, *args, **kwargs),
            include_groups=False,
        )
        is_series = isinstance(agg, pd.core.series.Series)
        has_multiindex = isinstance(agg.index, pd.MultiIndex)
        if is_series and has_multiindex:
            return agg.unstack()
        return agg
    return func_wrapper


def fillna_deco(val):
    """Build a decorator that replaces nulls in ``func``'s result with ``val``."""
    def deco(func):
        @functools.wraps(func)
        def func_wrapper(self, thing, *args, **kwargs):
            return func(self, thing, *args, **kwargs).fillna(val)
        return func_wrapper
    return deco


def check_nulls(series):
    """Return ``series`` unchanged, raising ValueError if it holds any null."""
    if series.isnull().any():
        raise ValueError("value_var contains null values")
    return series


class Calculator(object):
    """Compute weighted statistics using a fixed weight column.

    ``weight_var`` is the name of the column holding each row's weight; the
    ``value_var`` argument of the methods names the column being summarized.
    """

    def __init__(self, weight_var):
        self.weight_var = weight_var

    @groupby_deco
    @pandas_deco
    def count(self, thing):
        """Return the weighted count, i.e. the sum of the weights."""
        return thing[self.weight_var].sum()

    @groupby_deco
    @pandas_deco
    def sum(self, thing, value_var):
        """Return the weighted sum of ``value_var``.

        Raises ValueError if ``value_var`` contains null values.
        """
        weights = thing[self.weight_var]
        values = check_nulls(thing[value_var])
        return (values * weights).sum()

    @groupby_deco
    @pandas_deco
    def mean(self, thing, value_var):
        """Return the weighted arithmetic mean of ``value_var``.

        Raises ValueError if ``value_var`` contains null values.
        """
        weights = thing[self.weight_var]
        total_weight = weights.sum()
        values = check_nulls(thing[value_var])
        return (values * weights).sum() / total_weight

    @groupby_deco
    @pandas_deco
    def std(self, thing, value_var):
        """Return the weighted standard deviation of ``value_var``.

        Returns ``pd.NA`` when fewer than two rows carry a positive weight.
        Raises ValueError if ``value_var`` contains null values.
        """
        weights = thing[self.weight_var]
        n_nonzero_weights = (weights > 0).sum()
        if n_nonzero_weights < 2:
            return pd.NA
        values = check_nulls(thing[value_var])
        mean = self.mean(thing, value_var)
        numerator = (weights * (values - mean).pow(2)).sum()
        denominator = (n_nonzero_weights - 1) * weights.sum() / n_nonzero_weights
        return (numerator / denominator) ** 0.5

    @groupby_deco
    @pandas_deco
    def quantile(self, thing, value_var, q):
        """Return the weighted ``q``-quantile of ``value_var``.

        Rows are sorted by value and the first row whose cumulative share of
        the total weight reaches ``q`` is selected. If that share equals
        ``q`` (within a small floating-point tolerance), the selected value
        is averaged with the next one.

        Returns NaN when no row reaches ``q`` (for example when the total
        weight is zero or the input is empty).
        Raises ValueError if ``q`` is not within [0, 1] (NaN included) or if
        ``value_var`` contains null values.
        """
        # Chained comparison also rejects NaN, which `q < 0 or q > 1` let by.
        if not 0 <= q <= 1:
            raise ValueError("q must be between 0 and 1")
        df = pd.DataFrame({
            "weights": thing[self.weight_var],
            "values": check_nulls(thing[value_var]),
        }).sort_values("values")
        df["cumul_prop"] = df["weights"].cumsum() / df["weights"].sum()
        # Compare with a tolerance: cumsum() of float weights can land a few
        # ulps on either side of q (e.g. 0.1 + 0.1 + 0.1 != 0.3).
        reached = df[df["cumul_prop"] >= q - _PROPORTION_TOLERANCE]
        if reached.empty:
            return float("nan")
        first = reached.iloc[0]
        if abs(first["cumul_prop"] - q) <= _PROPORTION_TOLERANCE:
            return reached.head(2)["values"].mean()
        return first["values"]

    @groupby_deco
    @pandas_deco
    def median(self, thing, value_var):
        """Return the weighted median of ``value_var`` (the 0.5 quantile)."""
        return self.quantile(thing, value_var, 0.5)

    @fillna_deco(0)
    @groupby_deco
    @pandas_deco
    def distribution(self, thing, value_var):
        """Return each distinct value's share of the total weight.

        Nulls in the result (values absent from a group) are replaced by 0.
        Raises ValueError if ``value_var`` contains null values.
        """
        weights = thing[self.weight_var]
        total_weight = weights.sum()
        check_nulls(thing[value_var])
        return thing.groupby(value_var)[self.weight_var].sum() / total_weight