├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── examples
├── data
│ ├── acs-2015-pums-wy-simple.csv
│ ├── acs-2015-pums-wy.csv
│ ├── gss-extract.csv
│ ├── ipsos-ssm-and-abortion-survey.csv
│ └── simple.csv
├── notebooks
│ └── example-usage.ipynb
└── scripts
│ └── process-acs.py
├── setup.py
├── test
└── core_tests.py
├── tox.ini
└── weightedcalcs
├── __init__.py
├── __version__.py
└── core.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | #### joe made this: http://goel.io/joe
3 |
4 | #####=== Python ===#####
5 |
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 |
55 | # Sphinx documentation
56 | docs/_build/
57 |
58 | # PyBuilder
59 | target/
60 |
61 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "2.7"
4 | - "3.4"
5 | - "3.5"
6 | - "3.6"
7 | install:
8 | - pip install .
9 | - pip install nose
10 | - pip install coveralls
11 | script: nosetests --with-coverage --cover-package weightedcalcs
12 | after_success: coveralls
13 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 |
3 | All notable changes to this project will be documented in this file.
4 |
5 | The format is based on [Keep a Changelog](http://keepachangelog.com/).
6 |
7 | ## [0.1.3] — 2024-11-10
8 | ### Fixed
9 | - Fix deprecation of `pandas.np`, h/t @simon-smart88. ([#9](https://github.com/jsvine/weightedcalcs/issues/9))
10 | - Fix deprecation of `DataFrameGroupBy.apply operated on the grouping columns`.
11 |
12 | ### Changed
13 | - Change minimum `pandas` version to `2.0`.
14 |
15 | ## [0.1.2] — 2017-06-17
16 | ### Fixed
17 | - Fix incompatibility with pandas 0.20.1
18 |
19 | ## [0.1.1] — 2017-04-08
20 | ### Added
21 | - MANIFEST.in
22 |
23 | ## [0.1.0] — 2017-03-30
24 | ### Added
25 | - Support for Python 2.7
26 | - Support for non-pandas input
27 | - Full test coverage
28 |
29 | ## [0.0.0] — 2016-12-23
30 |
31 | Initial release
32 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016, Jeremy Singer-Vine
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt *.md *.rst
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://pypi.python.org/pypi/weightedcalcs) [](https://travis-ci.org/jsvine/weightedcalcs) [](https://coveralls.io/github/jsvine/weightedcalcs) [](https://pypi.python.org/pypi/weightedcalcs)
2 |
3 | # weightedcalcs
4 |
5 | `weightedcalcs` is a `pandas`-based Python library for calculating weighted means, medians, standard deviations, and more.
6 |
7 | ## Features
8 |
9 | - Plays well with `pandas`.
10 | - Support for weighted means, medians, quantiles, standard deviations, and distributions.
11 | - Support for grouped calculations, using `DataFrameGroupBy` objects.
12 | - Raises an error when your data contains null-values.
13 | - Full test coverage.
14 |
15 | ## Installation
16 |
17 | ```sh
18 | pip install weightedcalcs
19 | ```
20 |
21 | ## Usage
22 |
23 | ### Getting started
24 |
25 | Every weighted calculation in `weightedcalcs` begins with an instance of the `weightedcalcs.Calculator` class. `Calculator` takes one argument: the name of your weighting variable. So if you're analyzing a survey where the weighting variable is called `"resp_weight"`, you'd do this:
26 |
27 | ```python
28 | import weightedcalcs as wc
29 | calc = wc.Calculator("resp_weight")
30 | ```
31 |
32 | ### Types of calculations
33 |
34 | Currently, `weightedcalcs.Calculator` supports the following calculations:
35 |
36 | - `calc.mean(my_data, value_var)`: The weighted arithmetic average of `value_var`.
37 | - `calc.quantile(my_data, value_var, q)`: The weighted quantile of `value_var`, where `q` is between 0 and 1.
38 | - `calc.median(my_data, value_var)`: The weighted median of `value_var`, equivalent to `.quantile(...)` where `q=0.5`.
39 | - `calc.std(my_data, value_var)`: The weighted standard deviation of `value_var`.
40 | - `calc.distribution(my_data, value_var)`: The weighted proportions of `value_var`, interpreting `value_var` as categories.
41 | - `calc.count(my_data)`: The weighted count of all observations, i.e., the total weight.
42 | - `calc.sum(my_data, value_var)`: The weighted sum of `value_var`.
43 |
44 | The `my_data` parameter above should be one of the following:
45 |
46 | - A `pandas` `DataFrame` object
47 | - A `pandas` `DataFrame.groupby` object
48 | - A plain Python dictionary where the keys are column names and the values are equal-length lists.
49 |
50 | ### Basic example
51 |
52 | Below is a basic example of using `weightedcalcs` to find what percentage of Wyoming residents are married, divorced, et cetera:
53 |
54 | ```python
55 | import pandas as pd
56 | import weightedcalcs as wc
57 |
58 | # Load the 2015 American Community Survey person-level responses for Wyoming
59 | responses = pd.read_csv("examples/data/acs-2015-pums-wy-simple.csv")
60 |
61 | # `PWGTP` is the weighting variable used in the ACS's person-level data
62 | calc = wc.Calculator("PWGTP")
63 |
64 | # Get the distribution of marriage-status responses
65 | calc.distribution(responses, "marriage_status").round(3).sort_values(ascending=False)
66 |
67 | # -- Output --
68 | # marriage_status
69 | # Married 0.425
70 | # Never married or under 15 years old 0.421
71 | # Divorced 0.097
72 | # Widowed 0.046
73 | # Separated 0.012
74 | # Name: PWGTP, dtype: float64
75 | ```
76 |
77 | ### More examples
78 |
79 | [See this notebook to see examples of other calculations, including grouped calculations.](examples/notebooks/example-usage.ipynb)
80 |
81 | [Max Ghenis](https://github.com/MaxGhenis) has created [a version of the example notebook that can be run directly in your browser](https://colab.research.google.com/gist/MaxGhenis/4c96163eacebc1005419c9533a568c7e/weightedcalcs-example-usage-scf.ipynb), via Google Colab.
82 |
83 | ### Weightedcalcs in the wild
84 |
85 | - "[Procesando los microdatos de la Encuesta Permanente de Hogares](http://blog.jazzido.com/2017/01/09/procesando-microdatos-eph)," by Manuel Aristarán
86 | - [BuzzFeedNews/2017-01-media-platform-and-news-trust-survey](https://github.com/BuzzFeedNews/2017-01-media-platform-and-news-trust-survey/blob/master/notebooks/platform-trust-additional-analysis.ipynb)
87 | - [BuzzFeedNews/2016-12-transgender-rights-survey](https://github.com/BuzzFeedNews/2016-12-transgender-rights-survey/blob/master/notebooks/additional-analysis.ipynb)
88 |
89 | ## Other Python weighted-calculation libraries
90 |
91 | - [`tinybike/weightedstats`](https://github.com/tinybike/weightedstats)
92 | - [`nudomarinero/wquantiles`](https://github.com/nudomarinero/wquantiles/)
93 |
94 |
--------------------------------------------------------------------------------
/examples/data/ipsos-ssm-and-abortion-survey.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jsvine/weightedcalcs/cbd2818e6f7ad82c29714f842228bfd4f65c008f/examples/data/ipsos-ssm-and-abortion-survey.csv
--------------------------------------------------------------------------------
/examples/data/simple.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jsvine/weightedcalcs/cbd2818e6f7ad82c29714f842228bfd4f65c008f/examples/data/simple.csv
--------------------------------------------------------------------------------
/examples/notebooks/example-usage.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Example usage for `weightedcalcs`\n",
8 | "\n",
9 | "The example below uses `weightedcalcs` to analyze a slice of the [American Community Survey's 2015 data](https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html) for Wyoming."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {
16 | "collapsed": true
17 | },
18 | "outputs": [],
19 | "source": [
20 | "import weightedcalcs as wc\n",
21 | "import pandas as pd"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## Load the ACS data into a `pandas.DataFrame`"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {
35 | "collapsed": true
36 | },
37 | "outputs": [],
38 | "source": [
39 | "responses = pd.read_csv(\"../data/acs-2015-pums-wy-simple.csv\")"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/html": [
50 | "
\n",
51 | "\n",
64 | "
\n",
65 | " \n",
66 | " \n",
67 | " | \n",
68 | " SERIALNO | \n",
69 | " PWGTP | \n",
70 | " age | \n",
71 | " gender | \n",
72 | " marriage_status | \n",
73 | " income | \n",
74 | "
\n",
75 | " \n",
76 | " \n",
77 | " \n",
78 | " 0 | \n",
79 | " 1990 | \n",
80 | " 148 | \n",
81 | " 67 | \n",
82 | " Male | \n",
83 | " Never married or under 15 years old | \n",
84 | " 27000.0 | \n",
85 | "
\n",
86 | " \n",
87 | " 1 | \n",
88 | " 2253 | \n",
89 | " 371 | \n",
90 | " 93 | \n",
91 | " Female | \n",
92 | " Widowed | \n",
93 | " 0.0 | \n",
94 | "
\n",
95 | " \n",
96 | " 2 | \n",
97 | " 2861 | \n",
98 | " 288 | \n",
99 | " 46 | \n",
100 | " Female | \n",
101 | " Divorced | \n",
102 | " 44000.0 | \n",
103 | "
\n",
104 | " \n",
105 | " 3 | \n",
106 | " 4537 | \n",
107 | " 58 | \n",
108 | " 59 | \n",
109 | " Male | \n",
110 | " Divorced | \n",
111 | " 35000.0 | \n",
112 | "
\n",
113 | " \n",
114 | " 4 | \n",
115 | " 4797 | \n",
116 | " 130 | \n",
117 | " 70 | \n",
118 | " Male | \n",
119 | " Married | \n",
120 | " 0.0 | \n",
121 | "
\n",
122 | " \n",
123 | "
\n",
124 | "
"
125 | ],
126 | "text/plain": [
127 | " SERIALNO PWGTP age gender marriage_status income\n",
128 | "0 1990 148 67 Male Never married or under 15 years old 27000.0\n",
129 | "1 2253 371 93 Female Widowed 0.0\n",
130 | "2 2861 288 46 Female Divorced 44000.0\n",
131 | "3 4537 58 59 Male Divorced 35000.0\n",
132 | "4 4797 130 70 Male Married 0.0"
133 | ]
134 | },
135 | "execution_count": 3,
136 | "metadata": {},
137 | "output_type": "execute_result"
138 | }
139 | ],
140 | "source": [
141 | "responses.head()"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "In addition to the full list of responses, let's create a subset including only adult respondents, since we'll be focusing on income later."
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 4,
154 | "metadata": {
155 | "collapsed": true
156 | },
157 | "outputs": [],
158 | "source": [
159 | "adults = responses[responses[\"age\"] >= 18]"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 5,
165 | "metadata": {},
166 | "outputs": [
167 | {
168 | "data": {
169 | "text/html": [
170 | "\n",
171 | "\n",
184 | "
\n",
185 | " \n",
186 | " \n",
187 | " | \n",
188 | " SERIALNO | \n",
189 | " PWGTP | \n",
190 | " age | \n",
191 | " gender | \n",
192 | " marriage_status | \n",
193 | " income | \n",
194 | "
\n",
195 | " \n",
196 | " \n",
197 | " \n",
198 | " 0 | \n",
199 | " 1990 | \n",
200 | " 148 | \n",
201 | " 67 | \n",
202 | " Male | \n",
203 | " Never married or under 15 years old | \n",
204 | " 27000.0 | \n",
205 | "
\n",
206 | " \n",
207 | " 1 | \n",
208 | " 2253 | \n",
209 | " 371 | \n",
210 | " 93 | \n",
211 | " Female | \n",
212 | " Widowed | \n",
213 | " 0.0 | \n",
214 | "
\n",
215 | " \n",
216 | " 2 | \n",
217 | " 2861 | \n",
218 | " 288 | \n",
219 | " 46 | \n",
220 | " Female | \n",
221 | " Divorced | \n",
222 | " 44000.0 | \n",
223 | "
\n",
224 | " \n",
225 | " 3 | \n",
226 | " 4537 | \n",
227 | " 58 | \n",
228 | " 59 | \n",
229 | " Male | \n",
230 | " Divorced | \n",
231 | " 35000.0 | \n",
232 | "
\n",
233 | " \n",
234 | " 4 | \n",
235 | " 4797 | \n",
236 | " 130 | \n",
237 | " 70 | \n",
238 | " Male | \n",
239 | " Married | \n",
240 | " 0.0 | \n",
241 | "
\n",
242 | " \n",
243 | "
\n",
244 | "
"
245 | ],
246 | "text/plain": [
247 | " SERIALNO PWGTP age gender marriage_status income\n",
248 | "0 1990 148 67 Male Never married or under 15 years old 27000.0\n",
249 | "1 2253 371 93 Female Widowed 0.0\n",
250 | "2 2861 288 46 Female Divorced 44000.0\n",
251 | "3 4537 58 59 Male Divorced 35000.0\n",
252 | "4 4797 130 70 Male Married 0.0"
253 | ]
254 | },
255 | "execution_count": 5,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | }
259 | ],
260 | "source": [
261 | "adults.head()"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {},
267 | "source": [
268 | "## Create an instance of `weightedcalcs.Calculator`\n",
269 | "\n",
270 | "The ACS's `PWGTP` variable is each respondent's Census-assigned survey weight. All our weighted calculations will use this variable."
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 6,
276 | "metadata": {
277 | "collapsed": true
278 | },
279 | "outputs": [],
280 | "source": [
281 | "calc = wc.Calculator(\"PWGTP\")"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "## Basic weighted calculations"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "### Weighted mean income"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 7,
301 | "metadata": {},
302 | "outputs": [
303 | {
304 | "data": {
305 | "text/plain": [
306 | "30709.0"
307 | ]
308 | },
309 | "execution_count": 7,
310 | "metadata": {},
311 | "output_type": "execute_result"
312 | }
313 | ],
314 | "source": [
315 | "calc.mean(adults, \"income\").round()"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "metadata": {},
321 | "source": [
322 | "### Weighted standard deviation of income"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 8,
328 | "metadata": {},
329 | "outputs": [
330 | {
331 | "data": {
332 | "text/plain": [
333 | "46093.0"
334 | ]
335 | },
336 | "execution_count": 8,
337 | "metadata": {},
338 | "output_type": "execute_result"
339 | }
340 | ],
341 | "source": [
342 | "calc.std(adults, \"income\").round()"
343 | ]
344 | },
345 | {
346 | "cell_type": "markdown",
347 | "metadata": {},
348 | "source": [
349 | "### Weighted median income"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 9,
355 | "metadata": {},
356 | "outputs": [
357 | {
358 | "data": {
359 | "text/plain": [
360 | "18000.0"
361 | ]
362 | },
363 | "execution_count": 9,
364 | "metadata": {},
365 | "output_type": "execute_result"
366 | }
367 | ],
368 | "source": [
369 | "calc.median(adults, \"income\")"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "### Weighted 75th percentile of income"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 10,
382 | "metadata": {},
383 | "outputs": [
384 | {
385 | "data": {
386 | "text/plain": [
387 | "45000.0"
388 | ]
389 | },
390 | "execution_count": 10,
391 | "metadata": {},
392 | "output_type": "execute_result"
393 | }
394 | ],
395 | "source": [
396 | "calc.quantile(adults, \"income\", 0.75)"
397 | ]
398 | },
399 | {
400 | "cell_type": "markdown",
401 | "metadata": {},
402 | "source": [
403 | "### Weighted distribution of marriage statuses"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "metadata": {},
409 | "source": [
410 | "~43% of Wyoming residents are married:"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 11,
416 | "metadata": {},
417 | "outputs": [
418 | {
419 | "data": {
420 | "text/plain": [
421 | "marriage_status\n",
422 | "Married 0.425\n",
423 | "Never married or under 15 years old 0.421\n",
424 | "Divorced 0.097\n",
425 | "Widowed 0.046\n",
426 | "Separated 0.012\n",
427 | "Name: PWGTP, dtype: float64"
428 | ]
429 | },
430 | "execution_count": 11,
431 | "metadata": {},
432 | "output_type": "execute_result"
433 | }
434 | ],
435 | "source": [
436 | "calc.distribution(responses, \"marriage_status\").round(3).sort_values(ascending=False)"
437 | ]
438 | },
439 | {
440 | "cell_type": "markdown",
441 | "metadata": {},
442 | "source": [
443 | "~56% of *adult* Wyoming residents are married:"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 12,
449 | "metadata": {},
450 | "outputs": [
451 | {
452 | "data": {
453 | "text/plain": [
454 | "marriage_status\n",
455 | "Married 0.557\n",
456 | "Never married or under 15 years old 0.240\n",
457 | "Divorced 0.127\n",
458 | "Widowed 0.060\n",
459 | "Separated 0.016\n",
460 | "Name: PWGTP, dtype: float64"
461 | ]
462 | },
463 | "execution_count": 12,
464 | "metadata": {},
465 | "output_type": "execute_result"
466 | }
467 | ],
468 | "source": [
469 | "calc.distribution(adults, \"marriage_status\").round(3).sort_values(ascending=False)"
470 | ]
471 | },
472 | {
473 | "cell_type": "markdown",
474 | "metadata": {},
475 | "source": [
476 | "## Grouped weighted calculations\n",
477 | "\n",
478 | "Below, we perform similar calculations as above, but now take advantage of the fact that `weightedcalcs` can handle `DataFrameGroupBy` objects. In the examples below, we group by the ACS's marriage status categories and gender."
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 13,
484 | "metadata": {
485 | "collapsed": true
486 | },
487 | "outputs": [],
488 | "source": [
489 | "grp_marriage_sex = adults.groupby([\"marriage_status\", \"gender\"])"
490 | ]
491 | },
492 | {
493 | "cell_type": "markdown",
494 | "metadata": {},
495 | "source": [
496 | "For reference, here's how many responses fall into each category:"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": 14,
502 | "metadata": {
503 | "scrolled": true
504 | },
505 | "outputs": [
506 | {
507 | "data": {
508 | "text/html": [
509 | "\n",
510 | "\n",
523 | "
\n",
524 | " \n",
525 | " \n",
526 | " gender | \n",
527 | " Female | \n",
528 | " Male | \n",
529 | "
\n",
530 | " \n",
531 | " marriage_status | \n",
532 | " | \n",
533 | " | \n",
534 | "
\n",
535 | " \n",
536 | " \n",
537 | " \n",
538 | " Divorced | \n",
539 | " 292 | \n",
540 | " 279 | \n",
541 | "
\n",
542 | " \n",
543 | " Married | \n",
544 | " 1337 | \n",
545 | " 1337 | \n",
546 | "
\n",
547 | " \n",
548 | " Never married or under 15 years old | \n",
549 | " 382 | \n",
550 | " 535 | \n",
551 | "
\n",
552 | " \n",
553 | " Separated | \n",
554 | " 25 | \n",
555 | " 18 | \n",
556 | "
\n",
557 | " \n",
558 | " Widowed | \n",
559 | " 232 | \n",
560 | " 75 | \n",
561 | "
\n",
562 | " \n",
563 | "
\n",
564 | "
"
565 | ],
566 | "text/plain": [
567 | "gender Female Male\n",
568 | "marriage_status \n",
569 | "Divorced 292 279\n",
570 | "Married 1337 1337\n",
571 | "Never married or under 15 years old 382 535\n",
572 | "Separated 25 18\n",
573 | "Widowed 232 75"
574 | ]
575 | },
576 | "execution_count": 14,
577 | "metadata": {},
578 | "output_type": "execute_result"
579 | }
580 | ],
581 | "source": [
582 | "grp_marriage_sex.size().unstack()"
583 | ]
584 | },
585 | {
586 | "cell_type": "markdown",
587 | "metadata": {},
588 | "source": [
589 | "### Weighted mean income\n"
590 | ]
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": 15,
595 | "metadata": {},
596 | "outputs": [
597 | {
598 | "data": {
599 | "text/html": [
600 | "\n",
601 | "\n",
614 | "
\n",
615 | " \n",
616 | " \n",
617 | " gender | \n",
618 | " Female | \n",
619 | " Male | \n",
620 | "
\n",
621 | " \n",
622 | " marriage_status | \n",
623 | " | \n",
624 | " | \n",
625 | "
\n",
626 | " \n",
627 | " \n",
628 | " \n",
629 | " Divorced | \n",
630 | " 27803 | \n",
631 | " 38884 | \n",
632 | "
\n",
633 | " \n",
634 | " Married | \n",
635 | " 22592 | \n",
636 | " 50263 | \n",
637 | "
\n",
638 | " \n",
639 | " Never married or under 15 years old | \n",
640 | " 15625 | \n",
641 | " 27531 | \n",
642 | "
\n",
643 | " \n",
644 | " Separated | \n",
645 | " 15443 | \n",
646 | " 18553 | \n",
647 | "
\n",
648 | " \n",
649 | " Widowed | \n",
650 | " 5890 | \n",
651 | " 15421 | \n",
652 | "
\n",
653 | " \n",
654 | "
\n",
655 | "
"
656 | ],
657 | "text/plain": [
658 | "gender Female Male\n",
659 | "marriage_status \n",
660 | "Divorced 27803 38884\n",
661 | "Married 22592 50263\n",
662 | "Never married or under 15 years old 15625 27531\n",
663 | "Separated 15443 18553\n",
664 | "Widowed 5890 15421"
665 | ]
666 | },
667 | "execution_count": 15,
668 | "metadata": {},
669 | "output_type": "execute_result"
670 | }
671 | ],
672 | "source": [
673 | "calc.mean(grp_marriage_sex, \"income\").round().astype(int)"
674 | ]
675 | },
676 | {
677 | "cell_type": "markdown",
678 | "metadata": {},
679 | "source": [
680 | "### Weighted standard deviation of income"
681 | ]
682 | },
683 | {
684 | "cell_type": "code",
685 | "execution_count": 16,
686 | "metadata": {},
687 | "outputs": [
688 | {
689 | "data": {
690 | "text/html": [
691 | "\n",
692 | "\n",
705 | "
\n",
706 | " \n",
707 | " \n",
708 | " gender | \n",
709 | " Female | \n",
710 | " Male | \n",
711 | "
\n",
712 | " \n",
713 | " marriage_status | \n",
714 | " | \n",
715 | " | \n",
716 | "
\n",
717 | " \n",
718 | " \n",
719 | " \n",
720 | " Divorced | \n",
721 | " 40039.0 | \n",
722 | " 40916.0 | \n",
723 | "
\n",
724 | " \n",
725 | " Married | \n",
726 | " 33602.0 | \n",
727 | " 63959.0 | \n",
728 | "
\n",
729 | " \n",
730 | " Never married or under 15 years old | \n",
731 | " 19885.0 | \n",
732 | " 34576.0 | \n",
733 | "
\n",
734 | " \n",
735 | " Separated | \n",
736 | " 14822.0 | \n",
737 | " 25867.0 | \n",
738 | "
\n",
739 | " \n",
740 | " Widowed | \n",
741 | " 17113.0 | \n",
742 | " 55463.0 | \n",
743 | "
\n",
744 | " \n",
745 | "
\n",
746 | "
"
747 | ],
748 | "text/plain": [
749 | "gender Female Male\n",
750 | "marriage_status \n",
751 | "Divorced 40039.0 40916.0\n",
752 | "Married 33602.0 63959.0\n",
753 | "Never married or under 15 years old 19885.0 34576.0\n",
754 | "Separated 14822.0 25867.0\n",
755 | "Widowed 17113.0 55463.0"
756 | ]
757 | },
758 | "execution_count": 16,
759 | "metadata": {},
760 | "output_type": "execute_result"
761 | }
762 | ],
763 | "source": [
764 | "calc.std(grp_marriage_sex, \"income\").round()"
765 | ]
766 | },
767 | {
768 | "cell_type": "markdown",
769 | "metadata": {},
770 | "source": [
771 | "### Weighted median income"
772 | ]
773 | },
774 | {
775 | "cell_type": "code",
776 | "execution_count": 17,
777 | "metadata": {},
778 | "outputs": [
779 | {
780 | "data": {
781 | "text/html": [
782 | "\n",
783 | "\n",
796 | "
\n",
797 | " \n",
798 | " \n",
799 | " gender | \n",
800 | " Female | \n",
801 | " Male | \n",
802 | "
\n",
803 | " \n",
804 | " marriage_status | \n",
805 | " | \n",
806 | " | \n",
807 | "
\n",
808 | " \n",
809 | " \n",
810 | " \n",
811 | " Divorced | \n",
812 | " 21000.0 | \n",
813 | " 29000.0 | \n",
814 | "
\n",
815 | " \n",
816 | " Married | \n",
817 | " 11000.0 | \n",
818 | " 40200.0 | \n",
819 | "
\n",
820 | " \n",
821 | " Never married or under 15 years old | \n",
822 | " 8300.0 | \n",
823 | " 16000.0 | \n",
824 | "
\n",
825 | " \n",
826 | " Separated | \n",
827 | " 10000.0 | \n",
828 | " 0.0 | \n",
829 | "
\n",
830 | " \n",
831 | " Widowed | \n",
832 | " 0.0 | \n",
833 | " 0.0 | \n",
834 | "
\n",
835 | " \n",
836 | "
\n",
837 | "
"
838 | ],
839 | "text/plain": [
840 | "gender Female Male\n",
841 | "marriage_status \n",
842 | "Divorced 21000.0 29000.0\n",
843 | "Married 11000.0 40200.0\n",
844 | "Never married or under 15 years old 8300.0 16000.0\n",
845 | "Separated 10000.0 0.0\n",
846 | "Widowed 0.0 0.0"
847 | ]
848 | },
849 | "execution_count": 17,
850 | "metadata": {},
851 | "output_type": "execute_result"
852 | }
853 | ],
854 | "source": [
855 | "calc.median(grp_marriage_sex, \"income\")"
856 | ]
857 | },
858 | {
859 | "cell_type": "markdown",
860 | "metadata": {},
861 | "source": [
862 | "### Weighted 75th percentile of income"
863 | ]
864 | },
865 | {
866 | "cell_type": "code",
867 | "execution_count": 18,
868 | "metadata": {},
869 | "outputs": [
870 | {
871 | "data": {
872 | "text/html": [
873 | "\n",
874 | "\n",
887 | "
\n",
888 | " \n",
889 | " \n",
890 | " gender | \n",
891 | " Female | \n",
892 | " Male | \n",
893 | "
\n",
894 | " \n",
895 | " marriage_status | \n",
896 | " | \n",
897 | " | \n",
898 | "
\n",
899 | " \n",
900 | " \n",
901 | " \n",
902 | " Divorced | \n",
903 | " 39000.0 | \n",
904 | " 65000.0 | \n",
905 | "
\n",
906 | " \n",
907 | " Married | \n",
908 | " 35000.0 | \n",
909 | " 70000.0 | \n",
910 | "
\n",
911 | " \n",
912 | " Never married or under 15 years old | \n",
913 | " 25000.0 | \n",
914 | " 38000.0 | \n",
915 | "
\n",
916 | " \n",
917 | " Separated | \n",
918 | " 32400.0 | \n",
919 | " 30000.0 | \n",
920 | "
\n",
921 | " \n",
922 | " Widowed | \n",
923 | " 0.0 | \n",
924 | " 0.0 | \n",
925 | "
\n",
926 | " \n",
927 | "
\n",
928 | "
"
929 | ],
930 | "text/plain": [
931 | "gender Female Male\n",
932 | "marriage_status \n",
933 | "Divorced 39000.0 65000.0\n",
934 | "Married 35000.0 70000.0\n",
935 | "Never married or under 15 years old 25000.0 38000.0\n",
936 | "Separated 32400.0 30000.0\n",
937 | "Widowed 0.0 0.0"
938 | ]
939 | },
940 | "execution_count": 18,
941 | "metadata": {},
942 | "output_type": "execute_result"
943 | }
944 | ],
945 | "source": [
946 | "calc.quantile(grp_marriage_sex, \"income\", 0.75)"
947 | ]
948 | },
949 | {
950 | "cell_type": "markdown",
951 | "metadata": {},
952 | "source": [
953 | "---\n",
954 | "\n",
955 | "---\n",
956 | "\n",
957 | "---"
958 | ]
959 | }
960 | ],
961 | "metadata": {
962 | "kernelspec": {
963 | "display_name": "Python 3",
964 | "language": "python",
965 | "name": "python3"
966 | },
967 | "language_info": {
968 | "codemirror_mode": {
969 | "name": "ipython",
970 | "version": 3
971 | },
972 | "file_extension": ".py",
973 | "mimetype": "text/x-python",
974 | "name": "python",
975 | "nbconvert_exporter": "python",
976 | "pygments_lexer": "ipython3",
977 | "version": "3.4.3"
978 | }
979 | },
980 | "nbformat": 4,
981 | "nbformat_minor": 1
982 | }
983 |
--------------------------------------------------------------------------------
/examples/scripts/process-acs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env
2 | import sys, os
3 | import pandas as pd
4 |
5 | responses = pd.read_csv(sys.stdin).rename(columns={
6 | "AGEP": "age",
7 | "WAGP": "income"
8 | })
9 |
10 | responses["marriage_status"] = responses["MAR"].apply({
11 | 1: "Married",
12 | 2: "Widowed",
13 | 3: "Divorced",
14 | 4: "Separated",
15 | 5: "Never married or under 15 years old"
16 | }.get)
17 |
18 | responses["gender"] = responses["SEX"].apply({
19 | 1: "Male",
20 | 2: "Female"
21 | }.get)
22 |
23 | responses[[
24 | "SERIALNO",
25 | "PWGTP",
26 | "age",
27 | "gender",
28 | "marriage_status",
29 | "income"
30 | ]].to_csv(sys.stdout, index=False)
31 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | import os
3 |
4 | NAME = "weightedcalcs"
5 | HERE = os.path.abspath(os.path.dirname(__file__))
6 |
7 | version_ns = {}
8 | with open(os.path.join(HERE, NAME, '__version__.py')) as f:
9 | exec(f.read(), {}, version_ns)
10 |
11 | setup(
12 | name=NAME,
13 | version=version_ns['__version__'],
14 | description="Pandas-based utility to calculate weighted means, medians, distributions, standard deviations, and more.",
15 | url="http://github.com/jsvine/weightedcalcs",
16 | author="Jeremy Singer-Vine",
17 | author_email="jsvine@gmail.com",
18 | license="MIT",
19 | packages=[
20 | NAME
21 | ],
22 | install_requires=[
23 | "pandas>=2.0"
24 | ],
25 | classifiers=[
26 | "License :: OSI Approved :: MIT License",
27 | "Operating System :: OS Independent",
28 | "Programming Language :: Python :: 3",
29 | "Intended Audience :: Developers",
30 | "Intended Audience :: Science/Research",
31 | "Topic :: Scientific/Engineering :: Information Analysis",
32 | ],
33 | )
34 |
--------------------------------------------------------------------------------
/test/core_tests.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import weightedcalcs as wc
3 | import pandas as pd
4 | import sys
5 | import os
6 |
7 | calc = wc.Calculator("weights")
8 |
class WCTest(unittest.TestCase):
    """Behavioural tests for ``wc.Calculator``, run against the shared ``calc``.

    Results are checked with ``unittest`` assertion methods rather than bare
    ``assert`` statements, so a failure reports both values and the checks
    are not stripped when Python runs with ``-O``. Error cases expect
    ``ValueError`` specifically, so an unrelated exception fails the test.
    """

    def test_mean(self):
        # Example via https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
        df = pd.DataFrame({
            "values": [80, 90],
            "weights": [20, 30],
        })
        self.assertEqual(calc.mean(df, "values"), 86)

    def test_mean_non_pandas(self):
        # Example via https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
        # Same data as test_mean, passed as a plain dict instead of a frame.
        data = {
            "values": [80, 90],
            "weights": [20, 30],
        }
        self.assertEqual(calc.mean(data, "values"), 86)

    def test_quantile(self):
        # Example via https://en.wikipedia.org/wiki/Weighted_median
        df = pd.DataFrame({
            "values": [0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2],
            "weights": [0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2],
        })
        # The unweighted median differs from the weighted one.
        self.assertEqual(df["values"].median(), 0.1)
        self.assertEqual(calc.quantile(df, "values", 0.5), 0.2)
        self.assertEqual(calc.median(df, "values"), 0.2)

    def test_quantile_split(self):
        # The cumulative weight hits 0.5 exactly on a row, so the result is
        # the mean of that row's value and the next one.
        df = pd.DataFrame({
            "values": [0, 1, 2, 3],
            "weights": [1, 1, 1, 1],
        })
        self.assertEqual(calc.quantile(df, "values", 0.5), 1.5)

    def test_bad_quantile(self):
        df = pd.DataFrame({
            "values": [0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2],
            "weights": [0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2],
        })
        # q outside [0, 1] is rejected with ValueError.
        with self.assertRaises(ValueError):
            calc.quantile(df, "values", -1)

    def test_std(self):
        # Example via http://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weightsd.pdf
        df = pd.DataFrame({
            "values": [2, 3, 5, 7, 11, 13, 17, 19, 23],
            "weights": [1, 1, 0, 0, 4, 1, 2, 1, 0],
        })
        # The reference value is quoted to two decimal places.
        self.assertAlmostEqual(calc.std(df, "values"), 5.82, places=2)

    def test_distribution(self):
        dist = calc.distribution(pd.DataFrame({
            "values": ["a", "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        }), "values")
        self.assertEqual(dist["a"], 0.375)
        self.assertEqual(dist["b"], 0.375)
        self.assertEqual(dist["c"], 0.250)

    def test_count(self):
        count = calc.count(pd.DataFrame({
            "values": ["a", "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        }))
        self.assertEqual(count, 8)

    def test_sum(self):
        _sum = calc.sum(pd.DataFrame({
            "values": [1, 2, 3, 4, 5],
            "weights": [3, 2, 0, 1, 2],
        }), "values")
        self.assertEqual(_sum, 21)

    def test_grouped(self):
        dist = calc.distribution(pd.DataFrame({
            "group": ["x", "x", "x", "x", "x"],
            "values": ["a", "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        }).groupby("group"), "values")
        self.assertEqual(dist.loc["x"]["a"], 0.375)
        self.assertEqual(dist.loc["x"]["b"], 0.375)
        self.assertEqual(dist.loc["x"]["c"], 0.250)

    def test_multi_grouped(self):
        dist = calc.distribution(pd.DataFrame({
            "group_a": ["x", "x", "x", "x", "x"],
            "group_b": ["x", "x", "x", "x", "x"],
            "values": ["a", "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        }).groupby(["group_a", "group_b"]), "values")
        self.assertEqual(dist.loc[("x", "x")]["a"], 0.375)

    def test_multi_grouped_two(self):
        dist = calc.distribution(pd.DataFrame({
            "group_a": ["x", "x", "x", "y", "y"],
            "group_b": ["x", "x", "x", "y", "y"],
            "values": ["a", "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        }).groupby(["group_a", "group_b"]), "values")
        self.assertEqual(dist.loc[("x", "x")]["a"], 0.6)
        self.assertEqual(dist.loc[("x", "x")]["b"], 0.4)
        # "c" never occurs in group (x, x); its share is reported as 0.
        self.assertEqual(dist.loc[("x", "x")]["c"], 0)

    def test_null_values(self):
        df = pd.DataFrame({
            "values": [None, "b", "b", "b", "c"],
            "weights": [3, 2, 0, 1, 2],
        })
        # Null entries in the value column are rejected with ValueError.
        with self.assertRaises(ValueError):
            calc.distribution(df, "values")
# Run the whole suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
118 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
# setup.py requires pandas>=2.0, which cannot be installed on the Python
# 2.7 / 3.4-3.6 interpreters that used to be listed here. The list starts at
# 3.9 because core.py passes `include_groups` to groupby.apply, which
# (per the pandas release notes) needs a pandas release that supports
# Python 3.9 and newer only.
envlist = py39,py310,py311,py312
toxworkdir={env:TOX_WORK_DIR:.tox}

[testenv]
# nose is unmaintained and does not import on current Python 3 releases, so
# the suite runs through the standard library's unittest discovery instead.
# The explicit pattern is needed because the test module is named
# core_tests.py, not test_*.py.
commands=python -m unittest discover -s test -p "*_tests.py"
8 |
--------------------------------------------------------------------------------
/weightedcalcs/__init__.py:
--------------------------------------------------------------------------------
1 | from .__version__ import __version__
2 | from .core import Calculator
3 |
--------------------------------------------------------------------------------
/weightedcalcs/__version__.py:
--------------------------------------------------------------------------------
# Version components as a tuple of integers.
VERSION_TUPLE = (0, 1, 3)

# Dotted string form of VERSION_TUPLE.
__version__ = ".".join(str(part) for part in VERSION_TUPLE)
3 |
--------------------------------------------------------------------------------
/weightedcalcs/core.py:
--------------------------------------------------------------------------------
from __future__ import division

import functools

import pandas as pd
3 |
# Input types that pandas_deco passes through unchanged; anything else is
# converted with pd.DataFrame(...) first.
PANDAS_TYPES = (pd.DataFrame, pd.core.groupby.DataFrameGroupBy)
8 |
def pandas_deco(func):
    """Decorate a method so that its data argument is always a pandas object.

    Arguments that are already one of ``PANDAS_TYPES`` (a ``DataFrame`` or a
    ``DataFrameGroupBy``) are passed through untouched; anything else is
    converted with ``pd.DataFrame(...)`` before ``func`` is called.

    The wrapper keeps ``func``'s name and docstring via ``functools.wraps``.
    """
    @functools.wraps(func)
    def func_wrapper(self, thing, *args, **kwargs):
        if not isinstance(thing, PANDAS_TYPES):
            thing = pd.DataFrame(thing)
        return func(self, thing, *args, **kwargs)
    return func_wrapper
16 |
17 | def groupby_deco(func):
18 | def func_wrapper(self, thing, *args, **kwargs):
19 | if isinstance(thing, pd.core.groupby.DataFrameGroupBy):
20 | agg = thing.apply(lambda x: func(self, x, *args, **kwargs), include_groups=False)
21 | is_series = isinstance(agg, pd.core.series.Series)
22 | has_multiindex = isinstance(agg.index, pd.MultiIndex)
23 | if is_series and has_multiindex:
24 | return agg.unstack()
25 | else:
26 | return agg
27 | return func(self, thing, *args, **kwargs)
28 | return func_wrapper
29 |
def fillna_deco(val):
    """Build a decorator that fills missing entries in a method's result.

    The decorated method's return value must provide ``.fillna``; the
    wrapper returns ``result.fillna(val)``.

    The wrapper keeps the method's name and docstring via
    ``functools.wraps``.
    """
    def deco(func):
        @functools.wraps(func)
        def func_wrapper(self, thing, *args, **kwargs):
            result = func(self, thing, *args, **kwargs)
            return result.fillna(val)
        return func_wrapper
    return deco
36 |
def check_nulls(series):
    """Return ``series`` unchanged, or raise if it contains any null entry.

    Raises:
        ValueError: if at least one element of ``series`` is null.
    """
    has_missing = series.isnull().any()
    if has_missing:
        raise ValueError("value_var contains null values")
    return series
41 |
class Calculator(object):
    """Weighted statistics over tabular data, with weights read from a column.

    Every statistic takes ``thing`` as its first argument. Through
    ``pandas_deco`` and ``groupby_deco`` it may be a ``DataFrame``, a
    ``DataFrameGroupBy`` (the statistic is then computed per group), or
    anything ``pd.DataFrame(...)`` accepts. Methods that take ``value_var``
    raise ``ValueError`` (via ``check_nulls``) when that column has nulls.
    """

    def __init__(self, weight_var):
        """Remember ``weight_var``, the name of the column holding weights."""
        self.weight_var = weight_var

    @groupby_deco
    @pandas_deco
    def count(self, thing):
        """Return the sum of the weight column."""
        weight_column = thing[self.weight_var]
        return weight_column.sum()

    @groupby_deco
    @pandas_deco
    def sum(self, thing, value_var):
        """Return the sum of ``value * weight`` over all rows."""
        w = thing[self.weight_var]
        x = check_nulls(thing[value_var])
        weighted = x * w
        return weighted.sum()

    @groupby_deco
    @pandas_deco
    def mean(self, thing, value_var):
        """Return the weighted mean: sum(value * weight) / sum(weight)."""
        w = thing[self.weight_var]
        weight_total = w.sum()
        x = check_nulls(thing[value_var])
        weighted_total = (x * w).sum()
        return weighted_total / weight_total

    @groupby_deco
    @pandas_deco
    def std(self, thing, value_var):
        """Return the weighted standard deviation of ``value_var``.

        With ``n`` the number of strictly positive weights, the result is
        ``sqrt(sum(w * (x - mean)^2) / ((n - 1) * sum(w) / n))``.
        Returns ``pd.NA`` when fewer than two weights are positive; in that
        case the value column is not checked for nulls.
        """
        w = thing[self.weight_var]
        positive_count = (w > 0).sum()
        if positive_count < 2:
            return pd.NA
        x = check_nulls(thing[value_var])
        centre = self.mean(thing, value_var)
        squared_dev = (x - centre).pow(2)
        numerator = (w * squared_dev).sum()
        denominator = (positive_count - 1) * w.sum() / positive_count
        return (numerator / denominator) ** 0.5

    @groupby_deco
    @pandas_deco
    def quantile(self, thing, value_var, q):
        """Return the weighted ``q``-quantile of ``value_var``.

        Rows are sorted by value and each row gets its cumulative share of
        the total weight. The answer is the value of the first row whose
        share is at least ``q``; when that share equals ``q`` exactly, it is
        the mean of that row's value and the following row's value.

        Raises:
            ValueError: if ``q`` is below 0 or above 1.
        """
        if q < 0 or q > 1:
            raise ValueError("q must be between 0 and 1")
        w = thing[self.weight_var]
        x = check_nulls(thing[value_var])
        ordered = pd.DataFrame({"weights": w, "values": x}).sort_values("values")
        running = ordered["weights"].cumsum()
        ordered["cumul_prop"] = running / ordered["weights"].sum()
        reached = ordered[ordered["cumul_prop"] >= q]
        first_row = reached.iloc[0]
        # NOTE(review): exact float comparison; cumulative shares that are
        # not exactly representable may never compare equal to q.
        if first_row["cumul_prop"] == q:
            return reached.head(2)["values"].mean()
        return first_row["values"]

    @groupby_deco
    @pandas_deco
    def median(self, thing, value_var):
        """Return the weighted median, i.e. ``quantile`` with ``q`` of 0.5."""
        midpoint = 0.5
        return self.quantile(thing, value_var, midpoint)

    @fillna_deco(0)
    @groupby_deco
    @pandas_deco
    def distribution(self, thing, value_var):
        """Return each distinct value's share of the total weight.

        The result is indexed by the distinct values of ``value_var``;
        missing entries in the final result are filled with 0 by
        ``fillna_deco``.
        """
        w = thing[self.weight_var]
        weight_total = w.sum()
        check_nulls(thing[value_var])
        weight_by_value = thing.groupby(value_var)[self.weight_var].sum()
        return weight_by_value / weight_total
107 |
108 |
--------------------------------------------------------------------------------