├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── examples
    ├── data
    │   ├── acs-2015-pums-wy-simple.csv
    │   ├── acs-2015-pums-wy.csv
    │   ├── gss-extract.csv
    │   ├── ipsos-ssm-and-abortion-survey.csv
    │   └── simple.csv
    ├── notebooks
    │   └── example-usage.ipynb
    └── scripts
    │   └── process-acs.py
├── setup.py
├── test
    └── core_tests.py
├── tox.ini
└── weightedcalcs
    ├── __init__.py
    ├── __version__.py
    └── core.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | .ipynb_checkpoints
 2 | #### joe made this: http://goel.io/joe
 3 | 
 4 | #####=== Python ===#####
 5 | 
 6 | # Byte-compiled / optimized / DLL files
 7 | __pycache__/
 8 | *.py[cod]
 9 | 
10 | # C extensions
11 | *.so
12 | 
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | 
30 | # PyInstaller
31 | #  Usually these files are written by a python script from a template
32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/
60 | 
61 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |     - "2.7"
 4 |     - "3.4"
 5 |     - "3.5"
 6 |     - "3.6"
 7 | install:
 8 |     - pip install .
 9 |     - pip install nose
10 |     - pip install coveralls
11 | script: nosetests --with-coverage --cover-package weightedcalcs
12 | after_success: coveralls
13 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Change Log
 2 | 
 3 | All notable changes to this project will be documented in this file.
 4 | 
 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/).
 6 | 
 7 | ## [0.1.3] — 2024-11-10
 8 | ### Fixed
 9 | - Fix deprecation of `pandas.np`, h/t @simon-smart88. ([#9](https://github.com/jsvine/weightedcalcs/issues/9))
10 | - Fix deprecation of `DataFrameGroupBy.apply operated on the grouping columns`.
11 | 
12 | ### Changed
13 | - Change minimum `pandas` version to `2.0`.
14 | 
15 | ## [0.1.2] — 2017-06-17
16 | ### Fixed
17 | - Fix incompatibility with pandas 0.20.1
18 | 
19 | ## [0.1.1] — 2017-04-08
20 | ### Added
21 | - MANIFEST.in
22 | 
23 | ## [0.1.0] — 2017-03-30
24 | ### Added
25 | - Support for Python 2.7
26 | - Support for non-pandas input
27 | - Full test coverage
28 | 
29 | ## [0.0.0] — 2016-12-23
30 | 
31 | Initial release
32 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016, Jeremy Singer-Vine
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt *.md *.rst
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Version](https://img.shields.io/pypi/v/weightedcalcs.svg)](https://pypi.python.org/pypi/weightedcalcs) [![Build status](https://travis-ci.org/jsvine/weightedcalcs.png)](https://travis-ci.org/jsvine/weightedcalcs) [![Code coverage](https://img.shields.io/coveralls/jsvine/weightedcalcs.svg)](https://coveralls.io/github/jsvine/weightedcalcs) [![Supported Python versions](https://img.shields.io/pypi/pyversions/weightedcalcs.svg)](https://pypi.python.org/pypi/weightedcalcs)
 2 | 
 3 | # weightedcalcs
 4 | 
 5 | `weightedcalcs` is a `pandas`-based Python library for calculating weighted means, medians, standard deviations, and more.
 6 | 
 7 | ## Features
 8 | 
 9 | - Plays well with `pandas`.
10 | - Support for weighted means, medians, quantiles, standard deviations, and distributions.
11 | - Support for grouped calculations, using `DataFrameGroupBy` objects.
12 | - Raises an error when your data contains null values.
13 | - Full test coverage.
14 | 
15 | ## Installation
16 | 
17 | ```sh
18 | pip install weightedcalcs
19 | ```
20 | 
21 | ## Usage
22 | 
23 | ### Getting started
24 | 
25 | Every weighted calculation in `weightedcalcs` begins with an instance of the `weightedcalcs.Calculator` class. `Calculator` takes one argument: the name of your weighting variable. So if you're analyzing a survey where the weighting variable is called `"resp_weight"`, you'd do this:
26 | 
27 | ```python
28 | import weightedcalcs as wc
29 | calc = wc.Calculator("resp_weight")
30 | ```
31 | 
32 | ### Types of calculations
33 | 
34 | Currently, `weightedcalcs.Calculator` supports the following calculations:
35 | 
36 | - `calc.mean(my_data, value_var)`: The weighted arithmetic average of `value_var`.
37 | - `calc.quantile(my_data, value_var, q)`: The weighted quantile of `value_var`, where `q` is between 0 and 1.
38 | - `calc.median(my_data, value_var)`: The weighted median of `value_var`, equivalent to `.quantile(...)` where `q=0.5`.
39 | - `calc.std(my_data, value_var)`: The weighted standard deviation of `value_var`.
40 | - `calc.distribution(my_data, value_var)`: The weighted proportions of `value_var`, interpreting `value_var` as categories.
41 | - `calc.count(my_data)`: The weighted count of all observations, i.e., the total weight.
42 | - `calc.sum(my_data, value_var)`: The weighted sum of `value_var`.
43 | 
44 | The `my_data` parameter above should be one of the following:
45 | 
46 | - A `pandas` `DataFrame` object
47 | - A `pandas` `DataFrameGroupBy` object (as returned by `DataFrame.groupby(...)`)
48 | - A plain Python dictionary where the keys are column names and the values are equal-length lists.
49 | 
50 | ### Basic example
51 | 
52 | Below is a basic example of using `weightedcalcs` to find what percentage of Wyoming residents are married, divorced, et cetera:
53 | 
54 | ```python
55 | import pandas as pd
56 | import weightedcalcs as wc
57 | 
58 | # Load the 2015 American Community Survey person-level responses for Wyoming
59 | responses = pd.read_csv("examples/data/acs-2015-pums-wy-simple.csv")
60 | 
61 | # `PWGTP` is the weighting variable used in the ACS's person-level data
62 | calc = wc.Calculator("PWGTP")
63 | 
64 | # Get the distribution of marriage-status responses
65 | calc.distribution(responses, "marriage_status").round(3).sort_values(ascending=False)
66 | 
67 | # -- Output --
68 | # marriage_status
69 | # Married                                0.425
70 | # Never married or under 15 years old    0.421
71 | # Divorced                               0.097
72 | # Widowed                                0.046
73 | # Separated                              0.012
74 | # Name: PWGTP, dtype: float64
75 | ```
76 | 
77 | ### More examples
78 | 
79 | [See this notebook for examples of other calculations, including grouped calculations.](examples/notebooks/example-usage.ipynb)
80 | 
81 | [Max Ghenis](https://github.com/MaxGhenis) has created [a version of the example notebook that can be run directly in your browser](https://colab.research.google.com/gist/MaxGhenis/4c96163eacebc1005419c9533a568c7e/weightedcalcs-example-usage-scf.ipynb), via Google Colab. 
82 | 
83 | ### Weightedcalcs in the wild
84 | 
85 | - "[Procesando los microdatos de la Encuesta Permanente de Hogares](http://blog.jazzido.com/2017/01/09/procesando-microdatos-eph)," by Manuel Aristarán
86 | - [BuzzFeedNews/2017-01-media-platform-and-news-trust-survey](https://github.com/BuzzFeedNews/2017-01-media-platform-and-news-trust-survey/blob/master/notebooks/platform-trust-additional-analysis.ipynb)
87 | - [BuzzFeedNews/2016-12-transgender-rights-survey](https://github.com/BuzzFeedNews/2016-12-transgender-rights-survey/blob/master/notebooks/additional-analysis.ipynb)
88 | 
89 | ## Other Python weighted-calculation libraries
90 | 
91 | - [`tinybike/weightedstats`](https://github.com/tinybike/weightedstats)
92 | - [`nudomarinero/wquantiles`](https://github.com/nudomarinero/wquantiles/)
93 | 
94 | 


--------------------------------------------------------------------------------
/examples/data/ipsos-ssm-and-abortion-survey.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jsvine/weightedcalcs/cbd2818e6f7ad82c29714f842228bfd4f65c008f/examples/data/ipsos-ssm-and-abortion-survey.csv


--------------------------------------------------------------------------------
/examples/data/simple.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jsvine/weightedcalcs/cbd2818e6f7ad82c29714f842228bfd4f65c008f/examples/data/simple.csv


--------------------------------------------------------------------------------
/examples/notebooks/example-usage.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Example usage for `weightedcalcs`\n",
  8 |     "\n",
  9 |     "The example below uses `weightedcalcs` to analyze a slice of the [American Community Survey's 2015 data](https://www.census.gov/programs-surveys/acs/technical-documentation/pums/documentation.html) for Wyoming."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {
 16 |     "collapsed": true
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import weightedcalcs as wc\n",
 21 |     "import pandas as pd"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "## Load the ACS data into a `pandas.DataFrame`"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 2,
 34 |    "metadata": {
 35 |     "collapsed": true
 36 |    },
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "responses = pd.read_csv(\"../data/acs-2015-pums-wy-simple.csv\")"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 3,
 45 |    "metadata": {},
 46 |    "outputs": [
 47 |     {
 48 |      "data": {
 49 |       "text/html": [
 50 |        "<div>\n",
 51 |        "<style>\n",
 52 |        "    .dataframe thead tr:only-child th {\n",
 53 |        "        text-align: right;\n",
 54 |        "    }\n",
 55 |        "\n",
 56 |        "    .dataframe thead th {\n",
 57 |        "        text-align: left;\n",
 58 |        "    }\n",
 59 |        "\n",
 60 |        "    .dataframe tbody tr th {\n",
 61 |        "        vertical-align: top;\n",
 62 |        "    }\n",
 63 |        "</style>\n",
 64 |        "<table border=\"1\" class=\"dataframe\">\n",
 65 |        "  <thead>\n",
 66 |        "    <tr style=\"text-align: right;\">\n",
 67 |        "      <th></th>\n",
 68 |        "      <th>SERIALNO</th>\n",
 69 |        "      <th>PWGTP</th>\n",
 70 |        "      <th>age</th>\n",
 71 |        "      <th>gender</th>\n",
 72 |        "      <th>marriage_status</th>\n",
 73 |        "      <th>income</th>\n",
 74 |        "    </tr>\n",
 75 |        "  </thead>\n",
 76 |        "  <tbody>\n",
 77 |        "    <tr>\n",
 78 |        "      <th>0</th>\n",
 79 |        "      <td>1990</td>\n",
 80 |        "      <td>148</td>\n",
 81 |        "      <td>67</td>\n",
 82 |        "      <td>Male</td>\n",
 83 |        "      <td>Never married or under 15 years old</td>\n",
 84 |        "      <td>27000.0</td>\n",
 85 |        "    </tr>\n",
 86 |        "    <tr>\n",
 87 |        "      <th>1</th>\n",
 88 |        "      <td>2253</td>\n",
 89 |        "      <td>371</td>\n",
 90 |        "      <td>93</td>\n",
 91 |        "      <td>Female</td>\n",
 92 |        "      <td>Widowed</td>\n",
 93 |        "      <td>0.0</td>\n",
 94 |        "    </tr>\n",
 95 |        "    <tr>\n",
 96 |        "      <th>2</th>\n",
 97 |        "      <td>2861</td>\n",
 98 |        "      <td>288</td>\n",
 99 |        "      <td>46</td>\n",
100 |        "      <td>Female</td>\n",
101 |        "      <td>Divorced</td>\n",
102 |        "      <td>44000.0</td>\n",
103 |        "    </tr>\n",
104 |        "    <tr>\n",
105 |        "      <th>3</th>\n",
106 |        "      <td>4537</td>\n",
107 |        "      <td>58</td>\n",
108 |        "      <td>59</td>\n",
109 |        "      <td>Male</td>\n",
110 |        "      <td>Divorced</td>\n",
111 |        "      <td>35000.0</td>\n",
112 |        "    </tr>\n",
113 |        "    <tr>\n",
114 |        "      <th>4</th>\n",
115 |        "      <td>4797</td>\n",
116 |        "      <td>130</td>\n",
117 |        "      <td>70</td>\n",
118 |        "      <td>Male</td>\n",
119 |        "      <td>Married</td>\n",
120 |        "      <td>0.0</td>\n",
121 |        "    </tr>\n",
122 |        "  </tbody>\n",
123 |        "</table>\n",
124 |        "</div>"
125 |       ],
126 |       "text/plain": [
127 |        "   SERIALNO  PWGTP  age  gender                      marriage_status   income\n",
128 |        "0      1990    148   67    Male  Never married or under 15 years old  27000.0\n",
129 |        "1      2253    371   93  Female                              Widowed      0.0\n",
130 |        "2      2861    288   46  Female                             Divorced  44000.0\n",
131 |        "3      4537     58   59    Male                             Divorced  35000.0\n",
132 |        "4      4797    130   70    Male                              Married      0.0"
133 |       ]
134 |      },
135 |      "execution_count": 3,
136 |      "metadata": {},
137 |      "output_type": "execute_result"
138 |     }
139 |    ],
140 |    "source": [
141 |     "responses.head()"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "In addition to the full list of responses, let's create a subset including only adult respondents, since we'll be focusing on income later."
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 4,
154 |    "metadata": {
155 |     "collapsed": true
156 |    },
157 |    "outputs": [],
158 |    "source": [
159 |     "adults = responses[responses[\"age\"] >= 18]"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": 5,
165 |    "metadata": {},
166 |    "outputs": [
167 |     {
168 |      "data": {
169 |       "text/html": [
170 |        "<div>\n",
171 |        "<style>\n",
172 |        "    .dataframe thead tr:only-child th {\n",
173 |        "        text-align: right;\n",
174 |        "    }\n",
175 |        "\n",
176 |        "    .dataframe thead th {\n",
177 |        "        text-align: left;\n",
178 |        "    }\n",
179 |        "\n",
180 |        "    .dataframe tbody tr th {\n",
181 |        "        vertical-align: top;\n",
182 |        "    }\n",
183 |        "</style>\n",
184 |        "<table border=\"1\" class=\"dataframe\">\n",
185 |        "  <thead>\n",
186 |        "    <tr style=\"text-align: right;\">\n",
187 |        "      <th></th>\n",
188 |        "      <th>SERIALNO</th>\n",
189 |        "      <th>PWGTP</th>\n",
190 |        "      <th>age</th>\n",
191 |        "      <th>gender</th>\n",
192 |        "      <th>marriage_status</th>\n",
193 |        "      <th>income</th>\n",
194 |        "    </tr>\n",
195 |        "  </thead>\n",
196 |        "  <tbody>\n",
197 |        "    <tr>\n",
198 |        "      <th>0</th>\n",
199 |        "      <td>1990</td>\n",
200 |        "      <td>148</td>\n",
201 |        "      <td>67</td>\n",
202 |        "      <td>Male</td>\n",
203 |        "      <td>Never married or under 15 years old</td>\n",
204 |        "      <td>27000.0</td>\n",
205 |        "    </tr>\n",
206 |        "    <tr>\n",
207 |        "      <th>1</th>\n",
208 |        "      <td>2253</td>\n",
209 |        "      <td>371</td>\n",
210 |        "      <td>93</td>\n",
211 |        "      <td>Female</td>\n",
212 |        "      <td>Widowed</td>\n",
213 |        "      <td>0.0</td>\n",
214 |        "    </tr>\n",
215 |        "    <tr>\n",
216 |        "      <th>2</th>\n",
217 |        "      <td>2861</td>\n",
218 |        "      <td>288</td>\n",
219 |        "      <td>46</td>\n",
220 |        "      <td>Female</td>\n",
221 |        "      <td>Divorced</td>\n",
222 |        "      <td>44000.0</td>\n",
223 |        "    </tr>\n",
224 |        "    <tr>\n",
225 |        "      <th>3</th>\n",
226 |        "      <td>4537</td>\n",
227 |        "      <td>58</td>\n",
228 |        "      <td>59</td>\n",
229 |        "      <td>Male</td>\n",
230 |        "      <td>Divorced</td>\n",
231 |        "      <td>35000.0</td>\n",
232 |        "    </tr>\n",
233 |        "    <tr>\n",
234 |        "      <th>4</th>\n",
235 |        "      <td>4797</td>\n",
236 |        "      <td>130</td>\n",
237 |        "      <td>70</td>\n",
238 |        "      <td>Male</td>\n",
239 |        "      <td>Married</td>\n",
240 |        "      <td>0.0</td>\n",
241 |        "    </tr>\n",
242 |        "  </tbody>\n",
243 |        "</table>\n",
244 |        "</div>"
245 |       ],
246 |       "text/plain": [
247 |        "   SERIALNO  PWGTP  age  gender                      marriage_status   income\n",
248 |        "0      1990    148   67    Male  Never married or under 15 years old  27000.0\n",
249 |        "1      2253    371   93  Female                              Widowed      0.0\n",
250 |        "2      2861    288   46  Female                             Divorced  44000.0\n",
251 |        "3      4537     58   59    Male                             Divorced  35000.0\n",
252 |        "4      4797    130   70    Male                              Married      0.0"
253 |       ]
254 |      },
255 |      "execution_count": 5,
256 |      "metadata": {},
257 |      "output_type": "execute_result"
258 |     }
259 |    ],
260 |    "source": [
261 |     "adults.head()"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "markdown",
266 |    "metadata": {},
267 |    "source": [
268 |     "## Create an instance of `weightedcalcs.Calculator`\n",
269 |     "\n",
270 |     "The ACS's `PWGTP` variable is each respondent's Census-assigned survey weight. All our weighted calculations will use this variable."
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": 6,
276 |    "metadata": {
277 |     "collapsed": true
278 |    },
279 |    "outputs": [],
280 |    "source": [
281 |     "calc = wc.Calculator(\"PWGTP\")"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "markdown",
286 |    "metadata": {},
287 |    "source": [
288 |     "## Basic weighted calculations"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "markdown",
293 |    "metadata": {},
294 |    "source": [
295 |     "### Weighted mean income"
296 |    ]
297 |   },
298 |   {
299 |    "cell_type": "code",
300 |    "execution_count": 7,
301 |    "metadata": {},
302 |    "outputs": [
303 |     {
304 |      "data": {
305 |       "text/plain": [
306 |        "30709.0"
307 |       ]
308 |      },
309 |      "execution_count": 7,
310 |      "metadata": {},
311 |      "output_type": "execute_result"
312 |     }
313 |    ],
314 |    "source": [
315 |     "calc.mean(adults, \"income\").round()"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "markdown",
320 |    "metadata": {},
321 |    "source": [
322 |     "### Weighted standard deviation of income"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": 8,
328 |    "metadata": {},
329 |    "outputs": [
330 |     {
331 |      "data": {
332 |       "text/plain": [
333 |        "46093.0"
334 |       ]
335 |      },
336 |      "execution_count": 8,
337 |      "metadata": {},
338 |      "output_type": "execute_result"
339 |     }
340 |    ],
341 |    "source": [
342 |     "calc.std(adults, \"income\").round()"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "markdown",
347 |    "metadata": {},
348 |    "source": [
349 |     "### Weighted median income"
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": 9,
355 |    "metadata": {},
356 |    "outputs": [
357 |     {
358 |      "data": {
359 |       "text/plain": [
360 |        "18000.0"
361 |       ]
362 |      },
363 |      "execution_count": 9,
364 |      "metadata": {},
365 |      "output_type": "execute_result"
366 |     }
367 |    ],
368 |    "source": [
369 |     "calc.median(adults, \"income\")"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "markdown",
374 |    "metadata": {},
375 |    "source": [
376 |     "### Weighted 75th percentile of income"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "code",
381 |    "execution_count": 10,
382 |    "metadata": {},
383 |    "outputs": [
384 |     {
385 |      "data": {
386 |       "text/plain": [
387 |        "45000.0"
388 |       ]
389 |      },
390 |      "execution_count": 10,
391 |      "metadata": {},
392 |      "output_type": "execute_result"
393 |     }
394 |    ],
395 |    "source": [
396 |     "calc.quantile(adults, \"income\", 0.75)"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "markdown",
401 |    "metadata": {},
402 |    "source": [
403 |     "### Weighted distribution of marriage statuses"
404 |    ]
405 |   },
406 |   {
407 |    "cell_type": "markdown",
408 |    "metadata": {},
409 |    "source": [
410 |     "~43% of Wyoming residents are married:"
411 |    ]
412 |   },
413 |   {
414 |    "cell_type": "code",
415 |    "execution_count": 11,
416 |    "metadata": {},
417 |    "outputs": [
418 |     {
419 |      "data": {
420 |       "text/plain": [
421 |        "marriage_status\n",
422 |        "Married                                0.425\n",
423 |        "Never married or under 15 years old    0.421\n",
424 |        "Divorced                               0.097\n",
425 |        "Widowed                                0.046\n",
426 |        "Separated                              0.012\n",
427 |        "Name: PWGTP, dtype: float64"
428 |       ]
429 |      },
430 |      "execution_count": 11,
431 |      "metadata": {},
432 |      "output_type": "execute_result"
433 |     }
434 |    ],
435 |    "source": [
436 |     "calc.distribution(responses, \"marriage_status\").round(3).sort_values(ascending=False)"
437 |    ]
438 |   },
439 |   {
440 |    "cell_type": "markdown",
441 |    "metadata": {},
442 |    "source": [
443 |     "~56% of *adult* Wyoming residents are married:"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "code",
448 |    "execution_count": 12,
449 |    "metadata": {},
450 |    "outputs": [
451 |     {
452 |      "data": {
453 |       "text/plain": [
454 |        "marriage_status\n",
455 |        "Married                                0.557\n",
456 |        "Never married or under 15 years old    0.240\n",
457 |        "Divorced                               0.127\n",
458 |        "Widowed                                0.060\n",
459 |        "Separated                              0.016\n",
460 |        "Name: PWGTP, dtype: float64"
461 |       ]
462 |      },
463 |      "execution_count": 12,
464 |      "metadata": {},
465 |      "output_type": "execute_result"
466 |     }
467 |    ],
468 |    "source": [
469 |     "calc.distribution(adults, \"marriage_status\").round(3).sort_values(ascending=False)"
470 |    ]
471 |   },
472 |   {
473 |    "cell_type": "markdown",
474 |    "metadata": {},
475 |    "source": [
476 |     "## Grouped weighted calculations\n",
477 |     "\n",
478 |     "Below, we perform calculations similar to those above, but now take advantage of the fact that `weightedcalcs` can handle `DataFrameGroupBy` objects. In the examples below, we group by the ACS's marriage status categories and gender."
479 |    ]
480 |   },
481 |   {
482 |    "cell_type": "code",
483 |    "execution_count": 13,
484 |    "metadata": {
485 |     "collapsed": true
486 |    },
487 |    "outputs": [],
488 |    "source": [
489 |     "grp_marriage_sex = adults.groupby([\"marriage_status\", \"gender\"])"
490 |    ]
491 |   },
492 |   {
493 |    "cell_type": "markdown",
494 |    "metadata": {},
495 |    "source": [
496 |     "For reference, here's how many responses fall into each category:"
497 |    ]
498 |   },
499 |   {
500 |    "cell_type": "code",
501 |    "execution_count": 14,
502 |    "metadata": {
503 |     "scrolled": true
504 |    },
505 |    "outputs": [
506 |     {
507 |      "data": {
508 |       "text/html": [
509 |        "<div>\n",
510 |        "<style>\n",
511 |        "    .dataframe thead tr:only-child th {\n",
512 |        "        text-align: right;\n",
513 |        "    }\n",
514 |        "\n",
515 |        "    .dataframe thead th {\n",
516 |        "        text-align: left;\n",
517 |        "    }\n",
518 |        "\n",
519 |        "    .dataframe tbody tr th {\n",
520 |        "        vertical-align: top;\n",
521 |        "    }\n",
522 |        "</style>\n",
523 |        "<table border=\"1\" class=\"dataframe\">\n",
524 |        "  <thead>\n",
525 |        "    <tr style=\"text-align: right;\">\n",
526 |        "      <th>gender</th>\n",
527 |        "      <th>Female</th>\n",
528 |        "      <th>Male</th>\n",
529 |        "    </tr>\n",
530 |        "    <tr>\n",
531 |        "      <th>marriage_status</th>\n",
532 |        "      <th></th>\n",
533 |        "      <th></th>\n",
534 |        "    </tr>\n",
535 |        "  </thead>\n",
536 |        "  <tbody>\n",
537 |        "    <tr>\n",
538 |        "      <th>Divorced</th>\n",
539 |        "      <td>292</td>\n",
540 |        "      <td>279</td>\n",
541 |        "    </tr>\n",
542 |        "    <tr>\n",
543 |        "      <th>Married</th>\n",
544 |        "      <td>1337</td>\n",
545 |        "      <td>1337</td>\n",
546 |        "    </tr>\n",
547 |        "    <tr>\n",
548 |        "      <th>Never married or under 15 years old</th>\n",
549 |        "      <td>382</td>\n",
550 |        "      <td>535</td>\n",
551 |        "    </tr>\n",
552 |        "    <tr>\n",
553 |        "      <th>Separated</th>\n",
554 |        "      <td>25</td>\n",
555 |        "      <td>18</td>\n",
556 |        "    </tr>\n",
557 |        "    <tr>\n",
558 |        "      <th>Widowed</th>\n",
559 |        "      <td>232</td>\n",
560 |        "      <td>75</td>\n",
561 |        "    </tr>\n",
562 |        "  </tbody>\n",
563 |        "</table>\n",
564 |        "</div>"
565 |       ],
566 |       "text/plain": [
567 |        "gender                               Female  Male\n",
568 |        "marriage_status                                  \n",
569 |        "Divorced                                292   279\n",
570 |        "Married                                1337  1337\n",
571 |        "Never married or under 15 years old     382   535\n",
572 |        "Separated                                25    18\n",
573 |        "Widowed                                 232    75"
574 |       ]
575 |      },
576 |      "execution_count": 14,
577 |      "metadata": {},
578 |      "output_type": "execute_result"
579 |     }
580 |    ],
581 |    "source": [
582 |     "grp_marriage_sex.size().unstack()"
583 |    ]
584 |   },
585 |   {
586 |    "cell_type": "markdown",
587 |    "metadata": {},
588 |    "source": [
589 |     "### Weighted mean income\n"
590 |    ]
591 |   },
592 |   {
593 |    "cell_type": "code",
594 |    "execution_count": 15,
595 |    "metadata": {},
596 |    "outputs": [
597 |     {
598 |      "data": {
599 |       "text/html": [
600 |        "<div>\n",
601 |        "<style>\n",
602 |        "    .dataframe thead tr:only-child th {\n",
603 |        "        text-align: right;\n",
604 |        "    }\n",
605 |        "\n",
606 |        "    .dataframe thead th {\n",
607 |        "        text-align: left;\n",
608 |        "    }\n",
609 |        "\n",
610 |        "    .dataframe tbody tr th {\n",
611 |        "        vertical-align: top;\n",
612 |        "    }\n",
613 |        "</style>\n",
614 |        "<table border=\"1\" class=\"dataframe\">\n",
615 |        "  <thead>\n",
616 |        "    <tr style=\"text-align: right;\">\n",
617 |        "      <th>gender</th>\n",
618 |        "      <th>Female</th>\n",
619 |        "      <th>Male</th>\n",
620 |        "    </tr>\n",
621 |        "    <tr>\n",
622 |        "      <th>marriage_status</th>\n",
623 |        "      <th></th>\n",
624 |        "      <th></th>\n",
625 |        "    </tr>\n",
626 |        "  </thead>\n",
627 |        "  <tbody>\n",
628 |        "    <tr>\n",
629 |        "      <th>Divorced</th>\n",
630 |        "      <td>27803</td>\n",
631 |        "      <td>38884</td>\n",
632 |        "    </tr>\n",
633 |        "    <tr>\n",
634 |        "      <th>Married</th>\n",
635 |        "      <td>22592</td>\n",
636 |        "      <td>50263</td>\n",
637 |        "    </tr>\n",
638 |        "    <tr>\n",
639 |        "      <th>Never married or under 15 years old</th>\n",
640 |        "      <td>15625</td>\n",
641 |        "      <td>27531</td>\n",
642 |        "    </tr>\n",
643 |        "    <tr>\n",
644 |        "      <th>Separated</th>\n",
645 |        "      <td>15443</td>\n",
646 |        "      <td>18553</td>\n",
647 |        "    </tr>\n",
648 |        "    <tr>\n",
649 |        "      <th>Widowed</th>\n",
650 |        "      <td>5890</td>\n",
651 |        "      <td>15421</td>\n",
652 |        "    </tr>\n",
653 |        "  </tbody>\n",
654 |        "</table>\n",
655 |        "</div>"
656 |       ],
657 |       "text/plain": [
658 |        "gender                               Female   Male\n",
659 |        "marriage_status                                   \n",
660 |        "Divorced                              27803  38884\n",
661 |        "Married                               22592  50263\n",
662 |        "Never married or under 15 years old   15625  27531\n",
663 |        "Separated                             15443  18553\n",
664 |        "Widowed                                5890  15421"
665 |       ]
666 |      },
667 |      "execution_count": 15,
668 |      "metadata": {},
669 |      "output_type": "execute_result"
670 |     }
671 |    ],
672 |    "source": [
673 |     "calc.mean(grp_marriage_sex, \"income\").round().astype(int)"
674 |    ]
675 |   },
676 |   {
677 |    "cell_type": "markdown",
678 |    "metadata": {},
679 |    "source": [
680 |     "### Weighted standard deviation of income"
681 |    ]
682 |   },
683 |   {
684 |    "cell_type": "code",
685 |    "execution_count": 16,
686 |    "metadata": {},
687 |    "outputs": [
688 |     {
689 |      "data": {
690 |       "text/html": [
691 |        "<div>\n",
692 |        "<style>\n",
693 |        "    .dataframe thead tr:only-child th {\n",
694 |        "        text-align: right;\n",
695 |        "    }\n",
696 |        "\n",
697 |        "    .dataframe thead th {\n",
698 |        "        text-align: left;\n",
699 |        "    }\n",
700 |        "\n",
701 |        "    .dataframe tbody tr th {\n",
702 |        "        vertical-align: top;\n",
703 |        "    }\n",
704 |        "</style>\n",
705 |        "<table border=\"1\" class=\"dataframe\">\n",
706 |        "  <thead>\n",
707 |        "    <tr style=\"text-align: right;\">\n",
708 |        "      <th>gender</th>\n",
709 |        "      <th>Female</th>\n",
710 |        "      <th>Male</th>\n",
711 |        "    </tr>\n",
712 |        "    <tr>\n",
713 |        "      <th>marriage_status</th>\n",
714 |        "      <th></th>\n",
715 |        "      <th></th>\n",
716 |        "    </tr>\n",
717 |        "  </thead>\n",
718 |        "  <tbody>\n",
719 |        "    <tr>\n",
720 |        "      <th>Divorced</th>\n",
721 |        "      <td>40039.0</td>\n",
722 |        "      <td>40916.0</td>\n",
723 |        "    </tr>\n",
724 |        "    <tr>\n",
725 |        "      <th>Married</th>\n",
726 |        "      <td>33602.0</td>\n",
727 |        "      <td>63959.0</td>\n",
728 |        "    </tr>\n",
729 |        "    <tr>\n",
730 |        "      <th>Never married or under 15 years old</th>\n",
731 |        "      <td>19885.0</td>\n",
732 |        "      <td>34576.0</td>\n",
733 |        "    </tr>\n",
734 |        "    <tr>\n",
735 |        "      <th>Separated</th>\n",
736 |        "      <td>14822.0</td>\n",
737 |        "      <td>25867.0</td>\n",
738 |        "    </tr>\n",
739 |        "    <tr>\n",
740 |        "      <th>Widowed</th>\n",
741 |        "      <td>17113.0</td>\n",
742 |        "      <td>55463.0</td>\n",
743 |        "    </tr>\n",
744 |        "  </tbody>\n",
745 |        "</table>\n",
746 |        "</div>"
747 |       ],
748 |       "text/plain": [
749 |        "gender                                Female     Male\n",
750 |        "marriage_status                                      \n",
751 |        "Divorced                             40039.0  40916.0\n",
752 |        "Married                              33602.0  63959.0\n",
753 |        "Never married or under 15 years old  19885.0  34576.0\n",
754 |        "Separated                            14822.0  25867.0\n",
755 |        "Widowed                              17113.0  55463.0"
756 |       ]
757 |      },
758 |      "execution_count": 16,
759 |      "metadata": {},
760 |      "output_type": "execute_result"
761 |     }
762 |    ],
763 |    "source": [
764 |     "calc.std(grp_marriage_sex, \"income\").round()"
765 |    ]
766 |   },
767 |   {
768 |    "cell_type": "markdown",
769 |    "metadata": {},
770 |    "source": [
771 |     "### Weighted median income"
772 |    ]
773 |   },
774 |   {
775 |    "cell_type": "code",
776 |    "execution_count": 17,
777 |    "metadata": {},
778 |    "outputs": [
779 |     {
780 |      "data": {
781 |       "text/html": [
782 |        "<div>\n",
783 |        "<style>\n",
784 |        "    .dataframe thead tr:only-child th {\n",
785 |        "        text-align: right;\n",
786 |        "    }\n",
787 |        "\n",
788 |        "    .dataframe thead th {\n",
789 |        "        text-align: left;\n",
790 |        "    }\n",
791 |        "\n",
792 |        "    .dataframe tbody tr th {\n",
793 |        "        vertical-align: top;\n",
794 |        "    }\n",
795 |        "</style>\n",
796 |        "<table border=\"1\" class=\"dataframe\">\n",
797 |        "  <thead>\n",
798 |        "    <tr style=\"text-align: right;\">\n",
799 |        "      <th>gender</th>\n",
800 |        "      <th>Female</th>\n",
801 |        "      <th>Male</th>\n",
802 |        "    </tr>\n",
803 |        "    <tr>\n",
804 |        "      <th>marriage_status</th>\n",
805 |        "      <th></th>\n",
806 |        "      <th></th>\n",
807 |        "    </tr>\n",
808 |        "  </thead>\n",
809 |        "  <tbody>\n",
810 |        "    <tr>\n",
811 |        "      <th>Divorced</th>\n",
812 |        "      <td>21000.0</td>\n",
813 |        "      <td>29000.0</td>\n",
814 |        "    </tr>\n",
815 |        "    <tr>\n",
816 |        "      <th>Married</th>\n",
817 |        "      <td>11000.0</td>\n",
818 |        "      <td>40200.0</td>\n",
819 |        "    </tr>\n",
820 |        "    <tr>\n",
821 |        "      <th>Never married or under 15 years old</th>\n",
822 |        "      <td>8300.0</td>\n",
823 |        "      <td>16000.0</td>\n",
824 |        "    </tr>\n",
825 |        "    <tr>\n",
826 |        "      <th>Separated</th>\n",
827 |        "      <td>10000.0</td>\n",
828 |        "      <td>0.0</td>\n",
829 |        "    </tr>\n",
830 |        "    <tr>\n",
831 |        "      <th>Widowed</th>\n",
832 |        "      <td>0.0</td>\n",
833 |        "      <td>0.0</td>\n",
834 |        "    </tr>\n",
835 |        "  </tbody>\n",
836 |        "</table>\n",
837 |        "</div>"
838 |       ],
839 |       "text/plain": [
840 |        "gender                                Female     Male\n",
841 |        "marriage_status                                      \n",
842 |        "Divorced                             21000.0  29000.0\n",
843 |        "Married                              11000.0  40200.0\n",
844 |        "Never married or under 15 years old   8300.0  16000.0\n",
845 |        "Separated                            10000.0      0.0\n",
846 |        "Widowed                                  0.0      0.0"
847 |       ]
848 |      },
849 |      "execution_count": 17,
850 |      "metadata": {},
851 |      "output_type": "execute_result"
852 |     }
853 |    ],
854 |    "source": [
855 |     "calc.median(grp_marriage_sex, \"income\")"
856 |    ]
857 |   },
858 |   {
859 |    "cell_type": "markdown",
860 |    "metadata": {},
861 |    "source": [
862 |     "### Weighted 75th percentile of income"
863 |    ]
864 |   },
865 |   {
866 |    "cell_type": "code",
867 |    "execution_count": 18,
868 |    "metadata": {},
869 |    "outputs": [
870 |     {
871 |      "data": {
872 |       "text/html": [
873 |        "<div>\n",
874 |        "<style>\n",
875 |        "    .dataframe thead tr:only-child th {\n",
876 |        "        text-align: right;\n",
877 |        "    }\n",
878 |        "\n",
879 |        "    .dataframe thead th {\n",
880 |        "        text-align: left;\n",
881 |        "    }\n",
882 |        "\n",
883 |        "    .dataframe tbody tr th {\n",
884 |        "        vertical-align: top;\n",
885 |        "    }\n",
886 |        "</style>\n",
887 |        "<table border=\"1\" class=\"dataframe\">\n",
888 |        "  <thead>\n",
889 |        "    <tr style=\"text-align: right;\">\n",
890 |        "      <th>gender</th>\n",
891 |        "      <th>Female</th>\n",
892 |        "      <th>Male</th>\n",
893 |        "    </tr>\n",
894 |        "    <tr>\n",
895 |        "      <th>marriage_status</th>\n",
896 |        "      <th></th>\n",
897 |        "      <th></th>\n",
898 |        "    </tr>\n",
899 |        "  </thead>\n",
900 |        "  <tbody>\n",
901 |        "    <tr>\n",
902 |        "      <th>Divorced</th>\n",
903 |        "      <td>39000.0</td>\n",
904 |        "      <td>65000.0</td>\n",
905 |        "    </tr>\n",
906 |        "    <tr>\n",
907 |        "      <th>Married</th>\n",
908 |        "      <td>35000.0</td>\n",
909 |        "      <td>70000.0</td>\n",
910 |        "    </tr>\n",
911 |        "    <tr>\n",
912 |        "      <th>Never married or under 15 years old</th>\n",
913 |        "      <td>25000.0</td>\n",
914 |        "      <td>38000.0</td>\n",
915 |        "    </tr>\n",
916 |        "    <tr>\n",
917 |        "      <th>Separated</th>\n",
918 |        "      <td>32400.0</td>\n",
919 |        "      <td>30000.0</td>\n",
920 |        "    </tr>\n",
921 |        "    <tr>\n",
922 |        "      <th>Widowed</th>\n",
923 |        "      <td>0.0</td>\n",
924 |        "      <td>0.0</td>\n",
925 |        "    </tr>\n",
926 |        "  </tbody>\n",
927 |        "</table>\n",
928 |        "</div>"
929 |       ],
930 |       "text/plain": [
931 |        "gender                                Female     Male\n",
932 |        "marriage_status                                      \n",
933 |        "Divorced                             39000.0  65000.0\n",
934 |        "Married                              35000.0  70000.0\n",
935 |        "Never married or under 15 years old  25000.0  38000.0\n",
936 |        "Separated                            32400.0  30000.0\n",
937 |        "Widowed                                  0.0      0.0"
938 |       ]
939 |      },
940 |      "execution_count": 18,
941 |      "metadata": {},
942 |      "output_type": "execute_result"
943 |     }
944 |    ],
945 |    "source": [
946 |     "calc.quantile(grp_marriage_sex, \"income\", 0.75)"
947 |    ]
948 |   },
949 |   {
950 |    "cell_type": "markdown",
951 |    "metadata": {},
952 |    "source": [
953 |     "---\n",
954 |     "\n",
955 |     "---\n",
956 |     "\n",
957 |     "---"
958 |    ]
959 |   }
960 |  ],
961 |  "metadata": {
962 |   "kernelspec": {
963 |    "display_name": "Python 3",
964 |    "language": "python",
965 |    "name": "python3"
966 |   },
967 |   "language_info": {
968 |    "codemirror_mode": {
969 |     "name": "ipython",
970 |     "version": 3
971 |    },
972 |    "file_extension": ".py",
973 |    "mimetype": "text/x-python",
974 |    "name": "python",
975 |    "nbconvert_exporter": "python",
976 |    "pygments_lexer": "ipython3",
977 |    "version": "3.4.3"
978 |   }
979 |  },
980 |  "nbformat": 4,
981 |  "nbformat_minor": 1
982 | }
983 | 


--------------------------------------------------------------------------------
/examples/scripts/process-acs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env
 2 | import sys, os
 3 | import pandas as pd
 4 | 
 5 | responses = pd.read_csv(sys.stdin).rename(columns={
 6 |     "AGEP": "age",
 7 |     "WAGP": "income"
 8 | })
 9 | 
10 | responses["marriage_status"] = responses["MAR"].apply({
11 |     1: "Married",
12 |     2: "Widowed",
13 |     3: "Divorced",
14 |     4: "Separated",
15 |     5: "Never married or under 15 years old"
16 | }.get)
17 | 
18 | responses["gender"] = responses["SEX"].apply({
19 |     1: "Male",
20 |     2: "Female"
21 | }.get)
22 | 
23 | responses[[
24 |     "SERIALNO",
25 |     "PWGTP",
26 |     "age",
27 |     "gender",
28 |     "marriage_status",
29 |     "income"
30 | ]].to_csv(sys.stdout, index=False)
31 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | import os
 3 | 
 4 | NAME = "weightedcalcs"
 5 | HERE = os.path.abspath(os.path.dirname(__file__))
 6 | 
 7 | version_ns = {}
 8 | with open(os.path.join(HERE, NAME, '__version__.py')) as f:
 9 |     exec(f.read(), {}, version_ns)
10 | 
11 | setup(
12 |     name=NAME,
13 |     version=version_ns['__version__'],
14 |     description="Pandas-based utility to calculate weighted means, medians, distributions, standard deviations, and more.",
15 |     url="http://github.com/jsvine/weightedcalcs",
16 |     author="Jeremy Singer-Vine",
17 |     author_email="jsvine@gmail.com",
18 |     license="MIT",
19 |     packages=[
20 |         NAME
21 |     ],
22 |     install_requires=[
23 |         "pandas>=2.0"
24 |     ],
25 |     classifiers=[
26 |         "License :: OSI Approved :: MIT License",
27 |         "Operating System :: OS Independent",
28 |         "Programming Language :: Python :: 3",
29 |         "Intended Audience :: Developers",
30 |         "Intended Audience :: Science/Research",
31 |         "Topic :: Scientific/Engineering :: Information Analysis",
32 |     ],
33 | )
34 | 


--------------------------------------------------------------------------------
/test/core_tests.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import weightedcalcs as wc
  3 | import pandas as pd
  4 | import sys
  5 | import os
  6 | 
# Calculator shared by every test below; each fixture stores its weights in a
# column named "weights".
calc = wc.Calculator("weights")
  8 | 
  9 | class WCTest(unittest.TestCase):
 10 | 
 11 |     def test_mean(self):
 12 |         # Example via https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
 13 |         assert(calc.mean(pd.DataFrame({
 14 |             "values": [ 80, 90 ],
 15 |             "weights": [ 20, 30 ],
 16 |         }), "values") == 86)
 17 | 
 18 |     def test_mean_non_pandas(self):
 19 |         # Example via https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
 20 |         assert(calc.mean({
 21 |             "values": [ 80, 90 ],
 22 |             "weights": [ 20, 30 ],
 23 |         }, "values") == 86)
 24 | 
 25 |     def test_quantile(self):
 26 |         # Example via https://en.wikipedia.org/wiki/Weighted_median
 27 |         df = pd.DataFrame({
 28 |             "values": [ 0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2 ],
 29 |             "weights": [ 0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2 ],
 30 |         })
 31 |         assert(df["values"].median() == 0.1)
 32 |         assert(calc.quantile(df, "values", 0.5) == 0.2)
 33 |         assert(calc.median(df, "values") == 0.2)
 34 | 
 35 |     def test_quantile_split(self):
 36 |         df = pd.DataFrame({
 37 |             "values": [ 0, 1, 2, 3 ],
 38 |             "weights": [ 1, 1, 1, 1 ],
 39 |         })
 40 |         assert(calc.quantile(df, "values", 0.5) == 1.5)
 41 | 
 42 |     def test_bad_quantile(self):
 43 |         with self.assertRaises(Exception) as context:
 44 |             q = calc.quantile(pd.DataFrame({
 45 |                 "values": [ 0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2 ],
 46 |                 "weights": [ 0.1, 0.35, 0.05, 0.1, 0.15, 0.05, 0.2 ],
 47 |             }), "values", -1)
 48 | 
 49 |     def test_std(self):
 50 |         # Example via http://www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weightsd.pdf
 51 |         assert(calc.std(pd.DataFrame({
 52 |             "values": [  2, 3, 5, 7, 11, 13, 17, 19, 23 ],
 53 |             "weights": [ 1, 1, 0, 0, 4, 1, 2, 1, 0 ],
 54 |         }), "values").round(2) == 5.82)
 55 | 
 56 |     def test_distribution(self):
 57 |         dist = calc.distribution(pd.DataFrame({
 58 |             "values": [ "a", "b", "b", "b", "c" ],
 59 |             "weights": [ 3, 2, 0, 1, 2 ],
 60 |         }), "values")
 61 |         assert(dist["a"] == 0.375)
 62 |         assert(dist["b"] == 0.375)
 63 |         assert(dist["c"] == 0.250)
 64 | 
 65 |     def test_count(self):
 66 |         count = calc.count(pd.DataFrame({
 67 |             "values": [ "a", "b", "b", "b", "c" ],
 68 |             "weights": [ 3, 2, 0, 1, 2 ],
 69 |         }))
 70 |         assert(count == 8)
 71 | 
 72 |     def test_sum(self):
 73 |         _sum = calc.sum(pd.DataFrame({
 74 |             "values": [ 1, 2, 3, 4, 5 ],
 75 |             "weights": [ 3, 2, 0, 1, 2 ],
 76 |         }), "values")
 77 |         assert(_sum == 21)
 78 | 
 79 |     def test_grouped(self):
 80 |         dist = calc.distribution(pd.DataFrame({
 81 |             "group": [ "x", "x", "x", "x", "x" ],
 82 |             "values": [ "a", "b", "b", "b", "c" ],
 83 |             "weights": [ 3, 2, 0, 1, 2 ],
 84 |         }).groupby("group"), "values")
 85 |         assert(dist.loc["x"]["a"] == 0.375)
 86 |         assert(dist.loc["x"]["b"] == 0.375)
 87 |         assert(dist.loc["x"]["c"] == 0.250)
 88 | 
 89 |     def test_multi_grouped(self):
 90 |         dist = calc.distribution(pd.DataFrame({
 91 |             "group_a": [ "x", "x", "x", "x", "x" ],
 92 |             "group_b": [ "x", "x", "x", "x", "x" ],
 93 |             "values": [ "a", "b", "b", "b", "c" ],
 94 |             "weights": [ 3, 2, 0, 1, 2 ],
 95 |         }).groupby([ "group_a", "group_b" ]), "values")
 96 |         assert(dist.loc[("x", "x")]["a"] == 0.375)
 97 | 
 98 |     def test_multi_grouped_two(self):
 99 |         dist = calc.distribution(pd.DataFrame({
100 |             "group_a": [ "x", "x", "x", "y", "y" ],
101 |             "group_b": [ "x", "x", "x", "y", "y" ],
102 |             "values": [ "a", "b", "b", "b", "c" ],
103 |             "weights": [ 3, 2, 0, 1, 2 ],
104 |         }).groupby([ "group_a", "group_b" ]), "values")
105 |         assert(dist.loc[("x", "x")]["a"] == 0.6)
106 |         assert(dist.loc[("x", "x")]["b"] == 0.4)
107 |         assert(dist.loc[("x", "x")]["c"] == 0)
108 |     
109 |     def test_null_values(self):
110 |         with self.assertRaises(Exception) as context:
111 |             dist = calc.distribution(pd.DataFrame({
112 |                 "values": [ None, "b", "b", "b", "c" ],
113 |                 "weights": [ 3, 2, 0, 1, 2 ],
114 |             }), "values")
115 | 
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
118 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27,py34,py35,py36
3 | toxworkdir={env:TOX_WORK_DIR:.tox}
4 | 
5 | [testenv]
6 | deps=nose
7 | commands=nosetests --nocapture
8 | 


--------------------------------------------------------------------------------
/weightedcalcs/__init__.py:
--------------------------------------------------------------------------------
1 | from .__version__ import __version__
2 | from .core import Calculator
3 | 


--------------------------------------------------------------------------------
/weightedcalcs/__version__.py:
--------------------------------------------------------------------------------
# Version components as integers; the public version string is derived from
# them so the two can never disagree.
VERSION_TUPLE = (0, 1, 3)
__version__ = ".".join(str(part) for part in VERSION_TUPLE)
3 | 


--------------------------------------------------------------------------------
/weightedcalcs/core.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | import pandas as pd
  3 | 
  4 | PANDAS_TYPES = (
  5 |     pd.DataFrame,
  6 |     pd.core.groupby.DataFrameGroupBy,
  7 | )
  8 | 
  9 | def pandas_deco(func):
 10 |     def func_wrapper(self, thing, *args, **kwargs):
 11 |         if isinstance(thing, PANDAS_TYPES):
 12 |             return func(self, thing, *args, **kwargs)
 13 |         else:
 14 |             return func(self, pd.DataFrame(thing), *args, **kwargs)
 15 |     return func_wrapper
 16 | 
 17 | def groupby_deco(func):
 18 |     def func_wrapper(self, thing, *args, **kwargs):
 19 |         if isinstance(thing, pd.core.groupby.DataFrameGroupBy):
 20 |             agg = thing.apply(lambda x: func(self, x, *args, **kwargs), include_groups=False)
 21 |             is_series = isinstance(agg, pd.core.series.Series)
 22 |             has_multiindex = isinstance(agg.index, pd.MultiIndex)
 23 |             if is_series and has_multiindex:
 24 |                 return agg.unstack()
 25 |             else:
 26 |                 return agg
 27 |         return func(self, thing, *args, **kwargs)
 28 |     return func_wrapper
 29 | 
 30 | def fillna_deco(val):
 31 |     def deco(func):
 32 |         def func_wrapper(self, thing, *args, **kwargs):
 33 |             return func(self, thing, *args, **kwargs).fillna(val)
 34 |         return func_wrapper
 35 |     return deco
 36 | 
 37 | def check_nulls(series):
 38 |     if series.isnull().sum() > 0:
 39 |         raise ValueError("value_var contains null values")
 40 |     return series
 41 | 
 42 | class Calculator(object):
 43 |     def __init__(self, weight_var):
 44 |         self.weight_var = weight_var
 45 | 
 46 |     @groupby_deco
 47 |     @pandas_deco
 48 |     def count(self, thing):
 49 |         return thing[self.weight_var].sum()
 50 | 
 51 |     @groupby_deco
 52 |     @pandas_deco
 53 |     def sum(self, thing, value_var):
 54 |         weights = thing[self.weight_var]
 55 |         values = check_nulls(thing[value_var])
 56 |         return (values * weights).sum()
 57 |     
 58 |     @groupby_deco
 59 |     @pandas_deco
 60 |     def mean(self, thing, value_var):
 61 |         weights = thing[self.weight_var]
 62 |         total_weight = weights.sum()
 63 |         values = check_nulls(thing[value_var])
 64 |         return (values * weights).sum() / total_weight
 65 |     
 66 |     @groupby_deco
 67 |     @pandas_deco
 68 |     def std(self, thing, value_var):
 69 |         weights = thing[self.weight_var]
 70 |         n_nonzero_weights = (weights > 0).sum()
 71 |         if (n_nonzero_weights) < 2: return pd.NA
 72 |         values = check_nulls(thing[value_var])
 73 |         mean = self.mean(thing, value_var)
 74 |         numerator = (weights * (values - mean).pow(2)).sum()
 75 |         denominator = (n_nonzero_weights - 1) * weights.sum() / n_nonzero_weights
 76 |         return pow(numerator / denominator, 0.5)
 77 | 
 78 |     @groupby_deco
 79 |     @pandas_deco
 80 |     def quantile(self, thing, value_var, q):
 81 |         if q < 0 or q > 1:
 82 |             raise ValueError("q must be between 0 and 1")
 83 |         df = pd.DataFrame({
 84 |             "weights": thing[self.weight_var],
 85 |             "values": check_nulls(thing[value_var])
 86 |         }).sort_values("values")
 87 |         df["cumul_prop"] = df["weights"].cumsum() / df["weights"].sum()
 88 |         shaved = df[df["cumul_prop"] >= q]
 89 |         if shaved.iloc[0]["cumul_prop"] == q:
 90 |             return shaved.head(2)["values"].mean()
 91 |         else:
 92 |             return shaved.iloc[0]["values"]
 93 | 
 94 |     @groupby_deco
 95 |     @pandas_deco
 96 |     def median(self, thing, value_var):
 97 |         return self.quantile(thing, value_var, 0.5)
 98 | 
 99 |     @fillna_deco(0)
100 |     @groupby_deco
101 |     @pandas_deco
102 |     def distribution(self, thing, value_var):
103 |         weights = thing[self.weight_var]
104 |         total_weight = weights.sum()
105 |         check_nulls(thing[value_var])
106 |         return thing.groupby(value_var)[self.weight_var].sum() / total_weight 
107 |     
108 | 


--------------------------------------------------------------------------------