├── .gitignore ├── Example.ipynb ├── Example_files ├── Example_14_0.png ├── Example_15_0.png ├── Example_16_0.png ├── Example_17_0.png ├── Example_18_0.png ├── Example_18_1.png ├── Example_18_2.png ├── Example_19_0.png ├── Example_19_1.png ├── Example_19_2.png ├── Example_20_0.png ├── Example_20_1.png ├── Example_20_2.png ├── Example_20_3.png ├── Example_20_4.png ├── Example_21_0.png ├── Example_21_1.png ├── Example_21_2.png ├── Example_21_3.png ├── Example_21_4.png ├── Example_32_0.png ├── Example_32_1.png ├── Example_32_2.png ├── Example_35_0.png ├── Example_35_1.png ├── Example_35_2.png ├── Example_35_3.png └── Example_35_4.png ├── LICENSE.txt ├── README.md ├── build └── lib │ └── pymatch │ ├── Matcher.py │ ├── __init__.py │ └── functions.py ├── dist ├── matcher-0.1.tar.gz ├── pymatch-0.0.1.tar.gz ├── pymatch-0.0.2.tar.gz ├── pymatch-0.0.3.tar.gz ├── pymatch-0.0.5.tar.gz ├── pymatch-0.0.6.tar.gz ├── pymatch-0.0.7.tar.gz ├── pymatch-0.0.8.tar.gz ├── pymatch-0.0.9.tar.gz ├── pymatch-0.1.0.tar.gz ├── pymatch-0.1.1.tar.gz ├── pymatch-0.1.2.tar.gz ├── pymatch-0.1.3.tar.gz ├── pymatch-0.1.3 │ ├── PKG-INFO │ ├── pymatch.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ └── top_level.txt │ ├── pymatch │ │ ├── Matcher.py │ │ ├── __init__.py │ │ └── functions.py │ ├── setup.cfg │ └── setup.py ├── pymatch-0.1.4.tar.gz ├── pymatch-0.1.5.tar.gz ├── pymatch-0.1.6.tar.gz ├── pymatch-0.1.7.tar.gz ├── pymatch-0.1.8.tar.gz ├── pymatch-0.1.9.tar.gz ├── pymatch-0.1.tar.gz ├── pymatch-0.2.0.tar.gz ├── pymatch-0.2.1.tar.gz ├── pymatch-0.2.2.tar.gz ├── pymatch-0.2.3.tar.gz ├── pymatch-0.2.4.tar.gz ├── pymatch-0.2.5.tar.gz ├── pymatch-0.2.6.tar.gz ├── pymatch-0.2.8-py2.py3-none-any.whl ├── pymatch-0.2.8.tar.gz ├── pymatch-0.2.9-py2.py3-none-any.whl ├── pymatch-0.2.9.tar.gz ├── pymatch-0.3.0-py2-none-any.whl ├── pymatch-0.3.0-py2.py3-none-any.whl ├── pymatch-0.3.0.tar.gz ├── pymatch-0.3.1-py2-none-any.whl ├── pymatch-0.3.1.tar.gz ├── 
pymatch-0.3.2-py2-none-any.whl ├── pymatch-0.3.2.tar.gz ├── pymatch-0.3.3-py3-none-any.whl ├── pymatch-0.3.3.tar.gz ├── pymatch-0.3.4.1-py3-none-any.whl ├── pymatch-0.3.4.1.tar.gz ├── pymatch-0.3.4.2-py3-none-any.whl ├── pymatch-0.3.4.2.tar.gz ├── pymatch-0.5.tar.gz ├── pymatch-0.6.tar.gz └── pymatch-0.7.tar.gz ├── misc ├── .coo └── loan.csv ├── pymatch.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── requires.txt └── top_level.txt ├── pymatch ├── Matcher.py ├── __init__.py └── functions.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_STORE 2 | *.pyc 3 | .ipynb_checkpoints/ 4 | .idea* 5 | 6 | loan_full.csv 7 | -------------------------------------------------------------------------------- /Example_files/Example_14_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_14_0.png -------------------------------------------------------------------------------- /Example_files/Example_15_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_15_0.png -------------------------------------------------------------------------------- /Example_files/Example_16_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_16_0.png -------------------------------------------------------------------------------- /Example_files/Example_17_0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_17_0.png -------------------------------------------------------------------------------- /Example_files/Example_18_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_18_0.png -------------------------------------------------------------------------------- /Example_files/Example_18_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_18_1.png -------------------------------------------------------------------------------- /Example_files/Example_18_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_18_2.png -------------------------------------------------------------------------------- /Example_files/Example_19_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_19_0.png -------------------------------------------------------------------------------- /Example_files/Example_19_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_19_1.png -------------------------------------------------------------------------------- /Example_files/Example_19_2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_19_2.png -------------------------------------------------------------------------------- /Example_files/Example_20_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_20_0.png -------------------------------------------------------------------------------- /Example_files/Example_20_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_20_1.png -------------------------------------------------------------------------------- /Example_files/Example_20_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_20_2.png -------------------------------------------------------------------------------- /Example_files/Example_20_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_20_3.png -------------------------------------------------------------------------------- /Example_files/Example_20_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_20_4.png -------------------------------------------------------------------------------- /Example_files/Example_21_0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_21_0.png -------------------------------------------------------------------------------- /Example_files/Example_21_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_21_1.png -------------------------------------------------------------------------------- /Example_files/Example_21_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_21_2.png -------------------------------------------------------------------------------- /Example_files/Example_21_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_21_3.png -------------------------------------------------------------------------------- /Example_files/Example_21_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_21_4.png -------------------------------------------------------------------------------- /Example_files/Example_32_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_32_0.png -------------------------------------------------------------------------------- /Example_files/Example_32_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_32_1.png -------------------------------------------------------------------------------- /Example_files/Example_32_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_32_2.png -------------------------------------------------------------------------------- /Example_files/Example_35_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_35_0.png -------------------------------------------------------------------------------- /Example_files/Example_35_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_35_1.png -------------------------------------------------------------------------------- /Example_files/Example_35_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_35_2.png -------------------------------------------------------------------------------- /Example_files/Example_35_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_35_3.png -------------------------------------------------------------------------------- /Example_files/Example_35_4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/Example_files/Example_35_4.png -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2017 Ben Miroglio 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | `pymatch` 3 | ===== 4 | 5 | Matching techniques for observational studies. Inspired by and adapted from Jasjeet Singh Sekhon's [Matching](https://cran.r-project.org/web/packages/Matching/Matching.pdf) package in R. 
I wrote an adaptation in Python that is better suited to my work at Mozilla, which incorporates: 6 | 7 | * Integration with Jupyter Notebooks (we use Jupyter + Pyspark) 8 | * Additional Non-Parametric Tests / Plotting Functionality to assess balance. 9 | * A more modular, user-specified matching process 10 | 11 | This package was used to support [this research project](https://dl.acm.org/citation.cfm?id=3178876.3186162). 12 | 13 | # Installation 14 | 15 | Install through pip! 16 | 17 | ```bash 18 | $ pip install pymatch 19 | ``` 20 | 21 | # 22 | 23 | The best way to get familiar with the package is to work through an example. The example below leaves out much of the theory behind matching and focuses on the application within `pymatch`. If interested, Sekhon gives a nice overview in his [Introduction to the Matching package in R](http://sekhon.berkeley.edu/papers/MatchingJSS.pdf). 24 | 25 | # Example 26 | 27 | The following example demonstrates how to use the `pymatch` package to match [Lending Club Loan Data](https://www.kaggle.com/wendykan/lending-club-loan-data). Follow the link to download the dataset from Kaggle (you'll have to create an account; it's fast and free!). You can follow along with this document or download the corresponding [Example.ipynb](https://github.com/benmiroglio/pymatch/blob/master/Example.ipynb) notebook (just be sure to change the path when loading data!). 28 | 29 | Here we match Lending Club users that fully paid off loans (control) to those that defaulted (test). The example is contrived; however, a use case for this could be that we want to analyze user sentiment with the platform. Users that default on loans may have worse sentiment because they are predisposed to a bad situation--influencing their perception of the product. Before analyzing sentiment, we can match users that paid their loans in full to users that defaulted based on the characteristics we can observe.
If matching is successful, we could then make a statement about the **causal effect** defaulting has on sentiment if we are confident our samples are sufficiently balanced and our model is free from omitted variable bias. 30 | 31 | This example, however, only goes through the matching procedure, which can be broken down into the following steps: 32 | 33 | * [Data Preparation](#data-prep) 34 | * [Fit Propensity Score Models](#matcher) 35 | * [Predict Propensity Scores](#predict-scores) 36 | * [Tune Threshold](#tune-threshold) 37 | * [Match Data](#match-data) 38 | * [Assess Matches](#assess-matches) 39 | 40 | ---- 41 | 42 | ### Data Prep 43 | 44 | 45 | ```python 46 | import warnings 47 | warnings.filterwarnings('ignore') 48 | from pymatch.Matcher import Matcher 49 | import pandas as pd 50 | import numpy as np 51 | 52 | %matplotlib inline 53 | ``` 54 | 55 | Load the dataset (`loan.csv`) and select a subset of columns. 56 | 57 | 58 | 59 | ```python 60 | path = "/Users/bmiroglio/Downloads/lending-club-loan-data/loan.csv" 61 | fields = \ 62 | [ 63 | "loan_amnt", 64 | "funded_amnt", 65 | "funded_amnt_inv", 66 | "term", 67 | "int_rate", 68 | "installment", 69 | "grade", 70 | "sub_grade", 71 | "loan_status" 72 | ] 73 | 74 | data = pd.read_csv(path)[fields] 75 | ``` 76 | 77 | Create test and control groups and reassign `loan_status` to be a binary treatment indicator. This is our response in the logistic regression model(s) used to generate propensity scores. (The `.copy()` calls below avoid pandas' `SettingWithCopyWarning` when we overwrite `loan_status` on the subsets.) 78 | 79 | 80 | ```python 81 | test = data[data.loan_status == "Default"].copy() 82 | control = data[data.loan_status == "Fully Paid"].copy() 83 | test['loan_status'] = 1 84 | control['loan_status'] = 0 85 | ``` 86 | 87 | ---- 88 | 89 | ### `Matcher` 90 | 91 | Initialize the `Matcher` object. 92 | 93 | **Note that:** 94 | 95 | * Upon initialization, `Matcher` prints the formula used to fit logistic regression model(s) and the number of records in the majority/minority class.
96 | * The regression model(s) are used to generate propensity scores. In this case, we are using the covariates on the right side of the equation to estimate the probability of defaulting on a loan (`loan_status` = 1). 97 | * `Matcher` will use all covariates in the dataset unless a formula is specified by the user. Note that this step only fits the model(s); we assign propensity scores later. 98 | * Any covariates passed to the (optional) `exclude` parameter will be excluded from the model fitting process. This parameter is particularly useful for unique identifiers like a `user_id`. 99 | 100 | 101 | ```python 102 | m = Matcher(test, control, yvar="loan_status", exclude=[]) 103 | ``` 104 | 105 | Formula: 106 | loan_status ~ loan_amnt+funded_amnt+funded_amnt_inv+term+int_rate+installment+grade+sub_grade 107 | n majority: 207723 108 | n minority: 1219 109 | 110 | 111 | There is a significant **Class Imbalance** in our data--the majority group (fully-paid loans) has many more records than the minority group (defaulted loans). We account for this by setting `balance=True` when calling `Matcher.fit_scores()` below. This tells `Matcher` to sample from the majority group when fitting the logistic regression model(s) so that the groups are of equal size. When undersampling this way, it is highly recommended that `nmodels` is explicitly assigned to an integer much larger than 1. This ensures that more of the majority group contributes to the generation of propensity scores. The value of this integer should depend on the severity of the imbalance; here we use `nmodels=100`. 112 | 113 | 114 | ```python 115 | # for reproducibility 116 | np.random.seed(20170925) 117 | 118 | m.fit_scores(balance=True, nmodels=100) 119 | ``` 120 | 121 | Fitting 100 Models on Balanced Samples...
122 | Average Accuracy: 70.21% 123 | 124 | 125 | The average accuracy of our 100 models is 70.21%, suggesting that there's separability within our data and justifying the need for the matching procedure. It's worth noting that we don't pay much attention to these logistic models since we are using them as a feature extraction tool (generation of propensity scores). The accuracy is a good way to detect separability at a glance, but we shouldn't spend time tuning and tinkering with these models. If our accuracy were close to 50%, that would suggest we cannot detect much separability in our groups given the features we observe, and that matching is probably not necessary (or more features should be included if possible). 126 | 127 | ### Predict Scores 128 | 129 | 130 | ```python 131 | m.predict_scores() 132 | ``` 133 | 134 | ```python 135 | m.plot_scores() 136 | ``` 137 | 138 | 139 | ![png](Example_files/Example_15_0.png) 140 | 141 | 142 | The plot above demonstrates the separability present in our data. Test profiles have a much higher **propensity**, or estimated probability of defaulting given the features we isolated in the data. 143 | 144 | --- 145 | 146 | ### Tune Threshold 147 | 148 | The `Matcher.match()` method matches profiles that have propensity scores within some threshold. 149 | 150 | i.e. for two scores `s1` and `s2`, `|s1 - s2|` <= `threshold` 151 | 152 | By default matches are found *from* the majority group *for* the minority group. For example, if our test group contains 1,000 records and our control group contains 20,000, `Matcher` will 153 | iterate through the test (minority) group and find suitable matches from the control (majority) group. If a record in the minority group has no suitable matches, it is dropped from the final matched dataset. We need to ensure our threshold is small enough such that we get close matches and retain most (or all) of our data in the minority group. 154 | 155 | Below we tune the threshold using `method="random"`.
This matches a random profile within the threshold, 156 | as there could be many. This is much faster than the alternative `method="min"`, which finds the *closest* match for every minority record. 157 | 158 | 159 | ```python 160 | m.tune_threshold(method='random') 161 | ``` 162 | 163 | 164 | ![png](Example_files/Example_19_0.png) 165 | 166 | 167 | It looks like a threshold of 0.0001 retains 100% of our data. Let's proceed with matching using this threshold. 168 | 169 | --- 170 | 171 | ### Match Data 172 | 173 | Below we match one record from the majority group to each record in the minority group. This is done **with** replacement, meaning a single majority record can be matched to multiple minority records. `Matcher` assigns a unique `record_id` to each record in the test and control groups so this can be addressed after matching. If subsequent modeling is planned, one might consider weighting models using a weight vector of 1/`f` for each record, `f` being a record's frequency in the matched dataset. Thankfully `Matcher` can handle all of this for you :). 174 | 175 | 176 | ```python 177 | m.match(method="min", nmatches=1, threshold=0.0001) 178 | ``` 179 | 180 | 181 | ```python 182 | m.record_frequency() 183 | ```
| freq | n_records |
|---:|---:|
| 1 | 2264 |
| 2 | 68 |
| 3 | 10 |
| 4 | 2 |
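The 1/`f` weighting mentioned earlier is easy to compute by hand from a frequency count like the one above; here is a minimal pandas sketch with toy `record_id` values (this is an illustration, not `pymatch`'s implementation):

```python
import pandas as pd

# Toy matched dataset: record 7 was matched to two different minority
# records, so it appears twice; records 3 and 9 appear once each.
matched = pd.DataFrame({"record_id": [7, 7, 3, 9]})

# weight = 1 / f, where f is the record's frequency in the matched data
f = matched["record_id"].map(matched["record_id"].value_counts())
matched["weight"] = 1.0 / f
print(matched["weight"].tolist())  # [0.5, 0.5, 1.0, 1.0]
```

Records that are reused across matches are down-weighted so they don't contribute more than once to subsequent models.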
It looks like the bulk of our matched-majority-group records occur only once, 68 occur twice, ... etc. We can preemptively generate a weight vector using `Matcher.assign_weight_vector()`. 226 | 227 | 228 | ```python 229 | m.assign_weight_vector() 230 | ``` 231 | 232 | Let's take a look at our matched data thus far. Note that in addition to the weight vector, `Matcher` has also assigned a `match_id` to each record indicating our (in this case) *paired* matches, since we use `nmatches=1`. We can verify that matched records have `scores` within 0.0001 of each other. 233 | 234 | 235 | ```python 236 | m.matched_data.sort_values("match_id").head(6) 237 | ```
| | record_id | weight | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | loan_status | scores | match_id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1.0 | 18000.0 | 18000.0 | 17975.000000 | 60 months | 17.27 | 449.97 | D | D3 | 1 | 0.644783 | 0 |
| 2192 | 191970 | 1.0 | 2275.0 | 2275.0 | 2275.000000 | 36 months | 16.55 | 80.61 | D | D2 | 0 | 0.644784 | 0 |
| 1488 | 80665 | 1.0 | 18400.0 | 18400.0 | 18250.000000 | 36 months | 16.29 | 649.53 | C | C4 | 0 | 0.173057 | 1 |
| 1 | 1 | 1.0 | 21250.0 | 21250.0 | 21003.604048 | 60 months | 14.27 | 497.43 | C | C2 | 1 | 0.173054 | 1 |
| 2 | 2 | 1.0 | 5600.0 | 5600.0 | 5600.000000 | 60 months | 15.99 | 136.16 | D | D2 | 1 | 0.777273 | 2 |
| 1828 | 153742 | 1.0 | 12000.0 | 12000.0 | 12000.000000 | 60 months | 18.24 | 306.30 | D | D5 | 0 | 0.777270 | 2 |
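As a sanity check, the `scores` column can be grouped by `match_id` to confirm each pair respects the 0.0001 threshold. A small pandas sketch using the score values shown in the table (this check is an illustration, not part of `pymatch`'s API):

```python
import pandas as pd

# Score pairs taken from the matched rows shown above.
df = pd.DataFrame({"match_id": [0, 0, 1, 1, 2, 2],
                   "scores": [0.644783, 0.644784, 0.173057, 0.173054,
                              0.777273, 0.777270]})

# Largest within-pair difference in propensity scores per match_id.
spread = df.groupby("match_id")["scores"].agg(lambda s: s.max() - s.min())
print(bool((spread <= 0.0001).all()))  # True
```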
--- 367 | 368 | ### Assess Matches 369 | 370 | We must now determine if our data is "balanced" across our covariates. Can we detect any statistical differences between the covariates of our matched test and control groups? `Matcher` is configured to treat categorical and continuous variables separately in this assessment. 371 | 372 | ___categorical___ 373 | 374 | For categorical variables, we look at plots comparing the proportional differences between test and control before and after matching. 375 | 376 | For example, the first plot shows: 377 | 378 | * `prop_test` - `prop_control` for all possible `term` values, `prop_test` and `prop_control` being the proportion of test and control records with a given term value, respectively. We want these (orange) bars to be small after matching. 379 | * Results (pvalue) of a Chi-Square Test for Independence before and after matching. After matching we want this pvalue to be > 0.05, resulting in our failure to reject the null hypothesis that the frequency of the enumerated term values is independent of our test and control groups. 380 | 381 | 382 | ```python 383 | categorical_results = m.compare_categorical(return_table=True) 384 | ``` 385 | 386 | 387 | ![png](Example_files/Example_32_0.png) 388 | 389 | 390 | 391 | ![png](Example_files/Example_32_1.png) 392 | 393 | 394 | 395 | ![png](Example_files/Example_32_2.png) 396 | 397 | 398 | 399 | ```python 400 | categorical_results 401 | ```
| var | before | after |
|---|---:|---:|
| term | 0.0 | 0.433155 |
| grade | 0.0 | 0.532530 |
| sub_grade | 0.0 | 0.986986 |
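The before/after columns are p-values from the Chi-Square Test for Independence described above. The same kind of test can be run directly with `scipy.stats.chi2_contingency`; a sketch on made-up contingency tables of term counts (the counts here are illustrative, not from the loan data):

```python
import numpy as np
from scipy.stats import chi2_contingency

# Rows: test / control; columns: counts of "36 months" / "60 months" terms.
before = np.array([[300, 900],    # test skews toward 60-month terms
                   [800, 400]])   # control skews the other way
after = np.array([[300, 900],     # after matching, the term
                  [310, 890]])    # distributions are nearly identical

_, p_before, _, _ = chi2_contingency(before)
_, p_after, _, _ = chi2_contingency(after)
print(p_before < 0.05, p_after > 0.05)  # True True
```

A small p-value before matching rejects independence (imbalance); a large one after matching is the "failure to reject" we want.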
Looking at the plots and test results, we did a pretty good job balancing our categorical features! The p-values from the Chi-Square tests are all > 0.05 and we can verify by observing the small proportional differences in the plots. 443 | 444 | ___Continuous___ 445 | 446 | For continuous variables we look at Empirical Cumulative Distribution Functions (ECDF) for our test and control groups before and after matching. 447 | 448 | For example, the first plot pair shows: 449 | 450 | * ECDF for test vs ECDF for control **before** matching (left), ECDF for test vs ECDF for control **after** matching (right). We want the two lines to be very close to each other (or indistinguishable) after matching. 451 | * Some tests + metrics are included in the chart titles. 452 | * Tests performed: 453 | * Kolmogorov-Smirnov Goodness-of-Fit Test (KS-test). 454 | This test statistic is calculated on 1000 455 | permuted samples of the data, generating 456 | an empirical p-value. See `pymatch.functions.ks_boot()`. 457 | This is an adaptation of the [`ks.boot()`](https://www.rdocumentation.org/packages/Matching/versions/4.9-2/topics/ks.boot) method in 458 | the R "Matching" package. 459 | * Chi-Square Distance: 460 | Similarly, this distance metric is calculated on 461 | 1000 permuted samples. 462 | See `pymatch.functions.grouped_permutation_test()`. 463 | 464 | * Other included Stats: 465 | * Standardized mean and median differences: 466 | how many standard deviations apart the group 467 | means/medians are before and after matching, 468 | i.e.
`abs(mean(control) - mean(test))` / `std(control.union(test))` 469 | 470 | 471 | ```python 472 | cc = m.compare_continuous(return_table=True) 473 | ``` 474 | 475 | 476 | ![png](Example_files/Example_35_0.png) 477 | 478 | 479 | 480 | ![png](Example_files/Example_35_1.png) 481 | 482 | 483 | 484 | ![png](Example_files/Example_35_2.png) 485 | 486 | 487 | 488 | ![png](Example_files/Example_35_3.png) 489 | 490 | 491 | 492 | ![png](Example_files/Example_35_4.png) 493 | 494 | 495 | 496 | ```python 497 | cc 498 | ```
| var | ks_before | ks_after | grouped_chisqr_before | grouped_chisqr_after | std_median_diff_before | std_median_diff_after | std_mean_diff_before | std_mean_diff_after |
|---|---:|---:|---:|---:|---:|---:|---:|---:|
| loan_amnt | 0.0 | 0.530 | 0.000 | 1.000 | 0.207814 | 0.067942 | 0.229215 | 0.013929 |
| funded_amnt | 0.0 | 0.541 | 0.000 | 1.000 | 0.208364 | 0.067942 | 0.234735 | 0.013929 |
| funded_amnt_inv | 0.0 | 0.573 | 0.933 | 1.000 | 0.242035 | 0.067961 | 0.244418 | 0.013981 |
| int_rate | 0.0 | 0.109 | 0.000 | 0.349 | 0.673904 | 0.091925 | 0.670445 | 0.079891 |
| installment | 0.0 | 0.428 | 0.004 | 1.000 | 0.169177 | 0.042140 | 0.157699 | 0.014590 |
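The `std_mean_diff` columns implement the standardized-difference formula given earlier; it takes only a few lines of numpy to compute. A minimal sketch on synthetic covariate values (illustrative data, not the loan dataset):

```python
import numpy as np

rng = np.random.default_rng(20170925)
test = rng.normal(11.0, 2.0, 1000)     # synthetic covariate, test group
control = rng.normal(10.0, 2.0, 1000)  # synthetic covariate, control group

# abs(mean(control) - mean(test)) / std(control union test)
def std_mean_diff(a, b):
    return abs(a.mean() - b.mean()) / np.concatenate([a, b]).std()

print(std_mean_diff(control, test))
```

With a one-unit mean gap and a pooled standard deviation slightly above 2, the statistic lands near 0.5 here; after a good matching pass it should shrink toward 0, as in the table above.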
We want the pvalues from both the KS-test and the grouped permutation of the Chi-Square distance after matching to be > 0.05, and they all are! We can verify by looking at how close the ECDFs are between test and control. 588 | 589 | # Conclusion 590 | 591 | We saw a very "clean" result from the above procedure, achieving balance among all the covariates. In my work at Mozilla, we see much hairier results using the same procedure, which will likely be your experience too. In the case that certain covariates are not well balanced, one might consider tinkering with the parameters of the matching process (`nmatches`>1) or adding more covariates to the formula specified when we initialized the `Matcher` object. 592 | In any case, in subsequent modeling, you can always control for variables that you haven't deemed "balanced". 593 | -------------------------------------------------------------------------------- /build/lib/pymatch/Matcher.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from pymatch import * 3 | import pymatch.functions as uf 4 | 5 | class Matcher: 6 | """ 7 | Matcher Class -- Match data for an observational study. 8 | 9 | Parameters 10 | ---------- 11 | test : pd.DataFrame 12 | Data representing the test group 13 | control : pd.DataFrame 14 | Data representing the control group 15 | formula : str (optional) 16 | custom formula to use for logistic regression 17 | i.e. "Y ~ x1 + x2 + ..." 18 | yvar : str 19 | Name of dependent variable (the treatment) 20 | exclude : list (optional) 21 | List of variables to ignore in regression/matching.
22 | Useful for unique identifiers 23 | """ 24 | 25 | def __init__(self, test, control, yvar, formula=None, exclude=[]): 26 | # configure plots for ipynb 27 | plt.rcParams["figure.figsize"] = (10, 5) 28 | # variables generated during matching 29 | aux_match = ['scores', 'match_id', 'weight', 'record_id'] 30 | # assign unique indices to test and control 31 | t, c = [i.copy().reset_index(drop=True) for i in (test, control)] 32 | t = t.dropna(axis=1, how="all") 33 | c = c.dropna(axis=1, how="all") 34 | c.index += len(t) 35 | self.data = t.dropna(axis=1, how='all').append(c.dropna(axis=1, how='all'), sort=True) 36 | self.control_color = "#1F77B4" 37 | self.test_color = "#FF7F0E" 38 | self.yvar = yvar 39 | self.exclude = exclude + [self.yvar] + aux_match 40 | self.formula = formula 41 | self.nmodels = 1 # for now 42 | self.models = [] 43 | self.swdata = None 44 | self.model_accuracy = [] 45 | self.data[yvar] = self.data[yvar].astype(int) # should be binary 0, 1 46 | self.xvars = [i for i in self.data.columns if i not in self.exclude and i != yvar] 47 | self.data = self.data.dropna(subset=self.xvars) 48 | self.matched_data = [] 49 | self.xvars_escaped = [ "Q('{}')".format(x) for x in self.xvars] 50 | self.yvar_escaped = "Q('{}')".format(self.yvar) 51 | self.y, self.X = patsy.dmatrices('{} ~ {}'.format(self.yvar_escaped, '+'.join(self.xvars_escaped)), 52 | data=self.data, return_type='dataframe') 53 | self.xvars = [i for i in self.data.columns if i not in self.exclude] 54 | self.test = self.data[self.data[yvar] == True] 55 | self.control = self.data[self.data[yvar] == False] 56 | self.testn = len(self.test) 57 | self.controln = len(self.control) 58 | self.minority, self.majority = [i[1] for i in sorted(zip([self.testn, self.controln], 59 | [1, 0]), 60 | key=lambda x: x[0])] 61 | print('Formula:\n{} ~ {}'.format(yvar, '+'.join(self.xvars))) 62 | print('n majority:', len(self.data[self.data[yvar] == self.majority])) 63 | print('n minority:', len(self.data[self.data[yvar] ==
self.minority])) 64 | 65 | def fit_scores(self, balance=True, nmodels=None): 66 | """ 67 | Fits logistic regression model(s) used for 68 | generating propensity scores 69 | 70 | Parameters 71 | ---------- 72 | balance : bool 73 | Should balanced datasets be used? 74 | (n_control == n_test) 75 | nmodels : int 76 | How many models should be fit? 77 | Score becomes the average of the models if nmodels > 1 78 | 79 | Returns 80 | ------- 81 | None 82 | """ 83 | # reset models if refitting 84 | if len(self.models) > 0: 85 | self.models = [] 86 | if len(self.model_accuracy) > 0: 87 | self.model_accuracy = [] 88 | if not self.formula: 89 | # use all columns in the model 90 | self.xvars_escaped = ["Q('{}')".format(x) for x in self.xvars] 91 | self.yvar_escaped = "Q('{}')".format(self.yvar) 92 | self.formula = '{} ~ {}'.format(self.yvar_escaped, '+'.join(self.xvars_escaped)) 93 | if balance: 94 | if nmodels is None: 95 | # fit multiple models based on imbalance severity (rounded up to nearest tenth) 96 | minor, major = [self.data[self.data[self.yvar] == i] for i in (self.minority, 97 | self.majority)] 98 | nmodels = int(np.ceil((len(major) / len(minor)) / 10) * 10) 99 | self.nmodels = nmodels 100 | i = 0 101 | errors = 0 102 | while i < nmodels and errors < 5: 103 | uf.progress(i+1, nmodels, prestr="Fitting Models on Balanced Samples") 104 | # sample from majority to create a balanced dataset 105 | df = self.balanced_sample() 106 | df = pd.concat([uf.drop_static_cols(df[df[self.yvar] == 1], yvar=self.yvar), 107 | uf.drop_static_cols(df[df[self.yvar] == 0], yvar=self.yvar)], 108 | sort=True) 109 | y_samp, X_samp = patsy.dmatrices(self.formula, data=df, return_type='dataframe') 110 | X_samp.drop(self.yvar, axis=1, errors='ignore', inplace=True) 111 | glm = GLM(y_samp, X_samp, family=sm.families.Binomial()) 112 | 113 | try: 114 | res = glm.fit() 115 | self.model_accuracy.append(self._scores_to_accuracy(res, X_samp, y_samp)) 116 | self.models.append(res) 117 | i = i + 1 118 |
except Exception as e: 119 | errors = errors + 1 # to avoid infinite loop for misspecified matrix 120 | print('Error: {}'.format(e)) 121 | print("\nAverage Accuracy:", "{}%". 122 | format(round(np.mean(self.model_accuracy) * 100, 2))) 123 | else: 124 | # ignore any imbalance and fit one model 125 | print('Fitting 1 (Unbalanced) Model...') 126 | glm = GLM(self.y, self.X, family=sm.families.Binomial()) 127 | res = glm.fit() 128 | self.model_accuracy.append(self._scores_to_accuracy(res, self.X, self.y)) 129 | self.models.append(res) 130 | print("\nAccuracy", round(np.mean(self.model_accuracy[0]) * 100, 2)) 131 | 132 | 133 | def predict_scores(self): 134 | """ 135 | Predict propensity scores for each observation. 136 | Adds a "scores" column to self.data 137 | 138 | Returns 139 | ------- 140 | None 141 | """ 142 | scores = np.zeros(len(self.X)) 143 | for i in range(self.nmodels): 144 | m = self.models[i] 145 | scores += m.predict(self.X[m.params.index]) 146 | self.data['scores'] = scores/self.nmodels 147 | 148 | def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10): 149 | """ 150 | Finds suitable match(es) for each record in the minority 151 | dataset, if one exists. Records are excluded from the final 152 | matched dataset if there are no suitable matches. 153 | 154 | self.matched_data contains the matched dataset once this 155 | method is called 156 | 157 | Parameters 158 | ---------- 159 | threshold : float 160 | threshold for fuzzy matching 161 | i.e.
|score_x - score_y| <= threshold 162 | nmatches : int 163 | How many majority profiles should be matched 164 | (at most) to minority profiles 165 | method : str 166 | Strategy for when multiple majority profiles 167 | are suitable matches for a single minority profile 168 | "random" - choose randomly (fast, good for testing) 169 | "min" - choose the profile with the closest score 170 | max_rand : int 171 | max number of profiles to consider when using random tie-breaks 172 | 173 | Returns 174 | ------- 175 | None 176 | """ 177 | if 'scores' not in self.data.columns: 178 | print("Propensity Scores have not been calculated. Using defaults...") 179 | self.fit_scores() 180 | self.predict_scores() 181 | test_scores = self.data[self.data[self.yvar]==True][['scores']] 182 | ctrl_scores = self.data[self.data[self.yvar]==False][['scores']] 183 | result, match_ids = [], [] 184 | for i in range(len(test_scores)): 185 | # uf.progress(i+1, len(test_scores), 'Matching Control to Test...') 186 | match_id = i 187 | score = test_scores.iloc[i] 188 | if method == 'random': 189 | bool_match = abs(ctrl_scores - score) <= threshold 190 | matches = ctrl_scores.loc[bool_match[bool_match.scores].index] 191 | elif method == 'min': 192 | matches = abs(ctrl_scores - score).sort_values('scores').head(nmatches) 193 | else: 194 | raise AssertionError("Invalid method parameter, use ('random', 'min')") 195 | if len(matches) == 0: 196 | continue 197 | # randomly choose nmatches indices, if len(matches) > nmatches 198 | select = nmatches if method != 'random' else np.random.choice(range(1, max_rand+1), 1) 199 | chosen = np.random.choice(matches.index, min(select, nmatches), replace=False) 200 | result.extend([test_scores.index[i]] + list(chosen)) 201 | match_ids.extend([i] * (len(chosen)+1)) 202 | self.matched_data = self.data.loc[result] 203 | self.matched_data['match_id'] = match_ids 204 | self.matched_data['record_id'] = self.matched_data.index 205 | 206 | def select_from_design(self, cols): 207 | d
= pd.DataFrame() 208 | for c in cols: 209 | d = pd.concat([d, self.X.select(lambda x: x.startswith(c), axis=1)], axis=1, sort=True) 210 | return d 211 | 212 | def balanced_sample(self, data=None): 213 | if data is None: 214 | data = self.data 215 | minor, major = data[data[self.yvar] == self.minority], \ 216 | data[data[self.yvar] == self.majority] 217 | return major.sample(len(minor)).append(minor, sort=True).dropna() 218 | 219 | def plot_scores(self): 220 | """ 221 | Plots the distribution of propensity scores before matching between 222 | our test and control groups 223 | """ 224 | assert 'scores' in self.data.columns, \ 225 | "Propensity scores haven't been calculated, use Matcher.predict_scores()" 226 | sns.distplot(self.data[self.data[self.yvar]==0].scores, label='Control') 227 | sns.distplot(self.data[self.data[self.yvar]==1].scores, label='Test') 228 | plt.legend(loc='upper right') 229 | plt.xlim((0, 1)) 230 | plt.title("Propensity Scores Before Matching") 231 | plt.ylabel("Percentage (%)") 232 | plt.xlabel("Scores") 233 | 234 | def prop_test(self, col): 235 | """ 236 | Performs a Chi-Square test of independence on a given column. 237 | See stats.chi2_contingency() 238 | 239 | Parameters 240 | ---------- 241 | col : str 242 | Name of column on which the test should be performed 243 | 244 | Returns 245 | ------- 246 | dict 247 | {'var': , 248 | 'before': , 249 | 'after': } 250 | 251 | 252 | """ 253 | if not uf.is_continuous(col, self.X) and col not in self.exclude: 254 | pval_before = round(stats.chi2_contingency(self.prep_prop_test(self.data, 255 | col))[1], 6) 256 | pval_after = round(stats.chi2_contingency(self.prep_prop_test(self.matched_data, 257 | col))[1], 6) 258 | return {'var':col, 'before':pval_before, 'after':pval_after} 259 | else: 260 | print("{} is a continuous variable".format(col)) 261 | 262 | def compare_continuous(self, save=False, return_table=False): 263 | """ 264 | Plots the ECDFs for continuous features before and 265 | after matching.
Each chart title contains test results 266 | and statistics to summarize how similar the two distributions 267 | are (we want them to be close after matching). 268 | 269 | Tests performed: 270 | Kolmogorov-Smirnov Goodness of fit Test (KS-test) 271 | This test statistic is calculated on 1000 272 | permuted samples of the data, generating 273 | an empirical p-value. See pymatch.functions.ks_boot() 274 | This is an adaptation of the ks.boot() method in 275 | the R "Matching" package 276 | https://www.rdocumentation.org/packages/Matching/versions/4.9-2/topics/ks.boot 277 | Chi-Square Distance: 278 | Similarly this distance metric is calculated on 279 | 1000 permuted samples. 280 | See pymatch.functions.grouped_permutation_test() 281 | 282 | Other included Stats: 283 | Standardized mean and median differences 284 | How many standard deviations away are the mean/median 285 | between our groups before and after matching 286 | i.e. abs(mean(control) - mean(test)) / std(control.union(test)) 287 | 288 | Parameters 289 | ---------- 290 | return_table : bool 291 | Should the function return a table with tests and statistics?
292 | 293 | Returns 294 | ------- 295 | pd.DataFrame (optional) 296 | Table of before/after statistics if return_table == True 297 | 298 | 299 | """ 300 | test_results = [] 301 | for col in self.matched_data.columns: 302 | if uf.is_continuous(col, self.X) and col not in self.exclude: 303 | # organize data 304 | trb, cob = self.test[col], self.control[col] 305 | tra = self.matched_data[self.matched_data[self.yvar]==True][col] 306 | coa = self.matched_data[self.matched_data[self.yvar]==False][col] 307 | xtb, xcb = ECDF(trb), ECDF(cob) 308 | xta, xca = ECDF(tra), ECDF(coa) 309 | 310 | # before/after stats 311 | std_diff_med_before, std_diff_mean_before = uf.std_diff(trb, cob) 312 | std_diff_med_after, std_diff_mean_after = uf.std_diff(tra, coa) 313 | pb, truthb = uf.grouped_permutation_test(uf.chi2_distance, trb, cob) 314 | pa, trutha = uf.grouped_permutation_test(uf.chi2_distance, tra, coa) 315 | ksb = round(uf.ks_boot(trb, cob, nboots=1000), 6) 316 | ksa = round(uf.ks_boot(tra, coa, nboots=1000), 6) 317 | 318 | # plotting 319 | f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(12, 5)) 320 | ax1.plot(xcb.x, xcb.y, label='Control', color=self.control_color) 321 | ax1.plot(xtb.x, xtb.y, label='Test', color=self.test_color) 322 | ax1.legend(loc="lower right") 323 | 324 | 325 | title_str = ''' 326 | ECDF for {} {} Matching 327 | KS p-value: {} 328 | Grouped Perm p-value: {} 329 | Std. Median Difference: {} 330 | Std.
Mean Difference: {} 331 | ''' 332 | ax1.set_title(title_str.format(col, "before", ksb, pb, 333 | std_diff_med_before, std_diff_mean_before)) 334 | ax2.plot(xca.x, xca.y, label='Control') 335 | ax2.plot(xta.x, xta.y, label='Test') 336 | ax2.set_title(title_str.format(col, "after", ksa, pa, 337 | std_diff_med_after, std_diff_mean_after)) 338 | ax2.legend(loc="lower right") 339 | plt.xlim((0, np.percentile(xta.x, 99))) 340 | 341 | test_results.append({ 342 | "var": col, 343 | "ks_before": ksb, 344 | "ks_after": ksa, 345 | "grouped_chisqr_before": pb, 346 | "grouped_chisqr_after": pa, 347 | "std_median_diff_before": std_diff_med_before, 348 | "std_median_diff_after": std_diff_med_after, 349 | "std_mean_diff_before": std_diff_mean_before, 350 | "std_mean_diff_after": std_diff_mean_after 351 | }) 352 | 353 | var_order = [ 354 | "var", 355 | "ks_before", 356 | "ks_after", 357 | "grouped_chisqr_before", 358 | "grouped_chisqr_after", 359 | "std_median_diff_before", 360 | "std_median_diff_after", 361 | "std_mean_diff_before", 362 | "std_mean_diff_after" 363 | ] 364 | 365 | return pd.DataFrame(test_results)[var_order] if return_table else None 366 | 367 | def compare_categorical(self, return_table=False): 368 | """ 369 | Plots the proportional differences of each enumerated 370 | discrete column for test and control. 371 | 372 | Each chart title contains the results from a 373 | Chi-Square Test of Independence before and after 374 | matching. 375 | See pymatch.prop_test() 376 | 377 | Parameters 378 | ---------- 379 | return_table : bool 380 | Should the function return a table with 381 | test results?
382 | 383 | Returns 384 | ------- 385 | pd.DataFrame() (optional) 386 | Table with the p-values of the Chi-Square contingency test 387 | for each discrete column before and after matching 388 | 389 | """ 390 | def prep_plot(data, var, colname): 391 | t, c = data[data[self.yvar] == 1], data[data[self.yvar] == 0] 392 | # dummy var for counting 393 | dummy = [i for i in t.columns if i not in \ 394 | (var, "match_id", "record_id", "weight")][0] 395 | countt = t[[var, dummy]].groupby(var).count() / len(t) 396 | countc = c[[var, dummy]].groupby(var).count() / len(c) 397 | ret = (countt-countc).dropna() 398 | ret.columns = [colname] 399 | return ret 400 | 401 | title_str = ''' 402 | Proportional Difference (test-control) for {} Before and After Matching 403 | Chi-Square Test for Independence p-value before | after: 404 | {} | {} 405 | ''' 406 | test_results = [] 407 | for col in self.matched_data.columns: 408 | if not uf.is_continuous(col, self.X) and col not in self.exclude: 409 | dbefore = prep_plot(self.data, col, colname="before") 410 | dafter = prep_plot(self.matched_data, col, colname="after") 411 | df = dbefore.join(dafter) 412 | test_results_i = self.prop_test(col) 413 | test_results.append(test_results_i) 414 | 415 | # plotting 416 | df.plot.bar(alpha=.8) 417 | plt.title(title_str.format(col, test_results_i["before"], 418 | test_results_i["after"])) 419 | lim = max(.09, abs(df).max().max()) + .01 420 | plt.ylim((-lim, lim)) 421 | return pd.DataFrame(test_results)[['var', 'before', 'after']] if return_table else None 422 | 423 | def prep_prop_test(self, data, var): 424 | """ 425 | Helper method for running chi-square contingency tests 426 | 427 | Balances the counts of discrete variables between our groups 428 | so that missing levels are replaced with 0. 429 | i.e. if the test group has no records with x as a field 430 | for a given column, make sure the count for x is 0 431 | and not missing.
432 | 433 | Parameters 434 | ---------- 435 | data : pd.DataFrame() 436 | Data to use for counting 437 | var : str 438 | Column to use within data 439 | 440 | Returns 441 | ------- 442 | list 443 | A table (list of lists) of counts for each enumerated field 444 | for the test and control groups. 445 | """ 446 | counts = data.groupby([var, self.yvar]).count().reset_index() 447 | table = [] 448 | for t in (0, 1): 449 | os_counts = counts[counts[self.yvar] == t]\ 450 | .sort_values(var) 451 | cdict = {} 452 | for row in os_counts.iterrows(): 453 | row = row[1] 454 | cdict[row[var]] = row[2] 455 | table.append(cdict) 456 | # fill empty keys as 0 457 | all_keys = set(chain.from_iterable(table)) 458 | for d in table: 459 | d.update((k, 0) for k in all_keys if k not in d) 460 | ctable = [[i[k] for k in sorted(all_keys)] for i in table] 461 | return ctable 462 | 463 | def prop_retained(self): 464 | """ 465 | Returns the proportion of data retained after matching 466 | """ 467 | return len(self.matched_data[self.matched_data[self.yvar] == self.minority]) * 1.0 / \ 468 | len(self.data[self.data[self.yvar] == self.minority]) 469 | 470 | def tune_threshold(self, method, nmatches=1, rng=np.arange(0, .001, .0001)): 471 | """ 472 | Matches data over a grid to optimize threshold value and plots results. 473 | 474 | Parameters 475 | ---------- 476 | method : str 477 | Method used for matching (use "random" for this method) 478 | nmatches : int 479 | Max number of matches per record.
See pymatch.match() 480 | rng : list / np.array() 481 | Grid of threshold values 482 | 483 | Returns 484 | ------- 485 | None 486 | 487 | """ 488 | results = [] 489 | for i in rng: 490 | self.match(method=method, nmatches=nmatches, threshold=i) 491 | results.append(self.prop_retained()) 492 | plt.plot(rng, results) 493 | plt.title("Proportion of Data retained for grid of threshold values") 494 | plt.ylabel("Proportion Retained") 495 | plt.xlabel("Threshold") 496 | plt.xticks(rng) 497 | 498 | def record_frequency(self): 499 | """ 500 | Calculates the frequency of specific records in 501 | the matched dataset 502 | 503 | Returns 504 | ------- 505 | pd.DataFrame() 506 | Frequency table of the number of records 507 | matched once, twice, ..., etc. 508 | """ 509 | freqs = self.matched_data.groupby("record_id")\ 510 | .count().groupby("match_id").count()\ 511 | [["scores"]].reset_index() 512 | freqs.columns = ["freq", "n_records"] 513 | return freqs 514 | 515 | def assign_weight_vector(self): 516 | record_freqs = self.matched_data.groupby("record_id")\ 517 | .count()[['match_id']].reset_index() 518 | record_freqs.columns = ["record_id", "weight"] 519 | fm = record_freqs.merge(self.matched_data, on="record_id") 520 | fm['weight'] = 1/fm['weight'] 521 | self.matched_data = fm 522 | 523 | @staticmethod 524 | def _scores_to_accuracy(m, X, y): 525 | preds = np.array([1.0 if i >= .5 else 0.0 for i in m.predict(X)]) 526 | return (y.values.ravel() == preds).sum() * 1.0 / len(y) 527 | -------------------------------------------------------------------------------- /build/lib/pymatch/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from statsmodels.genmod.generalized_linear_model import GLM 3 | from statsmodels.tools.sm_exceptions import PerfectSeparationError 4 | from statsmodels.distributions.empirical_distribution import ECDF 5 | from scipy import stats 6 | from collections import Counter 7 | from itertools import chain
8 | import sys; sys.path.append(sys.argv[0]) 9 | import pymatch.functions as uf 10 | import statsmodels.api as sm 11 | import patsy 12 | import seaborn as sns 13 | import matplotlib.pyplot as plt 14 | import pandas as pd 15 | import numpy as np 16 | -------------------------------------------------------------------------------- /build/lib/pymatch/functions.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from pymatch import * 3 | import sys 4 | import numpy as np 5 | 6 | 7 | def drop_static_cols(df, yvar, cols=None): 8 | if not cols: 9 | cols = list(df.columns) 10 | # will be static for both groups 11 | cols.pop(cols.index(yvar)) 12 | for col in df[cols]: 13 | n_unique = len(np.unique(df[col])) 14 | if n_unique == 1: 15 | df.drop(col, axis=1, inplace=True) 16 | sys.stdout.write('\rStatic column dropped: {}'.format(col)) 17 | return df 18 | 19 | 20 | def ks_boot(tr, co, nboots=1000): 21 | nx = len(tr) 22 | w = np.concatenate((tr, co)) 23 | obs = len(w) 24 | cutp = nx 25 | bbcount = 0 26 | ss = [] 27 | fs_ks, _ = stats.ks_2samp(tr, co) 28 | for bb in range(nboots): 29 | sw = np.random.choice(w, obs, replace=True) 30 | x1tmp = sw[:cutp] 31 | x2tmp = sw[cutp:] 32 | s_ks, _ = stats.ks_2samp(x1tmp, x2tmp) 33 | ss.append(s_ks) 34 | if s_ks >= fs_ks: 35 | bbcount += 1 36 | ks_boot_pval = bbcount * 1.0 / nboots 37 | return ks_boot_pval 38 | 39 | 40 | def _chi2_distance(tb, cb): 41 | dist = 0 42 | for b in set(np.union1d(list(tb.keys()), list(cb.keys()))): 43 | if b not in tb: 44 | tb[b] = 0 45 | if b not in cb: 46 | cb[b] = 0 47 | xi, yi = tb[b], cb[b] 48 | dist += ((xi - yi) ** 2) * 1.0 / (xi + yi) 49 | return dist * 1.0 / 2 50 | 51 | 52 | def chi2_distance(t, c): 53 | tb, cb, bins = which_bin_hist(t, c) 54 | tb, cb = bin_hist(tb, cb, bins) 55 | return _chi2_distance(tb,cb) 56 | 57 | 58 | def which_bin_hist(t, c): 59 | comb = np.concatenate((t, c)) 60 | bins = np.arange(np.percentile(comb, 99), 
step=10) 61 | t_binned = np.digitize(t, bins) 62 | c_binned = np.digitize(c, bins) 63 | return t_binned, c_binned, bins 64 | 65 | 66 | def bin_hist(t, c, bins): 67 | tc, cc = Counter(t), Counter(c) 68 | 69 | def idx_to_value(d, bins): 70 | result = {} 71 | for k, v in d.items(): 72 | result[int(bins[k-1])] = v 73 | return result 74 | 75 | return idx_to_value(tc, bins), idx_to_value(cc, bins) 76 | 77 | 78 | def grouped_permutation_test(f, t, c, n_samples=1000): 79 | truth = f(t, c) 80 | comb = np.concatenate((t, c)) 81 | times_geq = 0 82 | samp_arr = [] 83 | for i in range(n_samples): 84 | tn = len(t) 85 | combs = comb[:] 86 | np.random.shuffle(combs) 87 | tt = combs[:tn] 88 | cc = combs[tn:] 89 | sample_truth = f(np.array(tt), np.array(cc)) 90 | if sample_truth >= truth: 91 | times_geq += 1 92 | samp_arr.append(sample_truth) 93 | return (times_geq * 1.0) / n_samples, truth 94 | 95 | 96 | def std_diff(a, b): 97 | sd = np.std(a.append(b)) 98 | med = (np.median(a) - np.median(b)) * 1.0 / sd 99 | mean = (np.mean(a) - np.mean(b)) * 1.0 / sd 100 | return med, mean 101 | 102 | 103 | def progress(i, n, prestr=''): 104 | sys.stdout.write('\r{}: {}/{}'.format(prestr, i, n)) 105 | 106 | 107 | def is_continuous(colname, dmatrix): 108 | """ 109 | Check if the colname was treated as continuous in the patsy.dmatrix 110 | Would look like colname[] otherwise 111 | """ 112 | return colname in dmatrix.columns 113 | -------------------------------------------------------------------------------- /dist/matcher-0.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/matcher-0.1.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.0.1.tar.gz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.0.1.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.0.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.0.2.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.0.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.0.3.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.0.5.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.0.5.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.0.6.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.0.6.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.0.7.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.0.7.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.0.8.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.0.8.tar.gz 
-------------------------------------------------------------------------------- /dist/pymatch-0.0.9.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.0.9.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.1.0.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.1.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.1.1.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.1.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.1.2.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.1.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.1.3.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.1.3/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: pymatch 3 | Version: 0.1.3 4 | Summary: Matching techniques for Observational Studies 5 | Home-page: https://github.com/benmiroglio/pymatch 6 | Author: Ben Miroglio 7 | Author-email: benmiroglio@gmail.com 8 | 
License: UNKNOWN 9 | Download-URL: https://github.com/benmiroglio/pymatch/archive/0.1.3.tar.gz 10 | Description: UNKNOWN 11 | Keywords: logistic,regression,matching,observational,study,causal,inference 12 | Platform: UNKNOWN 13 | Requires: seaborn 14 | Requires: statsmodels 15 | Requires: scipy 16 | Requires: patsy 17 | Requires: matplotlib 18 | Requires: pandas 19 | Requires: numpy 20 | -------------------------------------------------------------------------------- /dist/pymatch-0.1.3/pymatch.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: pymatch 3 | Version: 0.1.3 4 | Summary: Matching techniques for Observational Studies 5 | Home-page: https://github.com/benmiroglio/pymatch 6 | Author: Ben Miroglio 7 | Author-email: benmiroglio@gmail.com 8 | License: UNKNOWN 9 | Download-URL: https://github.com/benmiroglio/pymatch/archive/0.1.3.tar.gz 10 | Description: UNKNOWN 11 | Keywords: logistic,regression,matching,observational,study,causal,inference 12 | Platform: UNKNOWN 13 | Requires: seaborn 14 | Requires: statsmodels 15 | Requires: scipy 16 | Requires: patsy 17 | Requires: matplotlib 18 | Requires: pandas 19 | Requires: numpy 20 | -------------------------------------------------------------------------------- /dist/pymatch-0.1.3/pymatch.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.cfg 2 | setup.py 3 | pymatch/Matcher.py 4 | pymatch/__init__.py 5 | pymatch/functions.py 6 | pymatch.egg-info/PKG-INFO 7 | pymatch.egg-info/SOURCES.txt 8 | pymatch.egg-info/dependency_links.txt 9 | pymatch.egg-info/top_level.txt -------------------------------------------------------------------------------- /dist/pymatch-0.1.3/pymatch.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
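The greedy matching step implemented by `Matcher.match(method='min', nmatches=1)` above can be sketched in isolation: each test record is paired with the control record whose propensity score is closest, and kept only if the score gap is within the fuzz threshold. This is a hypothetical, stdlib-only sketch — `greedy_min_match` and its names are illustrative, not part of pymatch's API:

```python
# Hypothetical sketch of Matcher.match(method='min', nmatches=1).
# A test record with no control score within `threshold` is dropped,
# mirroring how pymatch excludes unmatchable minority records.

def greedy_min_match(test_scores, ctrl_scores, threshold=0.001):
    """Return (test_idx, ctrl_idx) pairs; controls may be reused,
    which is why pymatch later assigns 1/frequency weights."""
    pairs = []
    for ti, ts in enumerate(test_scores):
        # nearest control score by absolute difference
        dist, ci = min((abs(cs - ts), ci) for ci, cs in enumerate(ctrl_scores))
        if dist <= threshold:
            pairs.append((ti, ci))
    return pairs

# 0.30 matches 0.2995 (gap 0.0005); 0.70 has no control within 0.001
print(greedy_min_match([0.30, 0.70], [0.2995, 0.95]))  # [(0, 0)]
```

Note that unlike caliper matching with replacement removed, this sketch (like pymatch) allows one control to serve several test records, which is later corrected for via the `weight` column.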
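Similarly, the permutation p-value idea behind `pymatch.functions.grouped_permutation_test` (and, with resampling, `ks_boot`) — pool both samples, reshuffle many times, and report the fraction of shuffled statistics at least as extreme as the observed one — can be sketched with the standard library. `permutation_pvalue` and `mean_diff` are illustrative names, not pymatch functions:

```python
import random

# Hypothetical, stdlib-only sketch of a grouped permutation test:
# under the null hypothesis the group labels are exchangeable, so the
# observed statistic should not be unusual among shuffled relabelings.

def permutation_pvalue(stat, t, c, n_samples=1000, seed=0):
    rng = random.Random(seed)
    observed = stat(t, c)
    pooled = list(t) + list(c)
    times_geq = 0
    for _ in range(n_samples):
        rng.shuffle(pooled)  # random relabeling of the pooled data
        if stat(pooled[:len(t)], pooled[len(t):]) >= observed:
            times_geq += 1
    return times_geq / n_samples, observed

# mean-difference statistic: identical groups give observed = 0.0,
# so every shuffled statistic ties or exceeds it and the p-value is 1.0
mean_diff = lambda a, b: abs(sum(a) / len(a) - sum(b) / len(b))
p, obs = permutation_pvalue(mean_diff, [1, 2, 3, 4], [1, 2, 3, 4])
```

A large p-value (e.g. > 0.05) is what we want *after* matching: it means the test/control difference looks like noise under relabeling.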
/dist/pymatch-0.1.3/pymatch.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | pymatch 2 | -------------------------------------------------------------------------------- /dist/pymatch-0.1.3/pymatch/Matcher.py: -------------------------------------------------------------------------------- 1 | from pymatch import * 2 | import functions as uf 3 | 4 | class Matcher: 5 | ''' 6 | Matcher Class -- Match data for an observational study. 7 | 8 | Args: 9 | test (pd.DataFrame): Data representing the test group 10 | control (pd.DataFrame): Data representing the control group 11 | formula (str): custom formula to use for logistic regression 12 | i.e. "Y ~ x1 + x2 + ..." 13 | yvar (str): Name of dependent variable (the treatment) 14 | exclude (list): List of variables to ignore in regression/matching. 15 | Useful for unique identifiers 16 | ''' 17 | 18 | def __init__(self, test, control, yvar, formula=None, exclude=[]): 19 | # configure plots for ipynb 20 | plt.rcParams["figure.figsize"] = (10, 5) 21 | # assign unique indices to test and control 22 | t, c = [i.copy().reset_index(drop=True) for i in (test, control)] 23 | c.index += len(t) 24 | self.data = t.append(c).dropna(axis=1, how="all") 25 | self.control_color = "#1F77B4" 26 | self.test_color = "#FF7F0E" 27 | self.yvar = yvar 28 | self.exclude = exclude + [self.yvar] + ['scores', 'match_id'] 29 | self.formula = formula 30 | self.models = [] 31 | self.swdata = None 32 | self.model_accurracy = [] 33 | self.data[yvar] = self.data[yvar].astype(int) # should be binary 0, 1 34 | self.xvars = [i for i in self.data.columns if i not in self.exclude and i != yvar] 35 | self.matched_data = [] 36 | self.y, self.X = patsy.dmatrices('{} ~ {}'.format(yvar, '+'.join(self.xvars)), data=self.data, 37 | return_type='dataframe') 38 | self.xvars = [i for i in self.data.columns if i not in exclude] 39 | self.test = self.data[self.data[yvar] == True] 40 | self.control =
self.data[self.data[yvar] == False] 41 | self.testn = len(self.test) 42 | self.controln = len(self.control) 43 | self.minority, self.majority = \ 44 | [i[1] for i in sorted(zip([self.testn, self.controln], [1, 0]), 45 | key=lambda x: x[0])] 46 | print 'Formula:\n{} ~ {}'.format(yvar, '+'.join(self.xvars)) 47 | print 'n majority:', len(self.data[self.data[yvar] == self.majority]) 48 | print 'n minority:', len(self.data[self.data[yvar] == self.minority]) 49 | 50 | def fit_scores(self, balance=True, nmodels=None): 51 | """ 52 | Args: 53 | balance (bool): Should balanced datasets be used? 54 | (n_control ~ n_test) 55 | nmodels (int): How many models should be fit? 56 | Score becomes the average of the models if nmodels > 1 57 | """ 58 | # reset models if refitting 59 | if len(self.models) > 0: 60 | self.models = [] 61 | if len(self.model_accurracy) > 0: 62 | self.model_accurracy = [] 63 | if not self.formula: 64 | # use all columns in the model 65 | self.formula = '{} ~ {}'.format(self.yvar, '+'.join(self.xvars)) 66 | if balance: 67 | if nmodels is None: 68 | # fit multiple models based on imbalance severity (rounded up to nearest tenth) 69 | minor, major = [self.data[self.data[self.yvar] == i] for i in (self.minority, self.majority)] 70 | nmodels = int(np.ceil((len(major) / len(minor)) / 10) * 10) 71 | self.nmodels = nmodels 72 | i = 0 73 | errors = 0 74 | while i < nmodels and errors < 5: 75 | uf.progress(i+1, nmodels, 76 | prestr="Fitting {} Models on Balanced Samples...".format(nmodels)) 77 | 78 | # sample from majority to create a balanced dataset 79 | 80 | df = self.balanced_sample() 81 | df = pd.concat([uf.drop_static_cols(df[df[self.yvar] == 1], yvar=self.yvar), 82 | uf.drop_static_cols(df[df[self.yvar] == 0], yvar=self.yvar)]) 83 | y_samp, X_samp = patsy.dmatrices(self.formula, data=df, return_type='dataframe') 84 | X_samp.drop(self.yvar, axis=1, errors='ignore', inplace=True) 85 | 86 | glm = GLM(y_samp, X_samp, family=sm.families.Binomial()) 87 | try: 88 | res =
glm.fit() 89 | self.model_accurracy.append(self._scores_to_accuracy(res, X_samp, y_samp)) 90 | self.models.append(res) 91 | i += 1 92 | except Exception as e: 93 | errors += 1 # to avoid infinite loop for misspecified matrix 94 | print 'Error: {}'.format(e) 95 | 96 | print "\nAverage Accuracy:", "{}%".\ 97 | format(round(np.mean(self.model_accurracy) * 100, 2)) 98 | else: 99 | # ignore any imbalance and fit one model 100 | self.nmodels = 1 101 | print '\nFitting 1 (Unbalanced) Model...' 102 | glm = GLM(self.y, self.X, family=sm.families.Binomial()) 103 | res = glm.fit() 104 | self.model_accurracy.append(self._scores_to_accuracy(res, self.X, self.y)) 105 | self.models.append(res) 106 | print "Accuracy", round(np.mean(self.model_accurracy[0]) * 100, 2) 107 | 108 | def predict_scores(self): 109 | """ 110 | Predict Propensity scores for each observation 111 | """ 112 | scores = np.zeros(len(self.X)) 113 | for i in range(self.nmodels): 114 | uf.progress(i+1, self.nmodels, "Calculating Propensity Scores...") 115 | m = self.models[i] 116 | scores += m.predict(self.X[m.params.index]) 117 | self.data['scores'] = scores/self.nmodels 118 | 119 | def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10): 120 | """ 121 | Match data 122 | 123 | Args: 124 | threshold (float): threshold for "exact" matching 125 | i.e. |score_x - score_y| <= threshold 126 | nmatches (int): How many control profiles should be matched 127 | (at most) to test 128 | method (str): Strategy for when multiple control profiles 129 | are suitable matches for a single test profile 130 | "random" - choose randomly 131 | "min" - choose the profile with the closest score 132 | max_rand (int): max number of profiles to consider for random tie-breaks 133 | """ 134 | if 'scores' not in self.data.columns: 135 | print "Propensity Scores have not been calculated. Using defaults..."
136 | self.fit_scores() 137 | self.predict_scores() 138 | test_scores = self.data[self.data[self.yvar]==True][['scores']] 139 | ctrl_scores = self.data[self.data[self.yvar]==False][['scores']] 140 | result, match_ids = [], [] 141 | for i in range(len(test_scores)): 142 | # uf.progress(i+1, len(test_scores), 'Matching Control to Test...') 143 | match_id = i 144 | score = test_scores.iloc[i] 145 | if method == 'random': 146 | bool_match = abs(ctrl_scores - score) <= threshold 147 | matches = ctrl_scores.loc[bool_match[bool_match.scores].index] 148 | elif method == 'min': 149 | matches = abs(ctrl_scores - score).sort_values('scores').head(nmatches) 150 | else: 151 | raise AssertionError, "Invalid method parameter, use ('random', 'min')" 152 | if len(matches) == 0: 153 | continue 154 | # randomly choose nmatches indices, if len(matches) > nmatches 155 | select = nmatches if method != 'random' else np.random.choice(range(1, max_rand+1), 1) 156 | chosen = np.random.choice(matches.index, min(select, nmatches), replace=False) 157 | result.extend([test_scores.index[i]] + list(chosen)) 158 | match_ids.extend([i] * (len(chosen)+1)) 159 | self.matched_data = self.data.loc[result] 160 | self.matched_data['match_id'] = match_ids 161 | 162 | def select_from_design(self, cols): 163 | d = pd.DataFrame() 164 | for c in cols: 165 | d = pd.concat([d, self.X.select(lambda x: x.startswith(c), axis=1)], axis=1) 166 | return d 167 | 168 | def balanced_sample(self, data=None): 169 | if not data: 170 | data = self.data 171 | minor, major = data[data[self.yvar] == self.minority], data[data[self.yvar] == self.majority] 172 | return major.sample(len(minor)).append(minor).dropna() 173 | 174 | def plot_scores(self): 175 | assert 'scores' in self.data.columns, "Propensity scores haven't been calculated, use Matcher.predict_scores()" 176 | sns.distplot(self.data[self.data[self.yvar]==False].scores, label='Control') 177 | sns.distplot(self.data[self.data[self.yvar]==True].scores, label='Test') 178 |
plt.legend(loc='upper right') 179 | plt.xlim((0, 1)) 180 | plt.title("Propensity Scores Before Matching") 181 | plt.ylabel("Percentage (%)") 182 | plt.xlabel("Scores") 183 | 184 | 185 | def prop_test(self, col): 186 | if not uf.is_continuous(col, self.X) and col not in self.exclude: 187 | pval_before = round(stats.chi2_contingency(self.prep_prop_test(self.data, col))[1], 6) 188 | pval_after = round(stats.chi2_contingency(self.prep_prop_test(self.matched_data, col))[1], 6) 189 | return {'var': col, 'before': pval_before, 'after': pval_after} 190 | else: 191 | print "{} is a continuous variable".format(col) 192 | 193 | def compare_continuous(self, save=False, return_table=False): 194 | test_results = [] 195 | for col in self.matched_data.columns: 196 | if uf.is_continuous(col, self.X) and col not in self.exclude: 197 | if save: pp = PdfPages("{}-ecdf.pdf".format(col)) 198 | # organize data 199 | trb, cob = self.test[col], self.control[col] 200 | tra = self.matched_data[self.matched_data[self.yvar]==True][col] 201 | coa = self.matched_data[self.matched_data[self.yvar]==False][col] 202 | xtb, xcb = ECDF(trb), ECDF(cob) 203 | xta, xca = ECDF(tra), ECDF(coa) 204 | 205 | # before/after stats 206 | std_diff_med_before, std_diff_mean_before = uf.std_diff(trb, cob) 207 | std_diff_med_after, std_diff_mean_after = uf.std_diff(tra, coa) 208 | pb, truthb = uf.grouped_permutation_test(uf.chi2_distance, trb, cob) 209 | pa, trutha = uf.grouped_permutation_test(uf.chi2_distance, tra, coa) 210 | ksb = round(uf.ks_boot(trb, cob, nboots=1000), 6) 211 | ksa = round(uf.ks_boot(tra, coa, nboots=1000), 6) 212 | 213 | # plotting 214 | f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(12, 5)) 215 | ax1.plot(xcb.x, xcb.y, label='Control', color=self.control_color) 216 | ax1.plot(xtb.x, xtb.y, label='Test', color=self.test_color) 217 | 218 | 219 | 220 |
title_str = ''' 221 | ECDF for {} {} Matching 222 | KS p-value: {} 223 | Grouped Perm p-value: {} 224 | Std. Median Difference: {} 225 | Std. Mean Difference: {} 226 | ''' 227 | ax1.set_title(title_str\ 228 | .format(col, "before", ksb, pb, std_diff_med_before, std_diff_mean_before)) 229 | ax2.plot(xca.x, xca.y, label='Control') 230 | ax2.plot(xta.x, xta.y, label='Test') 231 | ax2.set_title(title_str\ 232 | .format(col, "after", ksa, pa, std_diff_med_after, std_diff_mean_after)) 233 | ax2.legend(loc="lower right") 234 | plt.xlim((0, np.percentile(xta.x, 99))) 235 | 236 | test_results.append({ 237 | "var": col, 238 | "ks_before": ksb, 239 | "ks_after": ksa, 240 | "perm_chisqr_before": pb, 241 | "grouped_chisqr_after": pa, 242 | "std_median_diff_before": std_diff_med_before, 243 | "std_median_diff_after": std_diff_med_after, 244 | "std_mean_diff_before": std_diff_mean_before, 245 | "std_mean_diff_after": std_diff_mean_after 246 | }) 247 | 248 | if save: pp.savefig() 249 | 250 | var_order=["var", 251 | "ks_before", 252 | "ks_after", 253 | "perm_chisqr_before", 254 | "grouped_chisqr_after", 255 | "std_median_diff_before", 256 | "std_median_diff_after", 257 | "std_mean_diff_before", 258 | "std_mean_diff_after"] 259 | if save: pp.close() 260 | return pd.DataFrame(test_results)[var_order] if return_table else None 261 | 262 | def compare_discrete(self, return_table=False): 263 | def prep_plot(data, var, colname): 264 | t, c = data[data[self.yvar]==1], data[data[self.yvar]==0] 265 | #dummy var for counting 266 | dummy = [i for i in t.columns if i != var][0] 267 | countt = t[[var, dummy]].groupby(var).count() / len(t) 268 | countc = c[[var, dummy]].groupby(var).count() / len(c) 269 | ret = (countt-countc).dropna() 270 | ret.columns = [colname] 271 | return ret 272 | 273 | title_str = ''' 274 | Proportional Difference (test-control) for {} Before and After Matching 275 | Chi-Square Test for Independence p-value before | after: 276 | {} | {} 277 | ''' 278 | test_results = [] 
279 | for col in self.matched_data.columns: 280 | if not uf.is_continuous(col, self.X) and col not in self.exclude: 281 | dbefore = prep_plot(self.data, col, colname="before") 282 | dafter = prep_plot(self.matched_data, col, colname="after") 283 | df = dbefore.join(dafter) 284 | test_results_i = self.prop_test(col) 285 | test_results.append(test_results_i) 286 | 287 | # plotting 288 | df.plot.bar(alpha=.8) 289 | plt.title(title_str.format(col, test_results_i["before"], test_results_i["after"])) 290 | lim = max(.09, abs(df).max().max()) + .01 291 | plt.ylim((-lim, lim)) 292 | return pd.DataFrame(test_results)[['var', 'before', 'after']] if return_table else None 293 | 294 | 295 | def prep_prop_test(self, data, var): 296 | counts = data.groupby([var, self.yvar]).count().reset_index() 297 | table = [] 298 | for t in (0, 1): 299 | os_counts = counts[counts[self.yvar] ==t]\ 300 | .sort_values(var) 301 | cdict = {} 302 | for row in os_counts.iterrows(): 303 | row = row[1] 304 | cdict[row[var]] = row[2] 305 | table.append(cdict) 306 | # fill empty keys as 0 307 | all_keys = set(chain.from_iterable(table)) 308 | for d in table: 309 | d.update((k, 0) for k in all_keys if k not in d) 310 | ctable = [[i[k] for k in sorted(all_keys)] for i in table] 311 | return ctable 312 | 313 | def prop_retained(self): 314 | return len(self.matched_data[self.matched_data[self.yvar] == self.minority]) * 1.0 / \ 315 | len(self.data[self.data[self.yvar] == self.minority]) 316 | 317 | def tune_threshold(self, method, nmatches=1, rng=np.arange(0, .001, .0001)): 318 | results = [] 319 | for i in rng: 320 | self.match(method=method, nmatches=nmatches, threshold=i) 321 | results.append(self.prop_retained()) 322 | plt.plot(rng, results) 323 | plt.title("Proportion of Data retained for grid of threshold values") 324 | plt.ylabel("Proportion Retained") 325 | plt.xlabel("Threshold") 326 | plt.xticks(rng) 327 | 328 | 329 | def _scores_to_accuracy(self, m, X, y): 330 | preds = [1.0 if i >= .5 else 0.0 
for i in m.predict(X)] 331 | return (y == preds).sum() * 1.0 / len(y) 332 | 333 | -------------------------------------------------------------------------------- /dist/pymatch-0.1.3/pymatch/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from statsmodels.genmod.generalized_linear_model import GLM 3 | from statsmodels.tools.sm_exceptions import PerfectSeparationError 4 | from statsmodels.distributions.empirical_distribution import ECDF 5 | from scipy import stats 6 | from collections import Counter 7 | from itertools import chain 8 | from utils import functions as uf 9 | import statsmodels.api as sm 10 | import patsy 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | import numpy as np 15 | import sys 16 | -------------------------------------------------------------------------------- /dist/pymatch-0.1.3/pymatch/functions.py: -------------------------------------------------------------------------------- 1 | from pymatch import * 2 | from pymatch.utils import * 3 | 4 | def drop_static_cols(df, yvar, cols=None): 5 | if not cols: 6 | cols = list(df.columns) 7 | # will be static for both groups 8 | cols.pop(cols.index(yvar)) 9 | for col in df[cols]: 10 | n_unique = len(np.unique(df[col])) 11 | if n_unique == 1: 12 | df.drop(col, axis=1, inplace=True) 13 | return df 14 | 15 | 16 | def ks_boot(tr, co, nboots=1000): 17 | nx = len(tr) 18 | ny = len(co) 19 | w = np.concatenate((tr, co)) 20 | obs = len(w) 21 | cutp = nx 22 | ks_boot_pval = None 23 | bbcount = 0 24 | ss = [] 25 | fs_ks, _ = stats.ks_2samp(tr, co) 26 | for bb in range(nboots): 27 | sw = np.random.choice(w, obs, replace=True) 28 | x1tmp = sw[:cutp] 29 | x2tmp = sw[cutp:] 30 | s_ks, _ = stats.ks_2samp(x1tmp, x2tmp) 31 | ss.append(s_ks) 32 | if s_ks >= fs_ks: 33 | bbcount += 1 34 | ks_boot_pval = bbcount * 1.0 / nboots 35 | return ks_boot_pval 36 | 37 | def _chi2_distance(tb, cb): 38 
| dist = 0 39 | for b in np.union1d(tb.keys(), cb.keys()): 40 | if b not in tb: 41 | tb[b] = 0 42 | if b not in cb: 43 | cb[b] = 0 44 | xi, yi = tb[b], cb[b] 45 | dist += ((xi - yi) ** 2) * 1.0 / (xi + yi) 46 | return dist * 1.0 / 2 47 | 48 | def chi2_distance(t, c): 49 | tb, cb, bins = which_bin_hist(t, c) 50 | tb, cb = bin_hist(tb, cb, bins) 51 | return _chi2_distance(tb, cb) 52 | 53 | def which_bin_hist(t, c): 54 | comb = np.concatenate((t, c)) 55 | bins = np.arange(np.percentile(comb, 99), step=10) 56 | t_binned = np.digitize(t, bins) 57 | c_binned = np.digitize(c, bins) 58 | return t_binned, c_binned, bins 59 | 60 | def bin_hist(t, c, bins): 61 | tc, cc = Counter(t), Counter(c) 62 | def idx_to_value(d, bins): 63 | result = {} 64 | for k, v in d.items(): 65 | result[int(bins[k-1])] = v 66 | return result 67 | return idx_to_value(tc, bins), idx_to_value(cc, bins) 68 | 69 | def grouped_permutation_test(f, t, c, n_samples=1000): 70 | truth = f(t, c) 71 | comb = np.concatenate((t, c)) 72 | times_geq = 0 73 | samp_arr = [] 74 | for i in range(n_samples): 75 | tn = len(t) 76 | combs = comb[:] 77 | np.random.shuffle(combs) 78 | tt = combs[:tn] 79 | cc = combs[tn:] 80 | sample_truth = f(np.array(tt), np.array(cc)) 81 | if sample_truth >= truth: 82 | times_geq += 1 83 | samp_arr.append(sample_truth) 84 | return (times_geq * 1.0) / n_samples, truth 85 | 86 | def std_diff(a, b): 87 | sd = np.std(a.append(b)) 88 | med = (np.median(a) - np.median(b)) * 1.0 / sd 89 | mean = (np.mean(a) - np.mean(b)) * 1.0 / sd 90 | return med, mean 91 | 92 | def progress(i, n, prestr=''): 93 | sys.stdout.write('\r{}: {}/{}'.format(prestr, i, n)) 94 | 95 | def is_continuous(colname, dmatrix): 96 | ''' 97 | Check if the colname was treated as continuous in the patsy.dmatrix 98 | Would look like colname[] otherwise 99 | ''' 100 | return colname in dmatrix.columns -------------------------------------------------------------------------------- /dist/pymatch-0.1.3/setup.cfg:
-------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [egg_info] 5 | tag_build = 6 | tag_date = 0 7 | tag_svn_revision = 0 8 | 9 | -------------------------------------------------------------------------------- /dist/pymatch-0.1.3/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | dependencies =[ 3 | 'seaborn', 4 | 'statsmodels', 5 | 'scipy', 6 | 'patsy', 7 | 'matplotlib', 8 | 'pandas', 9 | 'numpy' 10 | ] 11 | 12 | VERSION = "0.1.3" 13 | 14 | setup( 15 | name = 'pymatch', 16 | packages = ['pymatch'], 17 | version = VERSION, 18 | description = 'Matching techniques for Observational Studies', 19 | author = 'Ben Miroglio', 20 | author_email = 'benmiroglio@gmail.com', 21 | url = 'https://github.com/benmiroglio/pymatch', 22 | download_url = 'https://github.com/benmiroglio/pymatch/archive/{}.tar.gz'.format(VERSION), 23 | keywords = ['logistic', 'regression', 'matching', 'observational', 'study', 'causal', 'inference'], 24 | include_package_data=True, 25 | requires=dependencies 26 | ) -------------------------------------------------------------------------------- /dist/pymatch-0.1.4.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.1.4.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.1.5.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.1.5.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.1.6.tar.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.1.6.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.1.7.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.1.7.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.1.8.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.1.8.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.1.9.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.1.9.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.1.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.2.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.2.0.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.2.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.2.1.tar.gz 
-------------------------------------------------------------------------------- /dist/pymatch-0.2.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.2.2.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.2.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.2.3.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.2.4.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.2.4.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.2.5.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.2.5.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.2.6.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.2.6.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.2.8-py2.py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.2.8-py2.py3-none-any.whl -------------------------------------------------------------------------------- 
/dist/pymatch-0.2.8.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.2.8.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.2.9-py2.py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.2.9-py2.py3-none-any.whl -------------------------------------------------------------------------------- /dist/pymatch-0.2.9.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.2.9.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.3.0-py2-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.0-py2-none-any.whl -------------------------------------------------------------------------------- /dist/pymatch-0.3.0-py2.py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.0-py2.py3-none-any.whl -------------------------------------------------------------------------------- /dist/pymatch-0.3.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.0.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.3.1-py2-none-any.whl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.1-py2-none-any.whl -------------------------------------------------------------------------------- /dist/pymatch-0.3.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.1.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.3.2-py2-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.2-py2-none-any.whl -------------------------------------------------------------------------------- /dist/pymatch-0.3.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.2.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.3.3-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.3-py3-none-any.whl -------------------------------------------------------------------------------- /dist/pymatch-0.3.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.3.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.3.4.1-py3-none-any.whl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.4.1-py3-none-any.whl -------------------------------------------------------------------------------- /dist/pymatch-0.3.4.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.4.1.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.3.4.2-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.4.2-py3-none-any.whl -------------------------------------------------------------------------------- /dist/pymatch-0.3.4.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.3.4.2.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.5.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.5.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.6.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.6.tar.gz -------------------------------------------------------------------------------- /dist/pymatch-0.7.tar.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/benmiroglio/pymatch/b5582b6dd3399cbdfc7bcf06603e34d979ff9ba4/dist/pymatch-0.7.tar.gz -------------------------------------------------------------------------------- /misc/.coo: -------------------------------------------------------------------------------- 1 | https://orig00.deviantart.net/3ba4/f/2012/065/d/a/highland_coo_by_charlymarion-d4rxb7p.jpg 2 | -------------------------------------------------------------------------------- /pymatch.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: pymatch 3 | Version: 0.3.4.2 4 | Summary: Matching techniques for Observational Studies 5 | Home-page: https://github.com/benmiroglio/pymatch 6 | Author: Ben Miroglio 7 | Author-email: benmiroglio@gmail.com 8 | License: UNKNOWN 9 | Download-URL: https://github.com/benmiroglio/pymatch/archive/0.3.4.2.tar.gz 10 | Description: UNKNOWN 11 | Keywords: logistic,regression,matching,observational,study,causal,inference 12 | Platform: UNKNOWN 13 | -------------------------------------------------------------------------------- /pymatch.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.cfg 3 | setup.py 4 | pymatch/Matcher.py 5 | pymatch/__init__.py 6 | pymatch/functions.py 7 | pymatch.egg-info/PKG-INFO 8 | pymatch.egg-info/SOURCES.txt 9 | pymatch.egg-info/dependency_links.txt 10 | pymatch.egg-info/requires.txt 11 | pymatch.egg-info/top_level.txt -------------------------------------------------------------------------------- /pymatch.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pymatch.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | seaborn 2 | statsmodels 3 | scipy 4 | patsy 5 | 
matplotlib 6 | pandas 7 | numpy 8 | -------------------------------------------------------------------------------- /pymatch.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | pymatch 2 | -------------------------------------------------------------------------------- /pymatch/Matcher.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from pymatch import * 3 | import pymatch.functions as uf 4 | 5 | class Matcher: 6 | """ 7 | Matcher Class -- Match data for an observational study. 8 | 9 | Parameters 10 | ---------- 11 | test : pd.DataFrame 12 | Data representing the test group 13 | control : (pd.DataFrame) 14 | Data representing the control group 15 | formula : str (optional) 16 | custom formula to use for logistic regression 17 | i.e. "Y ~ x1 + x2 + ..." 18 | yvar : str (optional) 19 | Name of dependent variable (the treatment) 20 | exclude : list (optional) 21 | List of variables to ignore in regression/matching. 
22 | Useful for unique identifiers 23 | """ 24 | 25 | def __init__(self, test, control, yvar, formula=None, exclude=[]): 26 | # configure plots for ipynb 27 | plt.rcParams["figure.figsize"] = (10, 5) 28 | # variables generated during matching 29 | aux_match = ['scores', 'match_id', 'weight', 'record_id'] 30 | # assign unique indices to test and control 31 | t, c = [i.copy().reset_index(drop=True) for i in (test, control)] 32 | t = t.dropna(axis=1, how="all") 33 | c = c.dropna(axis=1, how="all") 34 | c.index += len(t) 35 | self.data = t.dropna(axis=1, how='all').append(c.dropna(axis=1, how='all'), sort=True) 36 | self.control_color = "#1F77B4" 37 | self.test_color = "#FF7F0E" 38 | self.yvar = yvar 39 | self.exclude = exclude + [self.yvar] + aux_match 40 | self.formula = formula 41 | self.nmodels = 1 # for now 42 | self.models = [] 43 | self.swdata = None 44 | self.model_accuracy = [] 45 | self.data[yvar] = self.data[yvar].astype(int) # should be binary 0, 1 46 | self.xvars = [i for i in self.data.columns if i not in self.exclude and i != yvar] 47 | self.data = self.data.dropna(subset=self.xvars) 48 | self.matched_data = [] 49 | self.xvars_escaped = ["Q('{}')".format(x) for x in self.xvars] 50 | self.yvar_escaped = "Q('{}')".format(self.yvar) 51 | self.y, self.X = patsy.dmatrices('{} ~ {}'.format(self.yvar_escaped, '+'.join(self.xvars_escaped)), 52 | data=self.data, return_type='dataframe') 53 | self.xvars = [i for i in self.data.columns if i not in self.exclude] 54 | self.test = self.data[self.data[yvar] == True] 55 | self.control = self.data[self.data[yvar] == False] 56 | self.testn = len(self.test) 57 | self.controln = len(self.control) 58 | self.minority, self.majority = [i[1] for i in sorted(zip([self.testn, self.controln], 59 | [1, 0]), 60 | key=lambda x: x[0])] 61 | print('Formula:\n{} ~ {}'.format(yvar, '+'.join(self.xvars))) 62 | print('n majority:', len(self.data[self.data[yvar] == self.majority])) 63 | print('n minority:', len(self.data[self.data[yvar] ==
self.minority])) 64 | 65 | def fit_scores(self, balance=True, nmodels=None): 66 | """ 67 | Fits logistic regression model(s) used for 68 | generating propensity scores 69 | 70 | Parameters 71 | ---------- 72 | balance : bool 73 | Should balanced datasets be used? 74 | (n_control == n_test) 75 | nmodels : int 76 | How many models should be fit? 77 | Score becomes the average of the models if nmodels > 1 78 | 79 | Returns 80 | ------- 81 | None 82 | """ 83 | # reset models if refitting 84 | if len(self.models) > 0: 85 | self.models = [] 86 | if len(self.model_accuracy) > 0: 87 | self.model_accuracy = [] 88 | if not self.formula: 89 | # use all columns in the model 90 | self.xvars_escaped = ["Q('{}')".format(x) for x in self.xvars] 91 | self.yvar_escaped = "Q('{}')".format(self.yvar) 92 | self.formula = '{} ~ {}'.format(self.yvar_escaped, '+'.join(self.xvars_escaped)) 93 | if balance: 94 | if nmodels is None: 95 | # fit multiple models based on imbalance severity (rounded up to nearest tenth) 96 | minor, major = [self.data[self.data[self.yvar] == i] for i in (self.minority, 97 | self.majority)] 98 | nmodels = int(np.ceil((len(major) / len(minor)) / 10) * 10) 99 | self.nmodels = nmodels 100 | i = 0 101 | errors = 0 102 | while i < nmodels and errors < 5: 103 | uf.progress(i+1, nmodels, prestr="Fitting Models on Balanced Samples") 104 | # sample from majority to create a balanced dataset 105 | df = self.balanced_sample() 106 | df = pd.concat([uf.drop_static_cols(df[df[self.yvar] == 1], yvar=self.yvar), 107 | uf.drop_static_cols(df[df[self.yvar] == 0], yvar=self.yvar)], 108 | sort=True) 109 | y_samp, X_samp = patsy.dmatrices(self.formula, data=df, return_type='dataframe') 110 | X_samp.drop(self.yvar, axis=1, errors='ignore', inplace=True) 111 | glm = GLM(y_samp, X_samp, family=sm.families.Binomial()) 112 | 113 | try: 114 | res = glm.fit() 115 | self.model_accuracy.append(self._scores_to_accuracy(res, X_samp, y_samp)) 116 | self.models.append(res) 117 | i = i + 1 118 |
except Exception as e: 119 | errors = errors + 1 # to avoid infinite loop for misspecified matrix 120 | print('Error: {}'.format(e)) 121 | print("\nAverage Accuracy:", "{}%". 122 | format(round(np.mean(self.model_accuracy) * 100, 2))) 123 | else: 124 | # ignore any imbalance and fit one model 125 | print('Fitting 1 (Unbalanced) Model...') 126 | glm = GLM(self.y, self.X, family=sm.families.Binomial()) 127 | res = glm.fit() 128 | self.model_accuracy.append(self._scores_to_accuracy(res, self.X, self.y)) 129 | self.models.append(res) 130 | print("\nAccuracy", round(np.mean(self.model_accuracy[0]) * 100, 2)) 131 | 132 | 133 | def predict_scores(self): 134 | """ 135 | Predict Propensity scores for each observation. 136 | Adds a "scores" column to self.data 137 | 138 | Returns 139 | ------- 140 | None 141 | """ 142 | scores = np.zeros(len(self.X)) 143 | for i in range(self.nmodels): 144 | m = self.models[i] 145 | scores += m.predict(self.X[m.params.index]) 146 | self.data['scores'] = scores/self.nmodels 147 | 148 | def match(self, threshold=0.001, nmatches=1, method='min', max_rand=10): 149 | """ 150 | Finds suitable match(es) for each record in the minority 151 | dataset, if one exists. Records are excluded from the final 152 | matched dataset if there are no suitable matches. 153 | 154 | self.matched_data contains the matched dataset once this 155 | method is called 156 | 157 | Parameters 158 | ---------- 159 | threshold : float 160 | threshold for fuzzy matching 161 | i.e.
|score_x - score_y| <= threshold 162 | nmatches : int 163 | How many majority profiles should be matched 164 | (at most) to each minority profile 165 | method : str 166 | Strategy for when multiple majority profiles 167 | are suitable matches for a single minority profile 168 | "random" - choose randomly (fast, good for testing) 169 | "min" - choose the profile with the closest score 170 | max_rand : int 171 | max number of profiles to consider when using random tie-breaks 172 | 173 | Returns 174 | ------- 175 | None 176 | """ 177 | if 'scores' not in self.data.columns: 178 | print("Propensity Scores have not been calculated. Using defaults...") 179 | self.fit_scores() 180 | self.predict_scores() 181 | test_scores = self.data[self.data[self.yvar]==True][['scores']] 182 | ctrl_scores = self.data[self.data[self.yvar]==False][['scores']] 183 | result, match_ids = [], [] 184 | for i in range(len(test_scores)): 185 | # uf.progress(i+1, len(test_scores), 'Matching Control to Test...') 186 | match_id = i 187 | score = test_scores.iloc[i] 188 | if method == 'random': 189 | bool_match = abs(ctrl_scores - score) <= threshold 190 | matches = ctrl_scores.loc[bool_match[bool_match.scores].index] 191 | elif method == 'min': 192 | matches = abs(ctrl_scores - score).sort_values('scores').head(nmatches) 193 | else: 194 | raise AssertionError("Invalid method parameter, use ('random', 'min')") 195 | if len(matches) == 0: 196 | continue 197 | # randomly choose nmatches indices, if len(matches) > nmatches 198 | select = nmatches if method != 'random' else np.random.choice(range(1, max_rand+1), 1) 199 | chosen = np.random.choice(matches.index, min(select, nmatches), replace=False) 200 | result.extend([test_scores.index[i]] + list(chosen)) 201 | match_ids.extend([i] * (len(chosen)+1)) 202 | self.matched_data = self.data.loc[result] 203 | self.matched_data['match_id'] = match_ids 204 | self.matched_data['record_id'] = self.matched_data.index 205 | 206 | def select_from_design(self, cols): 207 | d 
= pd.DataFrame() 208 | for c in cols: 209 | d = pd.concat([d, self.X.loc[:, self.X.columns.str.startswith(c)]], axis=1, sort=True) 210 | return d 211 | 212 | def balanced_sample(self, data=None): 213 | if data is None: 214 | data = self.data 215 | minor, major = data[data[self.yvar] == self.minority], \ 216 | data[data[self.yvar] == self.majority] 217 | return pd.concat([major.sample(len(minor)), minor], sort=True).dropna() 218 | 219 | def plot_scores(self): 220 | """ 221 | Plots the distribution of propensity scores before matching between 222 | our test and control groups 223 | """ 224 | assert 'scores' in self.data.columns, \ 225 | "Propensity scores haven't been calculated, use Matcher.predict_scores()" 226 | sns.distplot(self.data[self.data[self.yvar]==0].scores, label='Control') 227 | sns.distplot(self.data[self.data[self.yvar]==1].scores, label='Test') 228 | plt.legend(loc='upper right') 229 | plt.xlim((0, 1)) 230 | plt.title("Propensity Scores Before Matching") 231 | plt.ylabel("Density") 232 | plt.xlabel("Scores") 233 | 234 | def prop_test(self, col): 235 | """ 236 | Performs a Chi-Square test of independence on the given column. 237 | See stats.chi2_contingency() 238 | 239 | Parameters 240 | ---------- 241 | col : str 242 | Name of column on which the test should be performed 243 | 244 | Returns 245 | ------- 246 | dict 247 | {'var': , 248 | 'before': , 249 | 'after': } 250 | 251 | 252 | """ 253 | if not uf.is_continuous(col, self.X) and col not in self.exclude: 254 | pval_before = round(stats.chi2_contingency(self.prep_prop_test(self.data, 255 | col))[1], 6) 256 | pval_after = round(stats.chi2_contingency(self.prep_prop_test(self.matched_data, 257 | col))[1], 6) 258 | return {'var':col, 'before':pval_before, 'after':pval_after} 259 | else: 260 | print("{} is a continuous variable".format(col)) 261 | 262 | def compare_continuous(self, save=False, return_table=False): 263 | """ 264 | Plots the ECDFs for continuous features before and 265 | after matching. 
Each chart title contains test results 266 | and statistics to summarize how similar the two distributions 267 | are (we want them to be close after matching). 268 | 269 | Tests performed: 270 | Kolmogorov-Smirnov Goodness-of-Fit Test (KS-test) 271 | This test statistic is calculated on 1000 272 | permuted samples of the data, generating 273 | an empirical p-value. See pymatch.functions.ks_boot() 274 | This is an adaptation of the ks.boot() method in 275 | the R "Matching" package 276 | https://www.rdocumentation.org/packages/Matching/versions/4.9-2/topics/ks.boot 277 | Chi-Square Distance: 278 | Similarly, this distance metric is calculated on 279 | 1000 permuted samples. 280 | See pymatch.functions.grouped_permutation_test() 281 | 282 | Other included Stats: 283 | Standardized mean and median differences: 284 | how many standard deviations apart the mean/median 285 | of our groups are before and after matching, 286 | i.e. abs(mean(control) - mean(test)) / std(control.union(test)) 287 | 288 | Parameters 289 | ---------- 290 | return_table : bool 291 | Should the function return a table with tests and statistics? 
292 | 293 | Returns 294 | ------- 295 | pd.DataFrame (optional) 296 | Table of before/after statistics if return_table == True 297 | 298 | 299 | """ 300 | test_results = [] 301 | for col in self.matched_data.columns: 302 | if uf.is_continuous(col, self.X) and col not in self.exclude: 303 | # organize data 304 | trb, cob = self.test[col], self.control[col] 305 | tra = self.matched_data[self.matched_data[self.yvar]==True][col] 306 | coa = self.matched_data[self.matched_data[self.yvar]==False][col] 307 | xtb, xcb = ECDF(trb), ECDF(cob) 308 | xta, xca = ECDF(tra), ECDF(coa) 309 | 310 | # before/after stats 311 | std_diff_med_before, std_diff_mean_before = uf.std_diff(trb, cob) 312 | std_diff_med_after, std_diff_mean_after = uf.std_diff(tra, coa) 313 | pb, truthb = uf.grouped_permutation_test(uf.chi2_distance, trb, cob) 314 | pa, trutha = uf.grouped_permutation_test(uf.chi2_distance, tra, coa) 315 | ksb = round(uf.ks_boot(trb, cob, nboots=1000), 6) 316 | ksa = round(uf.ks_boot(tra, coa, nboots=1000), 6) 317 | 318 | # plotting 319 | f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(12, 5)) 320 | ax1.plot(xcb.x, xcb.y, label='Control', color=self.control_color) 321 | ax1.plot(xtb.x, xtb.y, label='Test', color=self.test_color) 322 | ax1.legend(loc="lower right") 323 | 324 | 325 | title_str = ''' 326 | ECDF for {} {} Matching 327 | KS p-value: {} 328 | Grouped Perm p-value: {} 329 | Std. Median Difference: {} 330 | Std. 
Mean Difference: {} 331 | ''' 332 | ax1.set_title(title_str.format(col, "before", ksb, pb, 333 | std_diff_med_before, std_diff_mean_before)) 334 | ax2.plot(xca.x, xca.y, label='Control', color=self.control_color) 335 | ax2.plot(xta.x, xta.y, label='Test', color=self.test_color) 336 | ax2.set_title(title_str.format(col, "after", ksa, pa, 337 | std_diff_med_after, std_diff_mean_after)) 338 | ax2.legend(loc="lower right") 339 | plt.xlim((0, np.percentile(xta.x, 99))) 340 | 341 | test_results.append({ 342 | "var": col, 343 | "ks_before": ksb, 344 | "ks_after": ksa, 345 | "grouped_chisqr_before": pb, 346 | "grouped_chisqr_after": pa, 347 | "std_median_diff_before": std_diff_med_before, 348 | "std_median_diff_after": std_diff_med_after, 349 | "std_mean_diff_before": std_diff_mean_before, 350 | "std_mean_diff_after": std_diff_mean_after 351 | }) 352 | 353 | var_order = [ 354 | "var", 355 | "ks_before", 356 | "ks_after", 357 | "grouped_chisqr_before", 358 | "grouped_chisqr_after", 359 | "std_median_diff_before", 360 | "std_median_diff_after", 361 | "std_mean_diff_before", 362 | "std_mean_diff_after" 363 | ] 364 | 365 | return pd.DataFrame(test_results)[var_order] if return_table else None 366 | 367 | def compare_categorical(self, return_table=False): 368 | """ 369 | Plots the proportional differences of each enumerated 370 | discrete column for test and control, 371 | i.e. test proportion - control proportion. 372 | Each chart title contains the results from a 373 | Chi-Square Test of Independence before and after 374 | matching. 375 | See pymatch.prop_test() 376 | 377 | Parameters 378 | ---------- 379 | return_table : bool 380 | Should the function return a table with 381 | test results? 
382 | 383 | Returns 384 | ------- 385 | pd.DataFrame() (optional) 386 | Table with the p-values of the Chi-Square contingency test 387 | for each discrete column before and after matching 388 | 389 | """ 390 | def prep_plot(data, var, colname): 391 | t, c = data[data[self.yvar] == 1], data[data[self.yvar] == 0] 392 | # dummy var for counting 393 | dummy = [i for i in t.columns if i not in \ 394 | (var, "match_id", "record_id", "weight")][0] 395 | countt = t[[var, dummy]].groupby(var).count() / len(t) 396 | countc = c[[var, dummy]].groupby(var).count() / len(c) 397 | ret = (countt - countc).dropna() 398 | ret.columns = [colname] 399 | return ret 400 | 401 | title_str = ''' 402 | Proportional Difference (test-control) for {} Before and After Matching 403 | Chi-Square Test for Independence p-value before | after: 404 | {} | {} 405 | ''' 406 | test_results = [] 407 | for col in self.matched_data.columns: 408 | if not uf.is_continuous(col, self.X) and col not in self.exclude: 409 | dbefore = prep_plot(self.data, col, colname="before") 410 | dafter = prep_plot(self.matched_data, col, colname="after") 411 | df = dbefore.join(dafter) 412 | test_results_i = self.prop_test(col) 413 | test_results.append(test_results_i) 414 | 415 | # plotting 416 | df.plot.bar(alpha=.8) 417 | plt.title(title_str.format(col, test_results_i["before"], 418 | test_results_i["after"])) 419 | lim = max(.09, abs(df).max().max()) + .01 420 | plt.ylim((-lim, lim)) 421 | return pd.DataFrame(test_results)[['var', 'before', 'after']] if return_table else None 422 | 423 | def prep_prop_test(self, data, var): 424 | """ 425 | Helper method for running chi-square contingency tests 426 | 427 | Balances the counts of discrete variables within our groups 428 | so that missing levels are replaced with 0. 429 | i.e. if the test group has no records with x as a field 430 | for a given column, make sure the count for x is 0 431 | and not missing. 
432 | 433 | Parameters 434 | ---------- 435 | data : pd.DataFrame() 436 | Data to use for counting 437 | var : str 438 | Column to use within data 439 | 440 | Returns 441 | ------- 442 | list 443 | A table (list of lists) of counts for each enumerated level 444 | for the test and control groups. 445 | """ 446 | counts = data.groupby([var, self.yvar]).count().reset_index() 447 | table = [] 448 | for t in (0, 1): 449 | os_counts = counts[counts[self.yvar] == t]\ 450 | .sort_values(var) 451 | cdict = {} 452 | for row in os_counts.iterrows(): 453 | row = row[1] 454 | cdict[row[var]] = row.iloc[2] 455 | table.append(cdict) 456 | # fill empty keys as 0 457 | all_keys = set(chain.from_iterable(table)) 458 | for d in table: 459 | d.update((k, 0) for k in all_keys if k not in d) 460 | ctable = [[i[k] for k in sorted(all_keys)] for i in table] 461 | return ctable 462 | 463 | def prop_retained(self): 464 | """ 465 | Returns the proportion of data retained after matching 466 | """ 467 | return len(self.matched_data[self.matched_data[self.yvar] == self.minority]) * 1.0 / \ 468 | len(self.data[self.data[self.yvar] == self.minority]) 469 | 470 | def tune_threshold(self, method, nmatches=1, rng=np.arange(0, .001, .0001)): 471 | """ 472 | Matches data over a grid to optimize threshold value and plots results. 473 | 474 | Parameters 475 | ---------- 476 | method : str 477 | Method used for matching (use "random" for this method) 478 | nmatches : int 479 | Max number of matches per record. 
See pymatch.match() 480 | rng : list / np.array() 481 | Grid of threshold values 482 | 483 | Returns 484 | ------- 485 | None 486 | 487 | """ 488 | results = [] 489 | for i in rng: 490 | self.match(method=method, nmatches=nmatches, threshold=i) 491 | results.append(self.prop_retained()) 492 | plt.plot(rng, results) 493 | plt.title("Proportion of Data retained for grid of threshold values") 494 | plt.ylabel("Proportion Retained") 495 | plt.xlabel("Threshold") 496 | plt.xticks(rng) 497 | 498 | def record_frequency(self): 499 | """ 500 | Calculates the frequency of specific records in 501 | the matched dataset 502 | 503 | Returns 504 | ------- 505 | pd.DataFrame() 506 | Frequency table of the number of records 507 | matched once, twice, ..., etc. 508 | """ 509 | freqs = self.matched_data.groupby("record_id")\ 510 | .count().groupby("match_id").count()\ 511 | [["scores"]].reset_index() 512 | freqs.columns = ["freq", "n_records"] 513 | return freqs 514 | 515 | def assign_weight_vector(self): 516 | record_freqs = self.matched_data.groupby("record_id")\ 517 | .count()[['match_id']].reset_index() 518 | record_freqs.columns = ["record_id", "weight"] 519 | fm = record_freqs.merge(self.matched_data, on="record_id") 520 | fm['weight'] = 1/fm['weight'] 521 | self.matched_data = fm 522 | 523 | @staticmethod 524 | def _scores_to_accuracy(m, X, y): 525 | preds = [[1.0 if i >= .5 else 0.0 for i in m.predict(X)]] 526 | return (y.to_numpy().T == preds).sum() * 1.0 / len(y) 527 | -------------------------------------------------------------------------------- /pymatch/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from statsmodels.genmod.generalized_linear_model import GLM 3 | from statsmodels.tools.sm_exceptions import PerfectSeparationError 4 | from statsmodels.distributions.empirical_distribution import ECDF 5 | from scipy import stats 6 | from collections import Counter 7 | from itertools import 
chain 8 | import sys; sys.path.append(sys.argv[0]) 9 | import pymatch.functions as uf 10 | import statsmodels.api as sm 11 | import patsy 12 | import seaborn as sns 13 | import matplotlib.pyplot as plt 14 | import pandas as pd 15 | import numpy as np 16 | -------------------------------------------------------------------------------- /pymatch/functions.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from pymatch import * 3 | import sys 4 | import numpy as np 5 | 6 | 7 | def drop_static_cols(df, yvar, cols=None): 8 | if not cols: 9 | cols = list(df.columns) 10 | # will be static for both groups 11 | cols.pop(cols.index(yvar)) 12 | for col in df[cols]: 13 | n_unique = len(np.unique(df[col])) 14 | if n_unique == 1: 15 | df.drop(col, axis=1, inplace=True) 16 | sys.stdout.write('\rStatic column dropped: {}'.format(col)) 17 | return df 18 | 19 | 20 | def ks_boot(tr, co, nboots=1000): 21 | nx = len(tr) 22 | w = np.concatenate((tr, co)) 23 | obs = len(w) 24 | cutp = nx 25 | bbcount = 0 26 | ss = [] 27 | fs_ks, _ = stats.ks_2samp(tr, co) 28 | for bb in range(nboots): 29 | sw = np.random.choice(w, obs, replace=True) 30 | x1tmp = sw[:cutp] 31 | x2tmp = sw[cutp:] 32 | s_ks, _ = stats.ks_2samp(x1tmp, x2tmp) 33 | ss.append(s_ks) 34 | if s_ks >= fs_ks: 35 | bbcount += 1 36 | ks_boot_pval = bbcount * 1.0 / nboots 37 | return ks_boot_pval 38 | 39 | 40 | def _chi2_distance(tb, cb): 41 | dist = 0 42 | for b in set(np.union1d(list(tb.keys()), list(cb.keys()))): 43 | if b not in tb: 44 | tb[b] = 0 45 | if b not in cb: 46 | cb[b] = 0 47 | xi, yi = tb[b], cb[b] 48 | dist += ((xi - yi) ** 2) * 1.0 / (xi + yi) 49 | return dist * 1.0 / 2 50 | 51 | 52 | def chi2_distance(t, c): 53 | tb, cb, bins = which_bin_hist(t, c) 54 | tb, cb = bin_hist(tb, cb, bins) 55 | return _chi2_distance(tb,cb) 56 | 57 | 58 | def which_bin_hist(t, c): 59 | comb = np.concatenate((t, c)) 60 | bins = np.arange(np.percentile(comb, 99), 
step=10) 61 | t_binned = np.digitize(t, bins) 62 | c_binned = np.digitize(c, bins) 63 | return t_binned, c_binned, bins 64 | 65 | 66 | def bin_hist(t, c, bins): 67 | tc, cc = Counter(t), Counter(c) 68 | 69 | def idx_to_value(d, bins): 70 | result = {} 71 | for k, v in d.items(): 72 | result[int(bins[k-1])] = v 73 | return result 74 | 75 | return idx_to_value(tc, bins), idx_to_value(cc, bins) 76 | 77 | 78 | def grouped_permutation_test(f, t, c, n_samples=1000): 79 | truth = f(t, c) 80 | comb = np.concatenate((t, c)) 81 | times_geq = 0 82 | samp_arr = [] 83 | for i in range(n_samples): 84 | tn = len(t) 85 | combs = comb.copy() # slicing an ndarray returns a view; copy so shuffling leaves comb intact 86 | np.random.shuffle(combs) 87 | tt = combs[:tn] 88 | cc = combs[tn:] 89 | sample_truth = f(np.array(tt), np.array(cc)) 90 | if sample_truth >= truth: 91 | times_geq += 1 92 | samp_arr.append(sample_truth) 93 | return (times_geq * 1.0) / n_samples, truth 94 | 95 | 96 | def std_diff(a, b): 97 | sd = np.std(pd.concat([a, b])) # Series.append was removed in pandas 2.0 98 | med = (np.median(a) - np.median(b)) * 1.0 / sd 99 | mean = (np.mean(a) - np.mean(b)) * 1.0 / sd 100 | return med, mean 101 | 102 | 103 | def progress(i, n, prestr=''): 104 | sys.stdout.write('\r{}: {}/{}'.format(prestr, i, n)) 105 | 106 | 107 | def is_continuous(colname, dmatrix): 108 | """ 109 | Check if the colname was treated as continuous in the patsy.dmatrix 110 | Would look like colname[T.level] otherwise 111 | """ 112 | return (colname in dmatrix.columns) or ("Q('{}')".format(colname) in dmatrix.columns) 113 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | dependencies = [ 4 | 'seaborn', 5 | 'statsmodels', 6 | 'scipy', 7 | 'patsy', 8 | 
'matplotlib', 9 | 'pandas', 10 | 'numpy' 11 | ] 12 | 13 | VERSION = "0.3.4.2" 14 | 15 | setup( 16 | name='pymatch', 17 | packages=['pymatch'], 18 | version=VERSION, 19 | description='Matching techniques for Observational Studies', 20 | author='Ben Miroglio', 21 | author_email='benmiroglio@gmail.com', 22 | url='https://github.com/benmiroglio/pymatch', 23 | download_url='https://github.com/benmiroglio/pymatch/archive/{}.tar.gz'.format(VERSION), 24 | keywords=['logistic', 'regression', 'matching', 'observational', 'study', 'causal', 'inference'], 25 | include_package_data=True, 26 | install_requires=dependencies 27 | ) 28 | --------------------------------------------------------------------------------
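`ks_boot` in functions.py estimates an empirical p-value by resampling the pooled test and control data and asking how often a random split produces a KS statistic at least as extreme as the observed one. A minimal, self-contained sketch of that idea (the function name, the seeded RNG, and the small `nboots` are illustrative choices for reproducibility and speed, not the library's API):

```python
import numpy as np
from scipy import stats

def ks_boot_sketch(tr, co, nboots=200, seed=0):
    """Empirical p-value: fraction of bootstrap splits of the pooled
    data whose KS statistic meets or exceeds the observed one."""
    rng = np.random.default_rng(seed)
    pooled = np.concatenate((tr, co))
    observed, _ = stats.ks_2samp(tr, co)
    count = 0
    for _ in range(nboots):
        # resample the pooled data with replacement, then split it
        # at the original group boundary
        resampled = rng.choice(pooled, len(pooled), replace=True)
        stat, _ = stats.ks_2samp(resampled[:len(tr)], resampled[len(tr):])
        if stat >= observed:
            count += 1
    return count / nboots

rng = np.random.default_rng(1)
# identical distributions -> large p-value expected;
# shifted distributions -> p-value near zero
same = ks_boot_sketch(rng.normal(0, 1, 200), rng.normal(0, 1, 200))
diff = ks_boot_sketch(rng.normal(0, 1, 200), rng.normal(2, 1, 200))
```

A low p-value here indicates the two groups' distributions still differ, which is why `compare_continuous` reports it before and after matching.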
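The "min" strategy in `Matcher.match` pairs each test record with the control record whose propensity score is closest, provided the gap is within `threshold`. A stripped-down sketch of that pairing logic with pandas (the function name `min_match` and the record labels `t1`, `c1`, etc. are hypothetical):

```python
import pandas as pd

def min_match(test_scores, ctrl_scores, threshold=0.001):
    """For each test score, pick the control with the smallest
    absolute score gap, but only if the gap is within threshold."""
    pairs = []
    for test_id, score in test_scores.items():
        gaps = (ctrl_scores - score).abs().sort_values()
        if gaps.iloc[0] <= threshold:
            pairs.append((test_id, gaps.index[0]))
    return pairs

test = pd.Series({'t1': 0.30, 't2': 0.90})
ctrl = pd.Series({'c1': 0.3004, 'c2': 0.52})
pairs = min_match(test, ctrl)
# pairs == [('t1', 'c1')]: 't2' has no control within the threshold
```

Records with no suitable match are simply skipped, which is why `prop_retained` and `tune_threshold` exist: a tighter threshold gives better balance but keeps less data.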
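`prep_prop_test` aligns the levels of a discrete variable across the test and control groups, filling absent levels with 0 so that `stats.chi2_contingency` receives a rectangular table. The fill step in isolation, using the same `chain.from_iterable` pattern as the method (the counts here are made up for illustration):

```python
from itertools import chain
from scipy import stats

test_counts = {'A': 10, 'B': 5}        # test group never saw level 'C'
ctrl_counts = {'A': 8, 'C': 7}         # control group never saw level 'B'
table = [dict(test_counts), dict(ctrl_counts)]

# union of all observed levels, missing ones filled with 0
all_keys = set(chain.from_iterable(table))
for d in table:
    d.update((k, 0) for k in all_keys if k not in d)
ctable = [[d[k] for k in sorted(all_keys)] for d in table]
# ctable == [[10, 5, 0], [8, 0, 7]]

_, pval, _, _ = stats.chi2_contingency(ctable)
```

Without the fill, the two rows would have different lengths whenever a level appears in only one group, and the contingency test would fail.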