├── docs ├── README.md ├── authors.md ├── license.md ├── CHANGELOG.md ├── history.md ├── contributing.md ├── .gitignore ├── usage.md ├── Makefile ├── api.rst ├── make.bat ├── quickstart.md ├── installation.md ├── index.rst └── conf.py ├── .envrc ├── CHANGELOG.md ├── requirements_dev.txt ├── AUTHORS.md ├── PULL_REQUEST_TEMPLATE.md ├── requirements.txt ├── medicare_utils ├── __init__.py ├── metadata │ ├── xw │ │ ├── to_json.py │ │ └── snfr.json │ └── codebook │ │ ├── bsfab.json │ │ └── med.json ├── codebook.py ├── utils.py └── parquet.py ├── tox.ini ├── readthedocs.yml ├── MANIFEST.in ├── .editorconfig ├── .github ├── ISSUE_TEMPLATE.md └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── tests ├── test_codebook.py ├── test_medicare_df_with_data.py └── test_medicare_df.py ├── environment.yml ├── LICENSE ├── .gitignore ├── setup.py ├── README.md ├── Makefile ├── CONTRIBUTING.md └── setup.cfg /docs/README.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /docs/authors.md: -------------------------------------------------------------------------------- 1 | ../AUTHORS.md -------------------------------------------------------------------------------- /docs/license.md: -------------------------------------------------------------------------------- 1 | ../LICENSE -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ../CHANGELOG.md -------------------------------------------------------------------------------- /docs/history.md: -------------------------------------------------------------------------------- 1 | ../CHANGELOG.md -------------------------------------------------------------------------------- /docs/contributing.md: 
-------------------------------------------------------------------------------- 1 | ../CONTRIBUTING.md -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | source activate medicare_utils 2 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | /medicare_utils.rst 2 | /medicare_utils.*.rst 3 | /modules.rst 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.0.1 (2018-03-01) 4 | 5 | - First release on PyPI. 6 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | To use `medicare_utils` in a project: 4 | 5 | ```py 6 | import medicare_utils as med 7 | ``` 8 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | Sphinx==1.7.0 2 | bumpversion==0.5.3 3 | coverage==4.5.1 4 | flake8==3.5.0 5 | pip==21.1 6 | tox==2.9.1 7 | wheel==0.30.0 8 | yapf==0.20.2 9 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Credits 2 | 3 | ## Development Lead 4 | 5 | - Kyle Barron 6 | 7 | ## Contributors 8 | 9 | None yet. Why not be the first? 
# -*- coding: utf-8 -*-
"""Top-level package for medicare_utils.

Re-exports the public API so callers can write, e.g.,
``import medicare_utils as med`` and use ``med.MedicareDF``,
``med.codebook``, ``med.fpath``, etc.
"""

__author__ = """Kyle Barron"""
__email__ = 'barronk@mit.edu'
# NOTE(review): __version__ is '0.1.0' here (and in setup.py) but
# CHANGELOG.md lists 0.0.1 as the only release — confirm which is current.
__version__ = '0.1.0'

# NOTE(review): the repository tree shows codebook.py, utils.py, and
# parquet.py under medicare_utils/, but no codes.py or medicare_df.py —
# confirm those modules exist, otherwise the next two imports fail at
# package import time.
from .codes import icd9, hcpcs, npi
from .utils import fpath, pq_vars
from .medicare_df import MedicareDF
from .codebook import codebook
from . import parquet
end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * medicare_utils version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 15 | ``` 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | #### Code Sample, a copy-pastable example if possible 8 | 9 | ```python 10 | # Your code here 11 | 12 | ``` 13 | #### Problem description 14 | 15 | [this should explain **why** the current behaviour is a problem and why the expected output is a better solution.] 16 | 17 | **Note**: Many problems can be resolved by simply upgrading `medicare_utils` to the latest version. Before submitting, please check if that solution works for you. 18 | 19 | #### Expected Output 20 | 21 | #### Package Version 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | **Is your feature request related to a problem? Please describe.** 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 
9 | 10 | **Describe the solution you'd like** 11 | A clear and concise description of what you want to happen. 12 | 13 | **Describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 18 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = medicare_utils 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
class TestCodebook(object):
    """Sanity checks on the dictionaries returned by ``codebook()``."""

    @pytest.fixture(params=['bsfab', 'med', 'opc'])
    def d(self, request):
        # One codebook dict per supported data type.
        return codebook(request.param)

    def test_unique_varnames(self, d):
        # Dict keys are unique by construction; this guards against a
        # regression if codebook() ever returns a different container.
        varnames = list(d)
        assert len(varnames) == len(set(varnames))

    def test_dict_keys(self, d):
        # Every variable entry must contain exactly the keys
        # 'name' and 'values'.
        for entry in d.values():
            keys = list(entry)
            assert len(keys) == 2
            assert 'name' in keys
            assert 'values' in keys
medicare\_utils.codebook 5 | ------------------------ 6 | 7 | .. automodule:: medicare_utils.codebook 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | medicare\_utils.codes 13 | --------------------- 14 | 15 | .. automodule:: medicare_utils.codes 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | medicare\_utils.medicare\_df 21 | ---------------------------- 22 | 23 | .. automodule:: medicare_utils.medicare_df 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | medicare\_utils.parquet 29 | ----------------------- 30 | 31 | .. automodule:: medicare_utils.parquet 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | medicare\_utils.utils 37 | --------------------- 38 | 39 | .. automodule:: medicare_utils.utils 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=medicare_utils 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018, Kyle Barron 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # Quick Start guide 2 | 3 | ## Importing the package 4 | 5 | First, make sure you've [installed](installation.html) `medicare_utils`. 
6 | Then to use the package, you need to import it: 7 | 8 | ```py 9 | import medicare_utils as med 10 | ``` 11 | 12 | The `as med` means that you can refer to the package as `med` instead of `medicare_utils` from here on. 13 | 14 | ## Data extracts 15 | 16 | Data extracts are started with the `med.MedicareDF` function. For example, I can begin an extract using 1% sample data and for the years 2010-2012 with: 17 | ```py 18 | mdf = med.MedicareDF(percent=1, years=range(2010, 2013)) 19 | ``` 20 | 21 | Note that the `range` function includes integers up to but not including the second argument. 22 | 23 | Then I can get a cohort of white women aged 66-75 24 | 25 | 26 | In recent years, Python has become the `fastest growing major programming language `_, largely due to its widespread use among data scientists. This popularity has fostered packages that work with data, such as `Pandas `_, the standard for in-memory data analysis. A newer package, `Dask `_, has been developed to parallelize Pandas operations and work with data larger than memory. 
def main():
    """Internal code to convert Jean's crosswalks to JSON files.

    For each data type, reads ``harm{data_type}.dta`` from the current
    directory, keeps rows with ``year >= 1999`` and a non-empty canonical
    name (``cname``), and writes ``{data_type}.json`` shaped as::

        {cname: {'desc': varlab,
                 str(year): {'name': ..., 'type': ..., 'format': ...}}}
    """
    data_types = [
        'carc', 'carl', 'dmec', 'dmel', 'hhac', 'hhar', 'hosc',
        'hosr', 'ipc', 'ipr', 'med', 'opc', 'opr', 'snfc', 'snfr']
    for data_type in data_types:
        df = pd.read_stata(f'harm{data_type}.dta')
        df = df.sort_values(['cname', 'year'])
        df = df[df['year'] >= 1999]
        df = df[df['cname'] != '']

        xw = {}
        # itertuples is far faster than positional .iloc[i] in a Python loop.
        for row in df.itertuples(index=False):
            year = str(row.year)

            entry = xw.setdefault(row.cname, {})
            # Preserves the original behavior: the LAST row's label wins
            # for 'desc' ...
            entry['desc'] = row.varlab
            # ... while the FIRST row seen for a given year wins for
            # name/type/format.
            year_entry = entry.setdefault(year, {})
            year_entry.setdefault('name', row.name)
            year_entry.setdefault('type', row.type)
            year_entry.setdefault('format', row.format)

        with open(f'{data_type}.json', 'w') as f:
            json.dump(xw, f, sort_keys=True, indent=4)


if __name__ == '__main__':
    main()
.installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # dotenv 89 | .env 90 | 91 | # virtualenv 92 | .venv 93 | venv/ 94 | ENV/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | **This package only supports Python 3.6 or higher.** You can find out the version of Python installed by running `python --version` in your terminal. The first two numbers must be 3.6 or 3.7. 
4 | 5 | ``` 6 | $ python --version 7 | Python 3.6.4 :: Anaconda custom (64-bit) 8 | ``` 9 | 10 | ## Stable release 11 | 12 | To install medicare_utils, run this command in your terminal: 13 | 14 | ``` 15 | $ pip install medicare_utils --upgrade 16 | ``` 17 | 18 | This is the preferred method to install medicare_utils, as it will always install the most recent stable release. 19 | 20 | If you don't have [`pip`](https://pip.pypa.io) installed, I recommend installing the [Anaconda distribution](https://www.anaconda.com/download), which will install a wide variety of helpful data science packages. 21 | Otherwise, this [Python installation guide](http://docs.python-guide.org/en/latest/starting/installation/) can guide you through the process of installing `pip` manually. 22 | 23 | ## Development version 24 | 25 | If you want the newest version available, you can install direct from the Github repository with: 26 | ``` 27 | $ pip install git+https://github.com/kylebarron/medicare_utils --upgrade 28 | ``` 29 | 30 | ## From sources 31 | 32 | The sources for medicare_utils can be downloaded from the [Github repo](https://github.com/kylebarron/medicare_utils). 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""The setup script."""

from setuptools import setup, find_packages

with open('README.md') as readme_file:
    readme = readme_file.read()

with open('CHANGELOG.md') as history_file:
    history = history_file.read()


def _read_requirements(path):
    """Return the non-empty, whitespace-stripped requirement lines of *path*.

    The previous implementation dropped the last character of every line
    (``x[:-1]``), which corrupts the final requirement when the file has no
    trailing newline and leaves blank entries in the list.
    """
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


requirements = _read_requirements('requirements.txt')
test_requirements = _read_requirements('requirements_dev.txt')

setup_requirements = [
    'setuptools >= 38.6.0',
    'twine >= 1.11.0',
]

setup(
    author="Kyle Barron",
    author_email='barronk@mit.edu',
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: English',
        'Programming Language :: Python :: 3',
        # NOTE(review): the docs state Python >= 3.6 is required and the
        # package uses f-strings, so the 3.4/3.5 classifiers look stale —
        # confirm before a release.
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ],
    description="Scripts to assist working with Medicare data.",
    install_requires=requirements,
    license="MIT license",
    long_description=readme + '\n\n' + history,
    long_description_content_type='text/markdown',
    include_package_data=True,
    keywords='medicare_utils',
    name='medicare_utils',
    packages=find_packages(include=['medicare_utils']),
    setup_requires=setup_requirements,
    test_suite='tests',
    tests_require=test_requirements,
    url='https://github.com/kylebarron/medicare_utils',
    version='0.1.0',
    zip_safe=False,
)
This is easiest with: 28 | ``` 29 | conda install -c conda-forge python-snappy 30 | ``` 31 | 32 | Otherwise you need to first install the Snappy C library and then run 33 | ``` 34 | pip install python-snappy 35 | ``` 36 | 37 | ## Usage 38 | 39 | The class is initialized with 40 | ```py 41 | import medicare_utils as med 42 | mdf = med.MedicareDF('05', range(2010, 2013)) 43 | mdf.get_cohort(gender='female', ages=range(65, 75)) 44 | mdf.search_for_codes(2010, 'med', icd9_diag='41071') 45 | ``` 46 | 47 | It has attributes that refer to different levels of the data, when applicable: 48 | 49 | - `mdf.pl`: patient-level data. Here the index of the data is `bene_id` for data post-2005, or `ehic` for data pre-2005. 50 | - `mdf.cl`: claim-level data. 51 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build docs help 2 | .DEFAULT_GOAL := help 3 | 4 | define BROWSER_PYSCRIPT 5 | import os, webbrowser, sys 6 | 7 | try: 8 | from urllib import pathname2url 9 | except: 10 | from urllib.request import pathname2url 11 | 12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 13 | endef 14 | export BROWSER_PYSCRIPT 15 | 16 | define PRINT_HELP_PYSCRIPT 17 | import re, sys 18 | 19 | for line in sys.stdin: 20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 21 | if match: 22 | target, help = match.groups() 23 | print("%-20s %s" % (target, help)) 24 | endef 25 | export PRINT_HELP_PYSCRIPT 26 | 27 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 28 | 29 | help: 30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 31 | 32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 33 | 34 | clean-build: ## remove build artifacts 35 | rm -fr build/ 36 | rm -fr dist/ 37 | rm -fr .eggs/ 38 | find . -name '*.egg-info' -exec rm -fr {} + 39 | find . 
-name '*.egg' -exec rm -f {} + 40 | 41 | clean-pyc: ## remove Python file artifacts 42 | find . -name '*.pyc' -exec rm -f {} + 43 | find . -name '*.pyo' -exec rm -f {} + 44 | find . -name '*~' -exec rm -f {} + 45 | find . -name '__pycache__' -exec rm -fr {} + 46 | 47 | clean-test: ## remove test and coverage artifacts 48 | rm -fr .tox/ 49 | rm -f .coverage 50 | rm -fr htmlcov/ 51 | 52 | lint: ## check style with flake8 53 | flake8 medicare_utils tests 54 | 55 | test: ## run tests quickly with the default Python 56 | python setup.py test 57 | 58 | test-all: ## run tests on every Python version with tox 59 | tox 60 | 61 | coverage: ## check code coverage quickly with the default Python 62 | coverage run --source medicare_utils setup.py test 63 | coverage report -m 64 | coverage html 65 | $(BROWSER) htmlcov/index.html 66 | 67 | docs: ## generate Sphinx HTML documentation, including API docs 68 | rm -f docs/medicare_utils.rst 69 | rm -f docs/modules.rst 70 | sphinx-apidoc -o docs/ medicare_utils 71 | $(MAKE) -C docs clean 72 | $(MAKE) -C docs html 73 | $(BROWSER) docs/_build/html/index.html 74 | 75 | servedocs: docs ## compile the docs watching for changes 76 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 
def codebook(data_type: str) -> dict:
    """Load the variable codebook for a given Medicare file type.

    Args:
        data_type: type of file to get the codebook for. One of ``bsfab``
            (Beneficiary Summary File, Base segment), ``med`` (MedPAR File),
            or ``opc`` (Outpatient File, Claims segment).

    Returns:
        ``dict`` keyed by variable name. Each value is another ``dict`` with
        two keys: ``name``, the descriptive name of the variable, and
        ``values``, itself a ``dict`` mapping variable values to their value
        labels.

    Raises:
        NotImplementedError: if no codebook has been added for ``data_type``.

    Examples:
        To get the labels of the values of ``clm_type`` in the ``med`` file:

        .. code-block:: python

            >>> import medicare_utils as med
            >>> cbk = med.codebook('med')['clm_type']['values']
            >>> cbk['40']
            'Outpatient claim'
    """
    json_path = pkg.resource_filename(
        'medicare_utils', f'metadata/codebook/{data_type}.json')

    try:
        f = open(json_path)
    except FileNotFoundError:
        raise NotImplementedError(f'Haven\'t added {data_type} codebook yet')

    with f:
        return json.load(f)
Python is an easier and faster alternative. 22 | 23 | 24 | Creating a data extract can be done in three lines of code: 25 | 26 | .. code-block:: python 27 | 28 | import re 29 | import medicare_utils as med 30 | mdf = med.MedicareDF( 31 | percent='100', 32 | years=range(2008, 2014)) 33 | mdf.get_cohort( 34 | gender='male', 35 | ages=range(65, 75), 36 | buyin_val=['3', 'C'], 37 | join='outer', 38 | keep_vars=['bene_dob']) 39 | mdf.search_for_codes( 40 | data_types=['med', 'opc'], 41 | icd9_dx=re.compile(r'^410'), 42 | icd9_dx_max_cols=1, 43 | collapse_codes=True, 44 | keep_vars={'med': ['medparid', 'admsndt', 'dschrgdt']}, 45 | rename={'icd9_dx': 'ami'}) 46 | 47 | The resulting data extract consists of a patient-level file of patients who are: 48 | 49 | - Male 50 | - Aged 65-74 (inclusive) in any year from 2008 to 2013 51 | - Continuously enrolled in fee-for-service Medicare in any year from 2008 to 2013 (i.e. :code:`buyin_val` either :code:`3` or :code:`C`) 52 | 53 | and a claim-level file of patients who were included in the above cohort and furthermore had a primary diagnosis code of AMI in either the `MedPAR `_ or `Outpatient claims `_ files. The patient-level file is accessed with :code:`mdf.pl` and the claim-level file is accessed with :code:`mdf.cl`. 54 | 55 | This package also provides: 56 | 57 | - Classes to work with NPI, ICD-9, and HCPCS codes. These commands will automatically download these data files for you. [#copyright]_ 58 | - Codebooks for values of categorical variables. 59 | - A simple interface to convert data files from Stata format to the modern Parquet format. 60 | 61 | This documentation aims to walk through everything needed to run these routines. Then you can keep working with these extracts in Python or easily export them to Stata's :code:`.dta` format. Head to the :doc:`Quick Start guide ` to get started. 62 | 63 | Caveats 64 | ------- 65 | 66 | This package contains no Medicare data or private information. 
It assumes you already have access to Medicare data. 67 | 68 | This package was originally developed for use on the National Bureau of Economic 69 | Research's servers, but portions of the package may be useful for third parties 70 | as well. 71 | 72 | 73 | Indices and tables 74 | ------------------ 75 | 76 | * :ref:`genindex` 77 | * :ref:`modindex` 78 | * :ref:`search` 79 | 80 | .. rubric:: Footnotes 81 | 82 | .. [#copyright] Datasets with HCPCS codes and short descriptions from 2003 to the present are freely available on the CMS website in their `Relative Value Files `_. These CMS files are released under the `End User Point and Click Agreement `_. In order to not run afoul of this license agreement, this package does not distribute HCPCS codes. Rather, it provides code for the user to download and work with them. By using the HCPCS functions in this package, you agree to the above Agreement. 83 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! Every 4 | little bit helps, and credit will always be given. 5 | 6 | You can contribute in many ways: 7 | 8 | ## Types of Contributions 9 | 10 | ### Report Bugs 11 | 12 | Report bugs at . 13 | 14 | If you are reporting a bug, please include: 15 | 16 | - Your operating system name and version. 17 | - Any details about your local setup that might be helpful in 18 | troubleshooting. 19 | - Detailed steps to reproduce the bug. 20 | 21 | ### Fix Bugs or Implement Features 22 | 23 | Look through the GitHub issues for bugs. Most issues are open to whoever wants to implement it, but comment on it so that I know you're working on it. 
24 | 25 | ### Write Documentation 26 | 27 | medicare_utils could always use more documentation, whether as part of 28 | the official medicare_utils docs, in docstrings, or even on the web in 29 | blog posts, articles, and such. 30 | 31 | ### Submit Feedback 32 | 33 | The best way to send feedback is to file an issue at 34 | . 35 | 36 | If you are proposing a feature: 37 | 38 | - Explain in detail how it would work. 39 | - Keep the scope as narrow as possible, to make it easier to 40 | implement. 41 | - Remember that this is a volunteer-driven project, and that 42 | contributions are welcome :) 43 | 44 | ## Get Started! 45 | 46 | Ready to contribute? Here's how to set up medicare_utils for local 47 | development. 48 | 49 | 1. Fork the medicare_utils repo on GitHub. 50 | 2. Clone your fork locally: 51 | 52 | ``` 53 | $ git clone git@github.com:your_github_user/medicare_utils.git 54 | ``` 55 | 56 | 3. Install your local copy into a Conda environment. If you don't have Conda installed, install [Anaconda](https://www.anaconda.com/download/) or [Miniconda](https://conda.io/miniconda.html) first. Then set up your fork for local development with: 57 | 58 | ``` 59 | $ cd medicare_utils/ 60 | $ conda create env -f environment.yml 61 | $ source activate medicare_utils 62 | $ python setup.py develop 63 | ``` 64 | 65 | 4. Create a branch for local development: 66 | 67 | ``` 68 | $ git checkout -b name-of-your-bugfix-or-feature 69 | ``` 70 | 71 | Now you can make your changes locally. 72 | 73 | 5. When you're done making changes, check that your changes pass 74 | flake8 and the tests, including testing other Python versions with 75 | tox: 76 | 77 | ``` 78 | $ flake8 medicare_utils tests 79 | $ python setup.py test or py.test 80 | $ tox 81 | ``` 82 | 83 | 6. Commit your changes and push your branch to GitHub: 84 | 85 | ``` 86 | $ git add . 87 | $ git commit -m "Your detailed description of your changes." 88 | $ git push origin name-of-your-bugfix-or-feature 89 | ``` 90 | 91 | 7. 
Submit a pull request through the GitHub website. 92 | 93 | ## Pull Request Guidelines 94 | 95 | Before you submit a pull request, check that it meets these guidelines: 96 | 97 | 1. The pull request should include tests. 98 | 2. If the pull request adds functionality, the docs should be updated. 99 | Put your new functionality into a function with a docstring, and add 100 | the feature to the list in README.rst. 101 | 3. The pull request should work for Python 3.6 and 3.7. Check 102 | and 103 | make sure that the tests pass for all supported Python versions. 104 | 105 | ## Tips 106 | 107 | To run a subset of tests: 108 | 109 | $ python -m unittest tests.test_medicare_utils 110 | 111 | ## Deploying 112 | 113 | A reminder for the maintainers on how to deploy. Make sure all your 114 | changes are committed (including an entry in `CHANGELOG.md`). Then run: 115 | 116 | ```bash 117 | $ bumpversion patch # possible: major / minor / patch 118 | $ git push 119 | $ git push --tags 120 | ``` 121 | 122 | Travis will then deploy to PyPI if tests pass. 123 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # medicare_utils documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another 17 | # directory, add these directories to sys.path here. 
If the directory is 18 | # relative to the documentation root, use os.path.abspath to make it 19 | # absolute, like shown here. 20 | # 21 | import os 22 | import sys 23 | sys.path.insert(0, os.path.abspath('..')) 24 | 25 | import sphinx_bootstrap_theme 26 | import medicare_utils 27 | 28 | # -- General configuration --------------------------------------------- 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.coverage', 39 | 'sphinx.ext.doctest', 40 | 'sphinx.ext.githubpages', 41 | 'sphinx.ext.mathjax', 42 | 'sphinx.ext.napoleon', 43 | 'sphinx.ext.viewcode', 44 | # Must be loaded after napoleon 45 | 'sphinx_autodoc_typehints' 46 | ] 47 | 48 | napoleon_google_docstring = True 49 | # Must be True to work with sphinx_autodoc_typehints 50 | napoleon_use_param = True 51 | napoleon_use_ivar = True 52 | 53 | # Add any paths that contain templates here, relative to this directory. 54 | templates_path = ['_templates'] 55 | 56 | # The suffix(es) of source filenames. 57 | # You can specify multiple suffix as a list of string: 58 | # 59 | source_suffix = ['.rst', '.md'] 60 | 61 | # The master toctree document. 62 | master_doc = 'index' 63 | 64 | # General information about the project. 65 | project = u'medicare_utils' 66 | copyright = u"2018, Kyle Barron" 67 | author = u"Kyle Barron" 68 | 69 | # The version info for the project you're documenting, acts as replacement 70 | # for |version| and |release|, also used in various other places throughout 71 | # the built documents. 72 | # 73 | # The short X.Y version. 74 | version = medicare_utils.__version__ 75 | # The full version, including alpha/beta/rc tags. 
76 | release = medicare_utils.__version__ 77 | 78 | # The language for content autogenerated by Sphinx. Refer to documentation 79 | # for a list of supported languages. 80 | # 81 | # This is also used if you do content translation via gettext catalogs. 82 | # Usually you set "language" from the command line for these cases. 83 | language = None 84 | 85 | # List of patterns, relative to source directory, that match files and 86 | # directories to ignore when looking for source files. 87 | # This patterns also effect to html_static_path and html_extra_path 88 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 89 | 90 | # The name of the Pygments (syntax highlighting) style to use. 91 | pygments_style = 'sphinx' 92 | 93 | # If true, `todo` and `todoList` produce output, else they produce nothing. 94 | todo_include_todos = False 95 | 96 | 97 | # -- Options for HTML output ------------------------------------------- 98 | 99 | # The theme to use for HTML and HTML Help pages. See the documentation for 100 | # a list of builtin themes. 101 | # 102 | html_theme = 'bootstrap' 103 | html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() 104 | 105 | # Theme options are theme-specific and customize the look and feel of a 106 | # theme further. For a list of options available for each theme, see the 107 | # documentation. 108 | # 109 | # html_theme_options = {} 110 | 111 | # Add any paths that contain custom static files (such as style sheets) here, 112 | # relative to this directory. They are copied after the builtin static files, 113 | # so a file named "default.css" will overwrite the builtin "default.css". 114 | html_static_path = ['_static'] 115 | 116 | 117 | # -- Options for HTMLHelp output --------------------------------------- 118 | 119 | # Output file base name for HTML help builder. 
120 | htmlhelp_basename = 'medicare_utilsdoc' 121 | 122 | 123 | # -- Options for LaTeX output ------------------------------------------ 124 | 125 | latex_elements = { 126 | # The paper size ('letterpaper' or 'a4paper'). 127 | # 128 | # 'papersize': 'letterpaper', 129 | 130 | # The font size ('10pt', '11pt' or '12pt'). 131 | # 132 | # 'pointsize': '10pt', 133 | 134 | # Additional stuff for the LaTeX preamble. 135 | # 136 | # 'preamble': '', 137 | 138 | # Latex figure (float) alignment 139 | # 140 | # 'figure_align': 'htbp', 141 | } 142 | 143 | # Grouping the document tree into LaTeX files. List of tuples 144 | # (source start file, target name, title, author, documentclass 145 | # [howto, manual, or own class]). 146 | latex_documents = [ 147 | (master_doc, 'medicare_utils.tex', 148 | u'medicare_utils Documentation', 149 | u'Kyle Barron', 'manual'), 150 | ] 151 | 152 | 153 | # -- Options for manual page output ------------------------------------ 154 | 155 | # One entry per manual page. List of tuples 156 | # (source start file, name, description, authors, manual section). 157 | man_pages = [ 158 | (master_doc, 'medicare_utils', 159 | u'medicare_utils Documentation', 160 | [author], 1) 161 | ] 162 | 163 | 164 | # -- Options for Texinfo output ---------------------------------------- 165 | 166 | # Grouping the document tree into Texinfo files. 
List of tuples 167 | # (source start file, target name, title, author, 168 | # dir menu entry, description, category) 169 | texinfo_documents = [ 170 | (master_doc, 'medicare_utils', 171 | u'medicare_utils Documentation', 172 | author, 173 | 'medicare_utils', 174 | 'One line description of project.', 175 | 'Miscellaneous'), 176 | ] 177 | 178 | source_parsers = { 179 | '.md': 'recommonmark.parser.CommonMarkParser', 180 | } 181 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.0.1 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:medicare_utils/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [aliases] 21 | # Define setup.py command aliases here 22 | 23 | [pycodestyle] 24 | max-line-length = 80 25 | 26 | [yapf] 27 | # Align closing bracket with visual indentation. 28 | align_closing_bracket_with_visual_indent=False 29 | 30 | # Allow dictionary keys to exist on multiple lines. For example: 31 | # 32 | # x = { 33 | # ('this is the first element of a tuple', 34 | # 'this is the second element of a tuple'): 35 | # value, 36 | # } 37 | allow_multiline_dictionary_keys=False 38 | 39 | # Allow lambdas to be formatted on more than one line. 40 | allow_multiline_lambdas=False 41 | 42 | # Allow splits before the dictionary value. 43 | allow_split_before_dict_value=True 44 | 45 | # Insert a blank line before a class-level docstring. 46 | blank_line_before_class_docstring=False 47 | 48 | # Insert a blank line before a 'def' or 'class' immediately nested 49 | # within another 'def' or 'class'. 
For example: 50 | # 51 | # class Foo: 52 | # # <------ this blank line 53 | # def method(): 54 | # ... 55 | blank_line_before_nested_class_or_def=False 56 | 57 | # Do not split consecutive brackets. Only relevant when 58 | # dedent_closing_brackets is set. For example: 59 | # 60 | # call_func_that_takes_a_dict( 61 | # { 62 | # 'key1': 'value1', 63 | # 'key2': 'value2', 64 | # } 65 | # ) 66 | # 67 | # would reformat to: 68 | # 69 | # call_func_that_takes_a_dict({ 70 | # 'key1': 'value1', 71 | # 'key2': 'value2', 72 | # }) 73 | coalesce_brackets=True 74 | 75 | # The column limit. 76 | column_limit=80 77 | 78 | # Indent width used for line continuations. 79 | continuation_indent_width=4 80 | 81 | # Put closing brackets on a separate line, dedented, if the bracketed 82 | # expression can't fit in a single line. Applies to all kinds of brackets, 83 | # including function definitions and calls. For example: 84 | # 85 | # config = { 86 | # 'key1': 'value1', 87 | # 'key2': 'value2', 88 | # } # <--- this bracket is dedented and on a separate line 89 | # 90 | # time_series = self.remote_client.query_entity_counters( 91 | # entity='dev3246.region1', 92 | # key='dns.query_latency_tcp', 93 | # transform=Transformation.AVERAGE(window=timedelta(seconds=60)), 94 | # start_ts=now()-timedelta(days=3), 95 | # end_ts=now(), 96 | # ) # <--- this bracket is dedented and on a separate line 97 | dedent_closing_brackets=False 98 | 99 | # Place each dictionary entry onto its own line. 100 | each_dict_entry_on_separate_line=True 101 | 102 | # The regex for an i18n comment. The presence of this comment stops 103 | # reformatting of that line, because the comments are required to be 104 | # next to the string they translate. 105 | i18n_comment= 106 | 107 | # The i18n function call names. The presence of this function stops 108 | # reformattting on that line, because the string it has cannot be moved 109 | # away from the i18n comment. 
110 | i18n_function_call= 111 | 112 | # Indent the dictionary value if it cannot fit on the same line as the 113 | # dictionary key. For example: 114 | # 115 | # config = { 116 | # 'key1': 117 | # 'value1', 118 | # 'key2': value1 + 119 | # value2, 120 | # } 121 | indent_dictionary_value=True 122 | 123 | # The number of columns to use for indentation. 124 | indent_width=4 125 | 126 | # Join short lines into one line. E.g., single line 'if' statements. 127 | join_multiple_lines=True 128 | 129 | # Do not include spaces around selected binary operators. For example: 130 | # 131 | # 1 + 2 * 3 - 4 / 5 132 | # 133 | # will be formatted as follows when configured with a value "*,/": 134 | # 135 | # 1 + 2*3 - 4/5 136 | # 137 | no_spaces_around_selected_binary_operators=set() 138 | 139 | # Use spaces around default or named assigns. 140 | spaces_around_default_or_named_assign=False 141 | 142 | # Use spaces around the power operator. 143 | spaces_around_power_operator=True 144 | 145 | # The number of spaces required before a trailing comment. 146 | spaces_before_comment=2 147 | 148 | # Insert a space between the ending comma and closing bracket of a list, 149 | # etc. 150 | space_between_ending_comma_and_closing_bracket=True 151 | 152 | # Split before arguments if the argument list is terminated by a 153 | # comma. 154 | split_arguments_when_comma_terminated=False 155 | 156 | # Set to True to prefer splitting before '&', '|' or '^' rather than 157 | # after. 158 | split_before_bitwise_operator=True 159 | 160 | split_before_closing_bracket=False 161 | 162 | # Split before a dictionary or set generator (comp_for). For example, note 163 | # the split before the 'for': 164 | # 165 | # foo = { 166 | # variable: 'Hello world, have a nice day!' 167 | # for variable in bar if variable != 42 168 | # } 169 | split_before_dict_set_generator=True 170 | 171 | # Split after the opening paren which surrounds an expression if it doesn't 172 | # fit on a single line. 
173 | split_before_expression_after_opening_paren=False 174 | 175 | # If an argument / parameter list is going to be split, then split before 176 | # the first argument. 177 | split_before_first_argument=True 178 | 179 | # Set to True to prefer splitting before 'and' or 'or' rather than 180 | # after. 181 | split_before_logical_operator=True 182 | 183 | # Split named assignments onto individual lines. 184 | split_before_named_assigns=False 185 | 186 | # Set to True to split list comprehensions and generators that have 187 | # non-trivial expressions and multiple clauses before each of these 188 | # clauses. For example: 189 | # 190 | # result = [ 191 | # a_long_var + 100 for a_long_var in xrange(1000) 192 | # if a_long_var % 10] 193 | # 194 | # would reformat to something like: 195 | # 196 | # result = [ 197 | # a_long_var + 100 198 | # for a_long_var in xrange(1000) 199 | # if a_long_var % 10] 200 | split_complex_comprehension=True 201 | 202 | # The penalty for splitting right after the opening bracket. 203 | split_penalty_after_opening_bracket=30 204 | 205 | # The penalty for splitting the line after a unary operator. 206 | split_penalty_after_unary_operator=10000 207 | 208 | # The penalty for splitting right before an if expression. 209 | split_penalty_before_if_expr=30 210 | 211 | # The penalty of splitting the line around the '&', '|', and '^' 212 | # operators. 213 | split_penalty_bitwise_operator=300 214 | 215 | # The penalty for splitting a list comprehension or generator 216 | # expression. 217 | split_penalty_comprehension=80 218 | 219 | # The penalty for characters over the column limit. 220 | split_penalty_excess_character=4500 221 | 222 | # The penalty incurred by adding a line split to the unwrapped line. The 223 | # more line splits added the higher the penalty. 224 | split_penalty_for_added_line_split=30 225 | 226 | # The penalty of splitting a list of "import as" names. 
For example: 227 | # 228 | # from a_very_long_or_indented_module_name_yada_yad import (long_argument_1, 229 | # long_argument_2, 230 | # long_argument_3) 231 | # 232 | # would reformat to something like: 233 | # 234 | # from a_very_long_or_indented_module_name_yada_yad import ( 235 | # long_argument_1, long_argument_2, long_argument_3) 236 | split_penalty_import_names=0 237 | 238 | # The penalty of splitting the line around the 'and' and 'or' 239 | # operators. 240 | split_penalty_logical_operator=300 241 | 242 | # Use the Tab character for indentation. 243 | use_tabs=False 244 | 245 | -------------------------------------------------------------------------------- /medicare_utils/utils.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | from pathlib import Path 3 | from textwrap import dedent, fill 4 | from typing import Union 5 | 6 | allowed_pcts = ['0001', '01', '05', '20', '100'] 7 | pct_dict = {0.01: '0001', 1: '01', 5: '05', 20: '20', 100: '100'} 8 | 9 | 10 | def pq_vars(ParquetFile): 11 | return ParquetFile.schema.names 12 | 13 | 14 | def _mywrap(text: str) -> str: 15 | text = dedent(text) 16 | lines = text.split('\n') 17 | lines = [ 18 | fill(x, replace_whitespace=False, subsequent_indent=' ') 19 | for x in lines] 20 | text = '\n'.join(lines) 21 | return text 22 | 23 | 24 | def fpath(percent, year, data_type, root_path, extension, new_style): 25 | """Generate path to Medicare files 26 | 27 | Args: 28 | percent: 29 | percent sample of data. Can be {'0001', '01', '05', '20', '100'} 30 | year: year of data. 
31 | data_type: 32 | desired type of file 33 | 34 | - ``bsfab`` (`Beneficiary Summary File, Base segment`_) 35 | - ``bsfcc`` (`Beneficiary Summary File, Chronic Conditions segment`_) 36 | - ``bsfcu`` (`Beneficiary Summary File, Cost & Use segment`_) 37 | - ``bsfd`` (`Beneficiary Summary File, National Death Index segment`_) 38 | - ``carc`` (`Carrier File, Claims segment`_) 39 | - ``carl`` (`Carrier File, Line segment`_) 40 | - ``den`` (Denominator File) 41 | - ``dmec`` (`Durable Medical Equipment File, Claims segment`_) 42 | - ``dmel`` (`Durable Medical Equipment File, Line segment`_) 43 | - ``hhac`` (`Home Health Agency File, Claims segment`_) 44 | - ``hhar`` (`Home Health Agency File, Revenue Center segment`_) 45 | - ``hosc`` (`Hospice File, Claims segment`_) 46 | - ``hosr`` (`Hospice File, Revenue Center segment`_) 47 | - ``ipc`` (`Inpatient File, Claims segment`_) 48 | - ``ipr`` (`Inpatient File, Revenue Center segment`_) 49 | - ``med`` (`MedPAR File`_) 50 | - ``opc`` (`Outpatient File, Claims segment`_) 51 | - ``opr`` (`Outpatient File, Revenue Center segment`_) 52 | - ``snfc`` (`Skilled Nursing Facility File, Claims segment`_) 53 | - ``snfr`` (`Skilled Nursing Facility File, Revenue Center segment`_) 54 | - ``xw`` (Crosswalks files for ``ehic`` - ``bene_id``) 55 | - ``xw_bsf`` (Crosswalks files for ``ehic`` - ``bene_id``) 56 | 57 | .. _`Beneficiary Summary File, Base segment`: https://kylebarron.github.io/medicare-documentation/resdac/mbsf/#base-abcd-segment_2 58 | .. _`Beneficiary Summary File, Chronic Conditions segment`: https://kylebarron.github.io/medicare-documentation/resdac/mbsf/#chronic-conditions-segment_2 59 | .. _`Beneficiary Summary File, Cost & Use segment`: https://kylebarron.github.io/medicare-documentation/resdac/mbsf/#cost-and-use-segment_1 60 | .. _`Beneficiary Summary File, National Death Index segment`: https://kylebarron.github.io/medicare-documentation/resdac/mbsf/#national-death-index-segment_1 61 | .. 
_`Carrier File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/carrier-rif/#carrier-rif_1 62 | .. _`Carrier File, Line segment`: https://kylebarron.github.io/medicare-documentation/resdac/carrier-rif/#line-file 63 | .. _`Durable Medical Equipment File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/dme-rif/#durable-medical-equipment-rif_1 64 | .. _`Durable Medical Equipment File, Line segment`: https://kylebarron.github.io/medicare-documentation/resdac/dme-rif/#line-file 65 | .. _`Home Health Agency File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/hha-rif/#home-health-agency-rif_1 66 | .. _`Home Health Agency File, Revenue Center segment`: https://kylebarron.github.io/medicare-documentation/resdac/hha-rif/#revenue-center-file 67 | .. _`Hospice File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/hospice-rif/#hospice-rif_1 68 | .. _`Hospice File, Revenue Center segment`: https://kylebarron.github.io/medicare-documentation/resdac/hospice-rif/#revenue-center-file 69 | .. _`Inpatient File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/ip-rif/#inpatient-rif_1 70 | .. _`Inpatient File, Revenue Center segment`: https://kylebarron.github.io/medicare-documentation/resdac/ip-rif/#revenue-center-file 71 | .. _`MedPAR File`: https://kylebarron.github.io/medicare-documentation/resdac/medpar-rif/#medpar-rif_1 72 | .. _`Outpatient File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/op-rif/#outpatient-rif_1 73 | .. _`Outpatient File, Revenue Center segment`: https://kylebarron.github.io/medicare-documentation/resdac/op-rif/#revenue-center-file 74 | .. _`Skilled Nursing Facility File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/snf-rif/#skilled-nursing-facility-rif_1 75 | .. 
_`Skilled Nursing Facility File, Revenue Center segment`: https://kylebarron.github.io/medicare-documentation/resdac/snf-rif/#revenue-center-file 76 | 77 | root_path: top of tree for file path 78 | extension: file extension 79 | new_style: 80 | If False, matches the file names at /disk/aging/medicare/data, if 81 | True, uses simplified directory structure. 82 | Returns: 83 | (str) path to file 84 | """ 85 | 86 | # Check types 87 | if type(data_type) != str: 88 | raise TypeError('data_type must be str') 89 | 90 | try: 91 | year = int(year) 92 | except ValueError: 93 | raise TypeError('Invalid year provided') 94 | 95 | allowed_pcts = ['0001', '01', '05', '20', '100'] 96 | if percent not in allowed_pcts: 97 | msg = f'percent must be one of: {allowed_pcts}' 98 | raise ValueError(msg) 99 | 100 | if extension == '': 101 | raise ValueError('Must provide valid extension') 102 | 103 | if extension[0] != '.': 104 | extension = '.' + extension 105 | 106 | root_path = Path(root_path).expanduser().resolve() 107 | root_path /= f'{percent}pct' 108 | if data_type in ['bsfab', 'bsfcc', 'bsfcu', 'bsfd', 'carc', 'carl', 'den', 109 | 'dmec', 'dmel', 'hhac', 'hhar', 'hosc', 'hosr', 'med', 110 | 'snfc', 'snfr']: 111 | root_path /= data_type[:3] 112 | elif data_type == 'xw_bsf' and not new_style: 113 | root_path /= 'bsf' 114 | else: 115 | root_path /= data_type[:2] 116 | 117 | if new_style: 118 | root_path /= f'{year}' 119 | if data_type == 'xw': 120 | root_path /= f'ehicbenex_one{year}{extension}' 121 | elif data_type == 'xw_bsf': 122 | root_path /= f'ehicbenex_unique{year}{extension}' 123 | else: 124 | root_path /= f'{data_type}{year}{extension}' 125 | else: 126 | root_path /= f'{year}' 127 | if data_type in ['den', 'dmec', 'dmel', 'hhac', 'hhar', 'hosc', 'hosr', 128 | 'med', 'snfc', 'snfr']: 129 | root_path /= f'{data_type}{year}{extension}' 130 | 131 | elif data_type in ['bsfab', 'bsfcc', 'bsfcu', 'bsfd']: 132 | root_path /= f'1/{data_type}{year}{extension}' 133 | 134 | elif 
import pytest
import pandas as pd
import medicare_utils as med


class TestGetCohortGetVarsToload(object):
    """Tests for MedicareDF._get_cohort_get_vars_toload().

    Checks that each cohort filter argument pulls in exactly the
    beneficiary-summary variables needed to apply it.
    """

    @pytest.fixture
    def init(self):
        # Baseline keyword arguments; each test overrides individual keys.
        return {
            'gender': None,
            'ages': None,
            'races': None,
            'race_col': 'race',
            'buyin_val': None,
            'hmo_val': None,
            'keep_vars': []}

    @pytest.fixture
    def mdf(self, year, percent):
        return med.MedicareDF(percent, year)

    @pytest.fixture(params=['0001', '01', '05', '20', '100'])
    def percent(self, request):
        return request.param

    @pytest.fixture(params=[2005, 2012])
    def year(self, request):
        return request.param

    def add_ehic(self, x, year):
        """Return the variable list, adding 'ehic' for pre-2006 years.

        Pre-2006 files are keyed on ehic rather than bene_id, so 'ehic'
        must also be loaded. Returns a new list; the input is not mutated.
        """
        if year >= 2006:
            return x
        return [*x, 'ehic']

    def assert_exp(self, mdf, init, exp, year):
        """Assert the computed set of variables to load equals ``exp``."""
        res = mdf._get_cohort_get_vars_toload(**init)
        exp = self.add_ehic(exp, year)
        assert set(res[year]) == set(exp)

    # Only need to adjust these inputs
    @pytest.mark.parametrize(
        'inputs,extra_vars',
        [
            ({'gender': '1'}, ['sex']),
            ({'ages': range(70, 80)}, ['age']),
            ({'races': ['1'], 'race_col': 'race'}, ['race']),
            ({'races': ['1'], 'race_col': 'rti_race_cd'}, ['rti_race_cd']),
            ({'buyin_val': ['1', '2']}, [f'buyin{str(i).zfill(2)}' for i in range(1, 13)]),
            ({'hmo_val': ['1', '2']}, [f'hmoind{str(i).zfill(2)}' for i in range(1, 13)]),
        ]) # yapf: disable
    def test_gender(self, year, mdf, init, inputs, extra_vars):
        init.update(inputs)
        exp = ['bene_id', *extra_vars]
        self.assert_exp(mdf, init, exp, year)


class TestGetCohortExtractEachYear(object):
    """Tests for a single year of cohort extraction."""

    @pytest.fixture
    def init(self):
        # Baseline keyword arguments; each test overrides individual keys.
        return {
            'gender': None,
            'ages': None,
            'races': None,
            'rti_race': False,
            'buyin_val': None,
            'hmo_val': None,
            'join': 'outer',
            'keep_vars': [],
            'dask': False,
            'verbose': False}

    @pytest.fixture
    def full_df(self):
        """Load the full 0.01% bsfab 2012 extract used as ground truth."""
        path = med.fpath(percent='0001', year=2012, data_type='bsfab')
        cols = [
            'bene_id', 'age', 'sex', 'race', 'rti_race_cd', 'buyin01',
            'buyin02', 'buyin03', 'buyin04', 'buyin05', 'buyin06', 'buyin07',
            'buyin08', 'buyin09', 'buyin10', 'buyin11', 'buyin12', 'hmoind01',
            'hmoind02', 'hmoind03', 'hmoind04', 'hmoind05', 'hmoind06',
            'hmoind07', 'hmoind08', 'hmoind09', 'hmoind10', 'hmoind11',
            'hmoind12']
        full_df = pd.read_parquet(path, columns=cols)
        print('Finished reading 0.01% bsfab data in 2012')
        return full_df

    def setup_df(
            self,
            gender,
            ages,
            races,
            rti_race,
            buyin_val,
            hmo_val,
            join,
            keep_vars,
            dask,
            verbose,
            year_type='calendar',
            pct='0001',
            year=2012):
        """Set up to run _get_cohort_extract_each_year()

        Replicate get_cohort methods up to call of _get_cohort_extract_each_year

        Returns:
            tuple of (MedicareDF instance, kwargs dict for
            _get_cohort_extract_each_year)
        """

        mdf = med.MedicareDF(pct, year, year_type=year_type)
        # Normalize/validate the raw arguments the same way get_cohort does.
        objs = mdf._get_cohort_type_check(
            gender=gender,
            ages=ages,
            races=races,
            rti_race=rti_race,
            buyin_val=buyin_val,
            hmo_val=hmo_val,
            join=join,
            keep_vars=keep_vars,
            dask=dask,
            verbose=verbose)

        toload_vars = mdf._get_cohort_get_vars_toload(
            objs.gender, objs.ages, objs.races, objs.race_col,
            objs.buyin_val, objs.hmo_val, objs.keep_vars)

        return mdf, {
            'year': year,
            'toload_vars': toload_vars[year],
            'nobs_dropped': {
                year: {}},
            'gender': objs.gender,
            'ages': objs.ages,
            'races': objs.races,
            'race_col': objs.race_col,
            'buyin_val': objs.buyin_val,
            'hmo_val': objs.hmo_val,
            'join': objs.join,
            'keep_vars': objs.keep_vars,
            'dask': objs.dask,
            'verbose': objs.verbose}

    @pytest.mark.parametrize(
        'attrs,values,exp_vars,exp_isin_vals',
        [
            (['gender'], ['m'], ['sex'], [['1']]),
            (['gender'], ['f'], ['sex'], [['2']]),
            (['gender'], [None], ['sex'], [['0', '1', '2']]),
            (['ages'], [range(75, 85)], ['age'], [range(75, 85)]),
            (
                ['ages'],
                [[75, 76, 77, 78, 79, 80, 81, 82, 83, 84]],
                ['age'],
                [range(75, 85)]),
            (['races', 'rti_race'], ['white', False], ['race'], [['1']]),
            (['races', 'rti_race'], ['black', False], ['race'], [['2']]),
            (['races', 'rti_race'], ['asian', False], ['race'], [['4']]),
            (
                ['races', 'rti_race'],
                [['white', 'black', 'asian'], False],
                ['race'],
                [['1', '2', '4']]),
            (['races', 'rti_race'], ['white', True], ['rti_race_cd'], [['1']]),
            (['races', 'rti_race'], ['black', True], ['rti_race_cd'], [['2']]),
            (['races', 'rti_race'], ['asian', True], ['rti_race_cd'], [['4']]),
            (
                ['races', 'rti_race'],
                [['white', 'black', 'asian'], True],
                ['rti_race_cd'],
                [['1', '2', '4']]),
            (['buyin_val'], ['1'], ['buyin'], [['1']]),
            (['buyin_val'], [['1', '2', '3']], ['buyin'], [['1', '2', '3']]),
            (
                ['buyin_val'],
                [['2', '3', 'B', 'C']],
                ['buyin'],
                [['2', '3', 'B', 'C']]),
            (['buyin_val'], [['3', 'C']], ['buyin'], [['3', 'C']]),
            (['hmo_val'], ['1'], ['hmoind'], [['1']]),
            (['hmo_val'], [['1', '2', '3']], ['hmoind'], [['1', '2', '3']]),
            (
                ['hmo_val'],
                [['2', '3', 'B', 'C']],
                ['hmoind'],
                [['2', '3', 'B', 'C']]),
            (['hmo_val'], [['3', 'C']], ['hmoind'], [['3', 'C']]),
            (
                ['gender', 'ages', 'races', 'rti_race', 'buyin_val'],
                ['m', range(67, 74), ['black', 'asian'], False, ['3', 'C']],
                ['sex', 'age', 'race', 'buyin'],
                [['1'], range(67, 74), ['2', '4'], ['3', 'C']]),
            (
                ['gender', 'ages', 'races', 'rti_race', 'buyin_val'],
                ['f', range(67, 85), ['white', 'hispanic'], True, ['3', 'C']],
                ['sex', 'age', 'rti_race_cd', 'buyin'],
                [['2'], range(67, 85), ['1', '5'], ['3', 'C']]),
        ]) # yapf: disable
    def test_df_is_expected(
            self, init, full_df, attrs, values, exp_vars, exp_isin_vals):
        """Extracted cohort ids must match an equivalent pandas query."""
        for attr, value in zip(attrs, values):
            init[attr] = value

        # NOTE: don't shadow the parametrize arg `attrs`; the returned dict
        # holds the keyword arguments for _get_cohort_extract_each_year.
        mdf, kwargs = self.setup_df(**init)
        pl, nobs_dropped = mdf._get_cohort_extract_each_year(**kwargs)
        pl = pl.index

        # Build the equivalent pandas query string for the same filters.
        query = []
        for exp_var, exp_isin_val in zip(exp_vars, exp_isin_vals):
            if isinstance(exp_isin_val, range):
                exp_isin_val = list(exp_isin_val)
            if exp_var in ['buyin', 'hmoind']:
                # Monthly indicator columns: buyin01..buyin12 / hmoind01..12.
                for i in range(1, 13):
                    j = str(i).zfill(2)
                    query.append(f'{exp_var}{j}.isin({exp_isin_val})')
            else:
                query.append(f'{exp_var}.isin({exp_isin_val})')

        query = ' & '.join(query)
        expected = full_df.query(query)['bene_id']

        expected = pd.Index(expected.sort_values())
        pl = pd.Index(pl.sort_values())

        assert expected.equals(pl)
"2012": { 34 | "format": "%15s", 35 | "name": "bene_id", 36 | "type": "str15" 37 | }, 38 | "2013": { 39 | "format": "%15s", 40 | "name": "bene_id", 41 | "type": "str15" 42 | }, 43 | "desc": "encrypted 723 beneficiary id" 44 | }, 45 | "claimindex": { 46 | "2002": { 47 | "format": "%12.0g", 48 | "name": "claimindex", 49 | "type": "long" 50 | }, 51 | "2003": { 52 | "format": "%12.0g", 53 | "name": "claimindex", 54 | "type": "long" 55 | }, 56 | "2004": { 57 | "format": "%12.0g", 58 | "name": "claimindex", 59 | "type": "long" 60 | }, 61 | "2005": { 62 | "format": "%12.0g", 63 | "name": "claimindex", 64 | "type": "long" 65 | }, 66 | "desc": "claimindex" 67 | }, 68 | "clm_id": { 69 | "2006": { 70 | "format": "%15s", 71 | "name": "clm_id", 72 | "type": "str15" 73 | }, 74 | "2007": { 75 | "format": "%15s", 76 | "name": "clm_id", 77 | "type": "str15" 78 | }, 79 | "2008": { 80 | "format": "%15s", 81 | "name": "clm_id", 82 | "type": "str15" 83 | }, 84 | "2009": { 85 | "format": "%15s", 86 | "name": "clm_id", 87 | "type": "str15" 88 | }, 89 | "2010": { 90 | "format": "%15s", 91 | "name": "clm_id", 92 | "type": "str15" 93 | }, 94 | "2011": { 95 | "format": "%15s", 96 | "name": "clm_id", 97 | "type": "str15" 98 | }, 99 | "2012": { 100 | "format": "%15s", 101 | "name": "clm_id", 102 | "type": "str15" 103 | }, 104 | "2013": { 105 | "format": "%15s", 106 | "name": "clm_id", 107 | "type": "str15" 108 | }, 109 | "desc": "encrypted claim id" 110 | }, 111 | "clm_ln": { 112 | "2002": { 113 | "format": "%8.0g", 114 | "name": "cntrindex", 115 | "type": "byte" 116 | }, 117 | "2003": { 118 | "format": "%8.0g", 119 | "name": "cntrindex", 120 | "type": "byte" 121 | }, 122 | "2004": { 123 | "format": "%8.0g", 124 | "name": "cntrindex", 125 | "type": "byte" 126 | }, 127 | "2005": { 128 | "format": "%8.0g", 129 | "name": "cntrindex", 130 | "type": "byte" 131 | }, 132 | "2006": { 133 | "format": "%8.0g", 134 | "name": "clm_ln", 135 | "type": "byte" 136 | }, 137 | "2007": { 138 | "format": "%8.0g", 
139 | "name": "clm_ln", 140 | "type": "byte" 141 | }, 142 | "2008": { 143 | "format": "%8.0g", 144 | "name": "clm_ln", 145 | "type": "byte" 146 | }, 147 | "2009": { 148 | "format": "%8.0g", 149 | "name": "clm_ln", 150 | "type": "int" 151 | }, 152 | "2010": { 153 | "format": "%8.0g", 154 | "name": "clm_ln", 155 | "type": "byte" 156 | }, 157 | "2011": { 158 | "format": "%8.0g", 159 | "name": "clm_ln", 160 | "type": "int" 161 | }, 162 | "2012": { 163 | "format": "%8.0g", 164 | "name": "clm_ln", 165 | "type": "byte" 166 | }, 167 | "2013": { 168 | "format": "%8.0g", 169 | "name": "clm_ln", 170 | "type": "byte" 171 | }, 172 | "desc": "claim line number" 173 | }, 174 | "clm_type": { 175 | "2006": { 176 | "format": "%2s", 177 | "name": "clm_type", 178 | "type": "str2" 179 | }, 180 | "2007": { 181 | "format": "%2s", 182 | "name": "clm_type", 183 | "type": "str2" 184 | }, 185 | "2008": { 186 | "format": "%2s", 187 | "name": "clm_type", 188 | "type": "str2" 189 | }, 190 | "2009": { 191 | "format": "%2s", 192 | "name": "clm_type", 193 | "type": "str2" 194 | }, 195 | "2010": { 196 | "format": "%2s", 197 | "name": "clm_type", 198 | "type": "str2" 199 | }, 200 | "2011": { 201 | "format": "%2s", 202 | "name": "clm_type", 203 | "type": "str2" 204 | }, 205 | "2012": { 206 | "format": "%2s", 207 | "name": "clm_type", 208 | "type": "str2" 209 | }, 210 | "2013": { 211 | "format": "%2s", 212 | "name": "clm_type", 213 | "type": "str2" 214 | }, 215 | "desc": "nch claim type code" 216 | }, 217 | "ehic": { 218 | "2002": { 219 | "format": "%11s", 220 | "name": "ehic", 221 | "type": "str11" 222 | }, 223 | "2003": { 224 | "format": "%11s", 225 | "name": "ehic", 226 | "type": "str11" 227 | }, 228 | "2004": { 229 | "format": "%11s", 230 | "name": "ehic", 231 | "type": "str11" 232 | }, 233 | "2005": { 234 | "format": "%11s", 235 | "name": "ehic", 236 | "type": "str11" 237 | }, 238 | "desc": "" 239 | }, 240 | "hcpcs_cd": { 241 | "2002": { 242 | "format": "%5s", 243 | "name": "hcpcs_cd", 244 | 
"type": "str5" 245 | }, 246 | "2003": { 247 | "format": "%5s", 248 | "name": "hcpcs_cd", 249 | "type": "str5" 250 | }, 251 | "2004": { 252 | "format": "%5s", 253 | "name": "hcpcs_cd", 254 | "type": "str5" 255 | }, 256 | "2005": { 257 | "format": "%5s", 258 | "name": "hcpcs_cd", 259 | "type": "str5" 260 | }, 261 | "2006": { 262 | "format": "%5s", 263 | "name": "hcpcs_cd", 264 | "type": "str5" 265 | }, 266 | "2007": { 267 | "format": "%5s", 268 | "name": "hcpcs_cd", 269 | "type": "str5" 270 | }, 271 | "2008": { 272 | "format": "%5s", 273 | "name": "hcpcs_cd", 274 | "type": "str5" 275 | }, 276 | "2009": { 277 | "format": "%5s", 278 | "name": "hcpcs_cd", 279 | "type": "str5" 280 | }, 281 | "2010": { 282 | "format": "%5s", 283 | "name": "hcpcs_cd", 284 | "type": "str5" 285 | }, 286 | "2011": { 287 | "format": "%5s", 288 | "name": "hcpcs_cd", 289 | "type": "str5" 290 | }, 291 | "2012": { 292 | "format": "%5s", 293 | "name": "hcpcs_cd", 294 | "type": "str5" 295 | }, 296 | "2013": { 297 | "format": "%5s", 298 | "name": "hcpcs_cd", 299 | "type": "str5" 300 | }, 301 | "desc": "revenue center healthcare common procedure coding system" 302 | }, 303 | "rev_chrg": { 304 | "2002": { 305 | "format": "%9.0g", 306 | "name": "rev_chrg", 307 | "type": "float" 308 | }, 309 | "2003": { 310 | "format": "%10.0g", 311 | "name": "rev_chrg", 312 | "type": "double" 313 | }, 314 | "2004": { 315 | "format": "%9.0g", 316 | "name": "rev_chrg", 317 | "type": "float" 318 | }, 319 | "2005": { 320 | "format": "%9.0g", 321 | "name": "rev_chrg", 322 | "type": "float" 323 | }, 324 | "2006": { 325 | "format": "%9.0g", 326 | "name": "rev_chrg", 327 | "type": "float" 328 | }, 329 | "2007": { 330 | "format": "%9.0g", 331 | "name": "rev_chrg", 332 | "type": "float" 333 | }, 334 | "2008": { 335 | "format": "%9.0g", 336 | "name": "rev_chrg", 337 | "type": "float" 338 | }, 339 | "2009": { 340 | "format": "%10.0g", 341 | "name": "rev_chrg", 342 | "type": "double" 343 | }, 344 | "2010": { 345 | "format": 
"%12.0g", 346 | "name": "rev_chrg", 347 | "type": "double" 348 | }, 349 | "2011": { 350 | "format": "%10.0g", 351 | "name": "rev_chrg", 352 | "type": "double" 353 | }, 354 | "2012": { 355 | "format": "%12.0g", 356 | "name": "rev_chrg", 357 | "type": "double" 358 | }, 359 | "2013": { 360 | "format": "%12.0g", 361 | "name": "rev_chrg", 362 | "type": "double" 363 | }, 364 | "desc": "revenue center total charge amount" 365 | }, 366 | "rev_cntr": { 367 | "2002": { 368 | "format": "%4s", 369 | "name": "rev_cntr", 370 | "type": "str4" 371 | }, 372 | "2003": { 373 | "format": "%4s", 374 | "name": "rev_cntr", 375 | "type": "str4" 376 | }, 377 | "2004": { 378 | "format": "%4s", 379 | "name": "rev_cntr", 380 | "type": "str4" 381 | }, 382 | "2005": { 383 | "format": "%4s", 384 | "name": "rev_cntr", 385 | "type": "str4" 386 | }, 387 | "2006": { 388 | "format": "%4s", 389 | "name": "rev_cntr", 390 | "type": "str4" 391 | }, 392 | "2007": { 393 | "format": "%4s", 394 | "name": "rev_cntr", 395 | "type": "str4" 396 | }, 397 | "2008": { 398 | "format": "%4s", 399 | "name": "rev_cntr", 400 | "type": "str4" 401 | }, 402 | "2009": { 403 | "format": "%4s", 404 | "name": "rev_cntr", 405 | "type": "str4" 406 | }, 407 | "2010": { 408 | "format": "%4s", 409 | "name": "rev_cntr", 410 | "type": "str4" 411 | }, 412 | "2011": { 413 | "format": "%4s", 414 | "name": "rev_cntr", 415 | "type": "str4" 416 | }, 417 | "2012": { 418 | "format": "%4s", 419 | "name": "rev_cntr", 420 | "type": "str4" 421 | }, 422 | "2013": { 423 | "format": "%4s", 424 | "name": "rev_cntr", 425 | "type": "str4" 426 | }, 427 | "desc": "revenue center code" 428 | }, 429 | "rev_cntr_ndc_qty": { 430 | "2010": { 431 | "format": "%8.0g", 432 | "name": "rev_cntr_ndc_qty", 433 | "type": "byte" 434 | }, 435 | "2011": { 436 | "format": "%10.0g", 437 | "name": "rev_cntr_ndc_qty", 438 | "type": "double" 439 | }, 440 | "2012": { 441 | "format": "%8.0g", 442 | "name": "rev_cntr_ndc_qty", 443 | "type": "byte" 444 | }, 445 | "2013": { 446 
| "format": "%8.0g", 447 | "name": "rev_cntr_ndc_qty", 448 | "type": "byte" 449 | }, 450 | "desc": "revenue center ndc quantity" 451 | }, 452 | "rev_cntr_ndc_qty_qlfr_cd": { 453 | "2010": { 454 | "format": "%1s", 455 | "name": "rev_cntr_ndc_qty_qlfr_cd", 456 | "type": "str1" 457 | }, 458 | "2011": { 459 | "format": "%2s", 460 | "name": "rev_cntr_ndc_qty_qlfr_cd", 461 | "type": "str2" 462 | }, 463 | "2012": { 464 | "format": "%1s", 465 | "name": "rev_cntr_ndc_qty_qlfr_cd", 466 | "type": "str1" 467 | }, 468 | "2013": { 469 | "format": "%1s", 470 | "name": "rev_cntr_ndc_qty_qlfr_cd", 471 | "type": "str1" 472 | }, 473 | "desc": "revenue center ndc quantity qualifier code" 474 | }, 475 | "rev_dt": { 476 | "2002": { 477 | "format": "%dD_m_Y", 478 | "name": "srev_dt", 479 | "type": "long" 480 | }, 481 | "2003": { 482 | "format": "%dD_m_Y", 483 | "name": "srev_dt", 484 | "type": "long" 485 | }, 486 | "2004": { 487 | "format": "%dD_m_Y", 488 | "name": "srev_dt", 489 | "type": "long" 490 | }, 491 | "2005": { 492 | "format": "%dD_m_Y", 493 | "name": "srev_dt", 494 | "type": "long" 495 | }, 496 | "desc": "204. 
revenue center date (sas yyyymmdd)" 497 | }, 498 | "rev_ncvr": { 499 | "2002": { 500 | "format": "%9.0g", 501 | "name": "rev_ncvr", 502 | "type": "float" 503 | }, 504 | "2003": { 505 | "format": "%9.0g", 506 | "name": "rev_ncvr", 507 | "type": "float" 508 | }, 509 | "2004": { 510 | "format": "%9.0g", 511 | "name": "rev_ncvr", 512 | "type": "float" 513 | }, 514 | "2005": { 515 | "format": "%9.0g", 516 | "name": "rev_ncvr", 517 | "type": "float" 518 | }, 519 | "2006": { 520 | "format": "%9.0g", 521 | "name": "rev_ncvr", 522 | "type": "float" 523 | }, 524 | "2007": { 525 | "format": "%9.0g", 526 | "name": "rev_ncvr", 527 | "type": "float" 528 | }, 529 | "2008": { 530 | "format": "%9.0g", 531 | "name": "rev_ncvr", 532 | "type": "float" 533 | }, 534 | "2009": { 535 | "format": "%9.0g", 536 | "name": "rev_ncvr", 537 | "type": "float" 538 | }, 539 | "2010": { 540 | "format": "%12.0g", 541 | "name": "rev_ncvr", 542 | "type": "double" 543 | }, 544 | "2011": { 545 | "format": "%10.0g", 546 | "name": "rev_ncvr", 547 | "type": "double" 548 | }, 549 | "2012": { 550 | "format": "%12.0g", 551 | "name": "rev_ncvr", 552 | "type": "double" 553 | }, 554 | "2013": { 555 | "format": "%12.0g", 556 | "name": "rev_ncvr", 557 | "type": "double" 558 | }, 559 | "desc": "revenue center non-covered charge amount" 560 | }, 561 | "rev_rate": { 562 | "2002": { 563 | "format": "%9.0g", 564 | "name": "rev_rate", 565 | "type": "float" 566 | }, 567 | "2003": { 568 | "format": "%9.0g", 569 | "name": "rev_rate", 570 | "type": "float" 571 | }, 572 | "2004": { 573 | "format": "%9.0g", 574 | "name": "rev_rate", 575 | "type": "float" 576 | }, 577 | "2005": { 578 | "format": "%9.0g", 579 | "name": "rev_rate", 580 | "type": "float" 581 | }, 582 | "2006": { 583 | "format": "%9.0g", 584 | "name": "rev_rate", 585 | "type": "float" 586 | }, 587 | "2007": { 588 | "format": "%9.0g", 589 | "name": "rev_rate", 590 | "type": "float" 591 | }, 592 | "2008": { 593 | "format": "%9.0g", 594 | "name": "rev_rate", 595 | 
"type": "float" 596 | }, 597 | "2009": { 598 | "format": "%9.0g", 599 | "name": "rev_rate", 600 | "type": "float" 601 | }, 602 | "2010": { 603 | "format": "%12.0g", 604 | "name": "rev_rate", 605 | "type": "double" 606 | }, 607 | "2011": { 608 | "format": "%10.0g", 609 | "name": "rev_rate", 610 | "type": "double" 611 | }, 612 | "2012": { 613 | "format": "%12.0g", 614 | "name": "rev_rate", 615 | "type": "double" 616 | }, 617 | "2013": { 618 | "format": "%12.0g", 619 | "name": "rev_rate", 620 | "type": "double" 621 | }, 622 | "desc": "revenue center rate amount" 623 | }, 624 | "rev_unit": { 625 | "2002": { 626 | "format": "%8.0g", 627 | "name": "rev_unit", 628 | "type": "int" 629 | }, 630 | "2003": { 631 | "format": "%8.0g", 632 | "name": "rev_unit", 633 | "type": "int" 634 | }, 635 | "2004": { 636 | "format": "%8.0g", 637 | "name": "rev_unit", 638 | "type": "int" 639 | }, 640 | "2005": { 641 | "format": "%8.0g", 642 | "name": "rev_unit", 643 | "type": "int" 644 | }, 645 | "2006": { 646 | "format": "%8.0g", 647 | "name": "rev_unit", 648 | "type": "int" 649 | }, 650 | "2007": { 651 | "format": "%8.0g", 652 | "name": "rev_unit", 653 | "type": "int" 654 | }, 655 | "2008": { 656 | "format": "%8.0g", 657 | "name": "rev_unit", 658 | "type": "int" 659 | }, 660 | "2009": { 661 | "format": "%8.0g", 662 | "name": "rev_unit", 663 | "type": "int" 664 | }, 665 | "2010": { 666 | "format": "%8.0g", 667 | "name": "rev_unit", 668 | "type": "int" 669 | }, 670 | "2011": { 671 | "format": "%12.0g", 672 | "name": "rev_unit", 673 | "type": "long" 674 | }, 675 | "2012": { 676 | "format": "%8.0g", 677 | "name": "rev_unit", 678 | "type": "int" 679 | }, 680 | "2013": { 681 | "format": "%8.0g", 682 | "name": "rev_unit", 683 | "type": "int" 684 | }, 685 | "desc": "revenue center unit count" 686 | }, 687 | "revdedcd": { 688 | "2002": { 689 | "format": "%1s", 690 | "name": "revdedcd", 691 | "type": "str1" 692 | }, 693 | "2003": { 694 | "format": "%1s", 695 | "name": "revdedcd", 696 | "type": 
"str1" 697 | }, 698 | "2004": { 699 | "format": "%1s", 700 | "name": "revdedcd", 701 | "type": "str1" 702 | }, 703 | "2005": { 704 | "format": "%1s", 705 | "name": "revdedcd", 706 | "type": "str1" 707 | }, 708 | "2006": { 709 | "format": "%1s", 710 | "name": "revdedcd", 711 | "type": "str1" 712 | }, 713 | "2007": { 714 | "format": "%1s", 715 | "name": "revdedcd", 716 | "type": "str1" 717 | }, 718 | "2008": { 719 | "format": "%1s", 720 | "name": "revdedcd", 721 | "type": "str1" 722 | }, 723 | "2009": { 724 | "format": "%1s", 725 | "name": "revdedcd", 726 | "type": "str1" 727 | }, 728 | "2010": { 729 | "format": "%1s", 730 | "name": "revdedcd", 731 | "type": "str1" 732 | }, 733 | "2011": { 734 | "format": "%1s", 735 | "name": "revdedcd", 736 | "type": "str1" 737 | }, 738 | "2012": { 739 | "format": "%1s", 740 | "name": "revdedcd", 741 | "type": "str1" 742 | }, 743 | "2013": { 744 | "format": "%1s", 745 | "name": "revdedcd", 746 | "type": "str1" 747 | }, 748 | "desc": "revenue center deductible coinsurance code" 749 | }, 750 | "rndrng_physn_npi": { 751 | "2010": { 752 | "format": "%1s", 753 | "name": "rndrng_physn_npi", 754 | "type": "str1" 755 | }, 756 | "2011": { 757 | "format": "%1s", 758 | "name": "rndrng_physn_npi", 759 | "type": "str1" 760 | }, 761 | "2012": { 762 | "format": "%1s", 763 | "name": "rndrng_physn_npi", 764 | "type": "str1" 765 | }, 766 | "2013": { 767 | "format": "%1s", 768 | "name": "rndrng_physn_npi", 769 | "type": "str1" 770 | }, 771 | "desc": "revenue center rendering physician npi" 772 | }, 773 | "rndrng_physn_upin": { 774 | "2010": { 775 | "format": "%1s", 776 | "name": "rndrng_physn_upin", 777 | "type": "str1" 778 | }, 779 | "2011": { 780 | "format": "%1s", 781 | "name": "rndrng_physn_upin", 782 | "type": "str1" 783 | }, 784 | "2012": { 785 | "format": "%1s", 786 | "name": "rndrng_physn_upin", 787 | "type": "str1" 788 | }, 789 | "2013": { 790 | "format": "%1s", 791 | "name": "rndrng_physn_upin", 792 | "type": "str1" 793 | }, 794 | "desc": 
"revenue center rendering physician upin" 795 | }, 796 | "thru_dt": { 797 | "2006": { 798 | "format": "%dD_m_Y", 799 | "name": "thru_dt", 800 | "type": "long" 801 | }, 802 | "2007": { 803 | "format": "%dD_m_Y", 804 | "name": "thru_dt", 805 | "type": "long" 806 | }, 807 | "2008": { 808 | "format": "%dD_m_Y", 809 | "name": "thru_dt", 810 | "type": "long" 811 | }, 812 | "2009": { 813 | "format": "%d", 814 | "name": "thru_dt", 815 | "type": "long" 816 | }, 817 | "2010": { 818 | "format": "%tdD_m_Y", 819 | "name": "thru_dt", 820 | "type": "long" 821 | }, 822 | "2011": { 823 | "format": "%tdD_m_Y", 824 | "name": "thru_dt", 825 | "type": "long" 826 | }, 827 | "2012": { 828 | "format": "%tdD_m_Y", 829 | "name": "thru_dt", 830 | "type": "long" 831 | }, 832 | "2013": { 833 | "format": "%tdD_m_Y", 834 | "name": "thru_dt", 835 | "type": "long" 836 | }, 837 | "desc": "claim through date (determines year of claim)" 838 | } 839 | } -------------------------------------------------------------------------------- /medicare_utils/metadata/codebook/bsfab.json: -------------------------------------------------------------------------------- 1 | { 2 | "enrl_src": { 3 | "name": "Source of enrollment data", 4 | "values": { 5 | "EDB": "Enrollment Database", 6 | "CME": "Common Medicare Environment" 7 | } 8 | }, 9 | "sample_group": { 10 | "name": "Medicare Sample Group Indicator", 11 | "values": {} 12 | }, 13 | "efivepct": { 14 | "name": "Enhanced Medicare 5% Sample Indicator", 15 | "values": { 16 | "Y": "Yes, included in enhanced 5% sample", 17 | "NULL": "Not included in enhanced 5% sample" 18 | } 19 | }, 20 | "crnt_bic": { 21 | "name": "Current Beneficiary Identification Code", 22 | "values": {} 23 | }, 24 | "state_cd": { 25 | "name": "State code for beneficiary (SSA code)", 26 | "values": { 27 | "01": "Alabama", 28 | "02": "Alaska", 29 | "03": "Arizona", 30 | "04": "Arkansas", 31 | "05": "California", 32 | "06": "Colorado", 33 | "07": "Connecticut", 34 | "08": "Delaware", 35 | "09": 
"District of Columbia", 36 | "10": "Florida", 37 | "11": "Georgia", 38 | "12": "Hawaii", 39 | "13": "Idaho", 40 | "14": "Illinois", 41 | "15": "Indiana", 42 | "16": "Iowa", 43 | "17": "Kansas", 44 | "18": "Kentucky", 45 | "19": "Louisiana", 46 | "20": "Maine", 47 | "21": "Maryland", 48 | "22": "Massachusetts", 49 | "23": "Michigan", 50 | "24": "Minnesota", 51 | "25": "Mississippi", 52 | "26": "Missouri", 53 | "27": "Montana", 54 | "28": "Nebraska", 55 | "29": "Nevada", 56 | "30": "New Hampshire", 57 | "31": "New Jersey", 58 | "32": "New Mexico", 59 | "33": "New York", 60 | "34": "North Carolina", 61 | "35": "North Dakota", 62 | "36": "Ohio", 63 | "37": "Oklahoma", 64 | "38": "Oregon", 65 | "39": "Pennsylvania", 66 | "40": "Puerto Rico", 67 | "41": "Rhode Island", 68 | "42": "South Carolina", 69 | "43": "South Dakota", 70 | "44": "Tennessee", 71 | "45": "Texas", 72 | "46": "Utah", 73 | "47": "Vermont", 74 | "48": "Virgin Islands", 75 | "49": "Virginia", 76 | "50": "Washington", 77 | "51": "West Virginia", 78 | "52": "Wisconsin", 79 | "53": "Wyoming", 80 | "54": "Africa", 81 | "55": "California", 82 | "56": "Canada & Islands", 83 | "57": "Central America and West Indies", 84 | "58": "Europe", 85 | "59": "Mexico", 86 | "60": "Oceania", 87 | "61": "Philippines", 88 | "62": "South America", 89 | "63": "U.S. 
Possessions", 90 | "64": "American Samoa", 91 | "65": "Guam", 92 | "66": "Commonwealth of the Northern Marianas Islands", 93 | "67": "Texas", 94 | "68": "Florida", 95 | "69": "Florida", 96 | "70": "Kansas", 97 | "71": "Louisiana", 98 | "72": "Ohio", 99 | "73": "Pennsylvania", 100 | "74": "Texas", 101 | "80": "Maryland", 102 | "97": "Northern Marianas", 103 | "98": "Guam", 104 | "99": "With 000 county code is American Samoa; otherwise unknown" 105 | } 106 | }, 107 | "v_dod_sw": { 108 | "name": "Valid Date of Death Switch", 109 | "values": { 110 | "Null": "Default", 111 | "V": "Valid death date" 112 | } 113 | }, 114 | "sex": { 115 | "name": "Sex", 116 | "values": { 117 | "0": "Unknown", 118 | "1": "Male", 119 | "2": "Female" 120 | } 121 | }, 122 | "race": { 123 | "name": "Beneficiary Race Code", 124 | "values": { 125 | "0": "Unknown", 126 | "1": "White", 127 | "2": "Black", 128 | "3": "Other", 129 | "4": "Asian", 130 | "5": "Hispanic", 131 | "6": "North American Native" 132 | } 133 | }, 134 | "rti_race_cd": { 135 | "name": "Research Triangle Institute (RTI) Race Code", 136 | "values": { 137 | "0": "Unknown", 138 | "1": "Non-Hispanic White", 139 | "2": "Black (or African-American)", 140 | "3": "Other", 141 | "4": "Asian Pacific Islander", 142 | "5": "Hispanic", 143 | "6": "American Indian Alaska Native" 144 | } 145 | }, 146 | "orec": { 147 | "name": "Original Reason for Entitlement Code", 148 | "values": { 149 | "0": "Old Age and Survivors Insurance (OASI)", 150 | "1": "Disability Insurance Benefits (DIB)", 151 | "2": "End-stage Renal Disease (ESRD)", 152 | "3": "Both DIB and ESRD" 153 | } 154 | }, 155 | "crec": { 156 | "name": "Current Reason for Entitlement Code", 157 | "values": { 158 | "0": "Old Age and Survivors Insurance (OASI)", 159 | "1": "Disability Insurance Benefits (DIB)", 160 | "2": "End-stage Renal Disease (ESRD)", 161 | "3": "Both DIB and ESRD" 162 | } 163 | }, 164 | "esrd_ind": { 165 | "name": "End-stage Renal Disease (ESRD) Indicator", 166 | "values": 
{ 167 | "Y": "The beneficiary has ESRD", 168 | "0": "The beneficiary does not have ESRD" 169 | } 170 | }, 171 | "mdcr_stus_cd_01": { 172 | "name": "Medicare Status Code - January", 173 | "values": { 174 | "10": "Aged without ESRD", 175 | "11": "Aged with ESRD", 176 | "20": "Disabled without ESRD", 177 | "21": "Disabled with ESRD", 178 | "31": "ESRD only" 179 | } 180 | }, 181 | "mdcr_stus_cd_02": { 182 | "name": "Medicare Status Code - February", 183 | "values": { 184 | "10": "Aged without ESRD", 185 | "11": "Aged with ESRD", 186 | "20": "Disabled without ESRD", 187 | "21": "Disabled with ESRD", 188 | "31": "ESRD only" 189 | } 190 | }, 191 | "mdcr_stus_cd_03": { 192 | "name": "Medicare Status Code - March", 193 | "values": { 194 | "10": "Aged without ESRD", 195 | "11": "Aged with ESRD", 196 | "20": "Disabled without ESRD", 197 | "21": "Disabled with ESRD", 198 | "31": "ESRD only" 199 | } 200 | }, 201 | "mdcr_stus_cd_04": { 202 | "name": "Medicare Status Code - April", 203 | "values": { 204 | "10": "Aged without ESRD", 205 | "11": "Aged with ESRD", 206 | "20": "Disabled without ESRD", 207 | "21": "Disabled with ESRD", 208 | "31": "ESRD only" 209 | } 210 | }, 211 | "mdcr_stus_cd_05": { 212 | "name": "Medicare Status Code - May", 213 | "values": { 214 | "10": "Aged without ESRD", 215 | "11": "Aged with ESRD", 216 | "20": "Disabled without ESRD", 217 | "21": "Disabled with ESRD", 218 | "31": "ESRD only" 219 | } 220 | }, 221 | "mdcr_stus_cd_06": { 222 | "name": "Medicare Status Code - June", 223 | "values": { 224 | "10": "Aged without ESRD", 225 | "11": "Aged with ESRD", 226 | "20": "Disabled without ESRD", 227 | "21": "Disabled with ESRD", 228 | "31": "ESRD only" 229 | } 230 | }, 231 | "mdcr_stus_cd_07": { 232 | "name": "Medicare Status Code - July", 233 | "values": { 234 | "10": "Aged without ESRD", 235 | "11": "Aged with ESRD", 236 | "20": "Disabled without ESRD", 237 | "21": "Disabled with ESRD", 238 | "31": "ESRD only" 239 | } 240 | }, 241 | "mdcr_stus_cd_08": { 242 
| "name": "Medicare Status Code - August", 243 | "values": { 244 | "10": "Aged without ESRD", 245 | "11": "Aged with ESRD", 246 | "20": "Disabled without ESRD", 247 | "21": "Disabled with ESRD", 248 | "31": "ESRD only" 249 | } 250 | }, 251 | "mdcr_stus_cd_09": { 252 | "name": "Medicare Status Code - September", 253 | "values": { 254 | "10": "Aged without ESRD", 255 | "11": "Aged with ESRD", 256 | "20": "Disabled without ESRD", 257 | "21": "Disabled with ESRD", 258 | "31": "ESRD only" 259 | } 260 | }, 261 | "mdcr_stus_cd_10": { 262 | "name": "Medicare Status Code - October", 263 | "values": { 264 | "10": "Aged without ESRD", 265 | "11": "Aged with ESRD", 266 | "20": "Disabled without ESRD", 267 | "21": "Disabled with ESRD", 268 | "31": "ESRD only" 269 | } 270 | }, 271 | "mdcr_stus_cd_11": { 272 | "name": "Medicare Status Code - November", 273 | "values": { 274 | "10": "Aged without ESRD", 275 | "11": "Aged with ESRD", 276 | "20": "Disabled without ESRD", 277 | "21": "Disabled with ESRD", 278 | "31": "ESRD only" 279 | } 280 | }, 281 | "mdcr_stus_cd_12": { 282 | "name": "Medicare Status Code - December", 283 | "values": { 284 | "10": "Aged without ESRD", 285 | "11": "Aged with ESRD", 286 | "20": "Disabled without ESRD", 287 | "21": "Disabled with ESRD", 288 | "31": "ESRD only" 289 | } 290 | }, 291 | "a_trm_cd": { 292 | "name": "Part A Termination Code", 293 | "values": { 294 | "0": "Not terminated", 295 | "1": "Dead", 296 | "2": "Non-payment of premium", 297 | "3": "Voluntary withdrawl", 298 | "9": "Other termination" 299 | } 300 | }, 301 | "b_trm_cd": { 302 | "name": "Part B Termination Code", 303 | "values": { 304 | "0": "Not terminated", 305 | "1": "Dead", 306 | "2": "Non-payment of premium", 307 | "3": "Voluntary withdrawl", 308 | "9": "Other termination" 309 | } 310 | }, 311 | "buyin01": { 312 | "name": "Medicare Entitlement/Buy-In Indicator - January", 313 | "values": { 314 | "0": "Not entitled", 315 | "1": "Part A only", 316 | "2": "Part B only", 317 | "3": 
"Part A and Part B", 318 | "A": "Part A state buy-in", 319 | "B": "Part B state buy-in", 320 | "C": "Part A and Part B state buy-in" 321 | } 322 | }, 323 | "buyin02": { 324 | "name": "Medicare Entitlement/Buy-In Indicator - February", 325 | "values": { 326 | "0": "Not entitled", 327 | "1": "Part A only", 328 | "2": "Part B only", 329 | "3": "Part A and Part B", 330 | "A": "Part A state buy-in", 331 | "B": "Part B state buy-in", 332 | "C": "Part A and Part B state buy-in" 333 | } 334 | }, 335 | "buyin03": { 336 | "name": "Medicare Entitlement/Buy-In Indicator - March", 337 | "values": { 338 | "0": "Not entitled", 339 | "1": "Part A only", 340 | "2": "Part B only", 341 | "3": "Part A and Part B", 342 | "A": "Part A state buy-in", 343 | "B": "Part B state buy-in", 344 | "C": "Part A and Part B state buy-in" 345 | } 346 | }, 347 | "buyin04": { 348 | "name": "Medicare Entitlement/Buy-In Indicator - April", 349 | "values": { 350 | "0": "Not entitled", 351 | "1": "Part A only", 352 | "2": "Part B only", 353 | "3": "Part A and Part B", 354 | "A": "Part A state buy-in", 355 | "B": "Part B state buy-in", 356 | "C": "Part A and Part B state buy-in" 357 | } 358 | }, 359 | "buyin05": { 360 | "name": "Medicare Entitlement/Buy-In Indicator - May", 361 | "values": { 362 | "0": "Not entitled", 363 | "1": "Part A only", 364 | "2": "Part B only", 365 | "3": "Part A and Part B", 366 | "A": "Part A state buy-in", 367 | "B": "Part B state buy-in", 368 | "C": "Part A and Part B state buy-in" 369 | } 370 | }, 371 | "buyin06": { 372 | "name": "Medicare Entitlement/Buy-In Indicator - June", 373 | "values": { 374 | "0": "Not entitled", 375 | "1": "Part A only", 376 | "2": "Part B only", 377 | "3": "Part A and Part B", 378 | "A": "Part A state buy-in", 379 | "B": "Part B state buy-in", 380 | "C": "Part A and Part B state buy-in" 381 | } 382 | }, 383 | "buyin07": { 384 | "name": "Medicare Entitlement/Buy-In Indicator - July", 385 | "values": { 386 | "0": "Not entitled", 387 | "1": "Part A 
only", 388 | "2": "Part B only", 389 | "3": "Part A and Part B", 390 | "A": "Part A state buy-in", 391 | "B": "Part B state buy-in", 392 | "C": "Part A and Part B state buy-in" 393 | } 394 | }, 395 | "buyin08": { 396 | "name": "Medicare Entitlement/Buy-In Indicator - August", 397 | "values": { 398 | "0": "Not entitled", 399 | "1": "Part A only", 400 | "2": "Part B only", 401 | "3": "Part A and Part B", 402 | "A": "Part A state buy-in", 403 | "B": "Part B state buy-in", 404 | "C": "Part A and Part B state buy-in" 405 | } 406 | }, 407 | "buyin09": { 408 | "name": "Medicare Entitlement/Buy-In Indicator - September", 409 | "values": { 410 | "0": "Not entitled", 411 | "1": "Part A only", 412 | "2": "Part B only", 413 | "3": "Part A and Part B", 414 | "A": "Part A state buy-in", 415 | "B": "Part B state buy-in", 416 | "C": "Part A and Part B state buy-in" 417 | } 418 | }, 419 | "buyin10": { 420 | "name": "Medicare Entitlement/Buy-In Indicator - October", 421 | "values": { 422 | "0": "Not entitled", 423 | "1": "Part A only", 424 | "2": "Part B only", 425 | "3": "Part A and Part B", 426 | "A": "Part A state buy-in", 427 | "B": "Part B state buy-in", 428 | "C": "Part A and Part B state buy-in" 429 | } 430 | }, 431 | "buyin11": { 432 | "name": "Medicare Entitlement/Buy-In Indicator - November", 433 | "values": { 434 | "0": "Not entitled", 435 | "1": "Part A only", 436 | "2": "Part B only", 437 | "3": "Part A and Part B", 438 | "A": "Part A state buy-in", 439 | "B": "Part B state buy-in", 440 | "C": "Part A and Part B state buy-in" 441 | } 442 | }, 443 | "buyin12": { 444 | "name": "Medicare Entitlement/Buy-In Indicator - December", 445 | "values": { 446 | "0": "Not entitled", 447 | "1": "Part A only", 448 | "2": "Part B only", 449 | "3": "Part A and Part B", 450 | "A": "Part A state buy-in", 451 | "B": "Part B state buy-in", 452 | "C": "Part A and Part B state buy-in" 453 | } 454 | }, 455 | "hmoind01": { 456 | "name": "HMO Indicator - January", 457 | "values": { 458 | "0": 
"Not a member of an HMO", 459 | "1": "Non-lock-in, CMS to process provider claims", 460 | "2": "Non-lock-in, group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 461 | "4": "Fee-for-service participant in case or disease management demonstration project", 462 | "5": "Not in documentation", 463 | "A": "Lock-in, CMS to process provider claims", 464 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 465 | "C": "Lock-in, GHO to process all provider claims" 466 | } 467 | }, 468 | "hmoind02": { 469 | "name": "HMO Indicator - February", 470 | "values": { 471 | "0": "Not a member of an HMO", 472 | "1": "Non-lock-in, CMS to process provider claims", 473 | "2": "Non-lock-in, group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 474 | "4": "Fee-for-service participant in case or disease management demonstration project", 475 | "5": "Not in documentation", 476 | "A": "Lock-in, CMS to process provider claims", 477 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 478 | "C": "Lock-in, GHO to process all provider claims" 479 | } 480 | }, 481 | "hmoind03": { 482 | "name": "HMO Indicator - March", 483 | "values": { 484 | "0": "Not a member of an HMO", 485 | "1": "Non-lock-in, CMS to process provider claims", 486 | "2": "Non-lock-in, group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 487 | "4": "Fee-for-service participant in case or disease management demonstration project", 488 | "5": "Not in documentation", 489 | "A": "Lock-in, CMS to process provider claims", 490 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 491 | "C": "Lock-in, GHO to process all provider claims" 492 | } 493 | }, 494 | "hmoind04": { 495 | "name": "HMO Indicator - April", 496 | "values": { 497 | "0": "Not a member of an HMO", 498 | "1": "Non-lock-in, CMS to process provider claims", 499 | "2": "Non-lock-in, 
group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 500 | "4": "Fee-for-service participant in case or disease management demonstration project", 501 | "5": "Not in documentation", 502 | "A": "Lock-in, CMS to process provider claims", 503 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 504 | "C": "Lock-in, GHO to process all provider claims" 505 | } 506 | }, 507 | "hmoind05": { 508 | "name": "HMO Indicator - May", 509 | "values": { 510 | "0": "Not a member of an HMO", 511 | "1": "Non-lock-in, CMS to process provider claims", 512 | "2": "Non-lock-in, group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 513 | "4": "Fee-for-service participant in case or disease management demonstration project", 514 | "5": "Not in documentation", 515 | "A": "Lock-in, CMS to process provider claims", 516 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 517 | "C": "Lock-in, GHO to process all provider claims" 518 | } 519 | }, 520 | "hmoind06": { 521 | "name": "HMO Indicator - June", 522 | "values": { 523 | "0": "Not a member of an HMO", 524 | "1": "Non-lock-in, CMS to process provider claims", 525 | "2": "Non-lock-in, group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 526 | "4": "Fee-for-service participant in case or disease management demonstration project", 527 | "5": "Not in documentation", 528 | "A": "Lock-in, CMS to process provider claims", 529 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 530 | "C": "Lock-in, GHO to process all provider claims" 531 | } 532 | }, 533 | "hmoind07": { 534 | "name": "HMO Indicator - July", 535 | "values": { 536 | "0": "Not a member of an HMO", 537 | "1": "Non-lock-in, CMS to process provider claims", 538 | "2": "Non-lock-in, group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 539 | "4": 
"Fee-for-service participant in case or disease management demonstration project", 540 | "5": "Not in documentation", 541 | "A": "Lock-in, CMS to process provider claims", 542 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 543 | "C": "Lock-in, GHO to process all provider claims" 544 | } 545 | }, 546 | "hmoind08": { 547 | "name": "HMO Indicator - August", 548 | "values": { 549 | "0": "Not a member of an HMO", 550 | "1": "Non-lock-in, CMS to process provider claims", 551 | "2": "Non-lock-in, group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 552 | "4": "Fee-for-service participant in case or disease management demonstration project", 553 | "5": "Not in documentation", 554 | "A": "Lock-in, CMS to process provider claims", 555 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 556 | "C": "Lock-in, GHO to process all provider claims" 557 | } 558 | }, 559 | "hmoind09": { 560 | "name": "HMO Indicator - September", 561 | "values": { 562 | "0": "Not a member of an HMO", 563 | "1": "Non-lock-in, CMS to process provider claims", 564 | "2": "Non-lock-in, group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 565 | "4": "Fee-for-service participant in case or disease management demonstration project", 566 | "5": "Not in documentation", 567 | "A": "Lock-in, CMS to process provider claims", 568 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 569 | "C": "Lock-in, GHO to process all provider claims" 570 | } 571 | }, 572 | "hmoind10": { 573 | "name": "HMO Indicator - October", 574 | "values": { 575 | "0": "Not a member of an HMO", 576 | "1": "Non-lock-in, CMS to process provider claims", 577 | "2": "Non-lock-in, group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 578 | "4": "Fee-for-service participant in case or disease management demonstration project", 579 | "5": "Not in 
documentation", 580 | "A": "Lock-in, CMS to process provider claims", 581 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 582 | "C": "Lock-in, GHO to process all provider claims" 583 | } 584 | }, 585 | "hmoind11": { 586 | "name": "HMO Indicator - November", 587 | "values": { 588 | "0": "Not a member of an HMO", 589 | "1": "Non-lock-in, CMS to process provider claims", 590 | "2": "Non-lock-in, group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 591 | "4": "Fee-for-service participant in case or disease management demonstration project", 592 | "5": "Not in documentation", 593 | "A": "Lock-in, CMS to process provider claims", 594 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 595 | "C": "Lock-in, GHO to process all provider claims" 596 | } 597 | }, 598 | "hmoind12": { 599 | "name": "HMO Indicator - December", 600 | "values": { 601 | "0": "Not a member of an HMO", 602 | "1": "Non-lock-in, CMS to process provider claims", 603 | "2": "Non-lock-in, group health organization (GHO; MA plan) to process in plan Part A and in area Part B claims", 604 | "4": "Fee-for-service participant in case or disease management demonstration project", 605 | "5": "Not in documentation", 606 | "A": "Lock-in, CMS to process provider claims", 607 | "B": "Lock-in, GHO to process in plan Part A and in area Part B claims", 608 | "C": "Lock-in, GHO to process all provider claims" 609 | } 610 | } 611 | } -------------------------------------------------------------------------------- /medicare_utils/metadata/codebook/med.json: -------------------------------------------------------------------------------- 1 | { 2 | "clm_type": { 3 | "name": "MEDPAR NCH Claim Type Code", 4 | "values": { 5 | "10": "HHA claim", 6 | "20": "Non swing bed SNF claim", 7 | "30": "Swing bed SNF claim", 8 | "40": "Outpatient claim", 9 | "50": "Hospice claim", 10 | "60": "Inpatient claim", 11 | "61": "Inpatient 'Full-Encounter' 
claim", 12 | "62": "Medicare Advantage IME/GME claims", 13 | "63": "Medicare Advantage (no-pay) claims", 14 | "64": "Medicare Advantage (paid as FFS) claim", 15 | "71": "RIC O local carrier non-DMEPOS claim", 16 | "72": "RIC O local carrier DMEPOS claim", 17 | "81": "RIC M DMERC non-DMEPOS claim", 18 | "82": "RIC M DMERC DMEPOS claim" 19 | } 20 | }, 21 | "sex": { 22 | "name": "MEDPAR Beneficiary Sex Code", 23 | "values": { 24 | "0": "Unknown", 25 | "2": "Female", 26 | "1": "Male" 27 | } 28 | }, 29 | "race": { 30 | "name": "MEDPAR Beneficiary Race Code", 31 | "values": { 32 | "1": "White", 33 | "2": "Black", 34 | "3": "Other", 35 | "4": "Asian", 36 | "5": "Hispanic", 37 | "6": "North American Native", 38 | "0": "Unknown" 39 | } 40 | }, 41 | "ms_cd": { 42 | "name": "MEDPAR Beneficiary Medicare Status Code", 43 | "values": { 44 | "10": "Aged without ESRD", 45 | "11": "Aged with ESRD", 46 | "20": "Disabled without ESRD", 47 | "21": "Disabled with ESRD", 48 | "31": "ESRD only" 49 | } 50 | }, 51 | "state_cd": { 52 | "name": "MEDPAR Beneficiary Residence SSA Standard State Code", 53 | "values": { 54 | "01": "Alabama", 55 | "02": "Alaska", 56 | "03": "Arizona", 57 | "04": "Arkansas", 58 | "05": "California", 59 | "06": "Colorado", 60 | "07": "Connecticut", 61 | "08": "Delaware", 62 | "09": "District of Columbia", 63 | "10": "Florida", 64 | "11": "Georgia", 65 | "12": "Hawaii", 66 | "13": "Idaho", 67 | "14": "Illinois", 68 | "15": "Indiana", 69 | "16": "Iowa", 70 | "17": "Kansas", 71 | "18": "Kentucky", 72 | "19": "Louisiana", 73 | "20": "Maine", 74 | "21": "Maryland", 75 | "22": "Massachusetts", 76 | "23": "Michigan", 77 | "24": "Minnesota", 78 | "25": "Mississippi", 79 | "26": "Missouri", 80 | "27": "Montana", 81 | "28": "Nebraska", 82 | "29": "Nevada", 83 | "30": "New Hampshire", 84 | "31": "New Jersey", 85 | "32": "New Mexico", 86 | "33": "New York", 87 | "34": "North Carolina", 88 | "35": "North Dakota", 89 | "36": "Ohio", 90 | "37": "Oklahoma", 91 | "38": "Oregon", 92 
| "39": "Pennsylvania", 93 | "40": "Puerto Rico", 94 | "41": "Rhode Island", 95 | "42": "South Carolina", 96 | "43": "South Dakota", 97 | "44": "Tennesee", 98 | "45": "Texas", 99 | "46": "Utah", 100 | "47": "Vermont", 101 | "48": "Virgin Islands", 102 | "49": "Virginia", 103 | "50": "Washington", 104 | "51": "West Virginia", 105 | "52": "Wisconsin", 106 | "53": "Wyoming", 107 | "54": "Africa", 108 | "55": "Asia", 109 | "56": "Canada", 110 | "57": "Central America & West Indies", 111 | "58": "Europe", 112 | "59": "Mexico", 113 | "60": "Oceania", 114 | "61": "Philippines", 115 | "62": "South America", 116 | "63": "U.S. Possessions", 117 | "97": "Saipan - MP", 118 | "98": "Guam", 119 | "99": "American Samoa" 120 | } 121 | }, 122 | "admsnday": { 123 | "name": "MEDPAR Admission Day Code", 124 | "values": { 125 | "1": "Sunday", 126 | "2": "Monday", 127 | "3": "Tuesday", 128 | "4": "Wednesday", 129 | "5": "Thursday", 130 | "6": "Friday", 131 | "7": "Saturday" 132 | } 133 | }, 134 | "dschrgcd": { 135 | "name": "MEDPAR Beneficiary Discharge Status Code", 136 | "values": { 137 | "A": "Discharged alive", 138 | "B": "Discharged dead", 139 | "C": "Still a patient" 140 | } 141 | }, 142 | "ghopdcd": { 143 | "name": "MEDPAR GHO Paid Code", 144 | "values": { 145 | "1": "GHO has paid the provider", 146 | "0": "GHO has not paid the provider", 147 | "": "GHO has not paid the provider" 148 | } 149 | }, 150 | "pps_ind": { 151 | "name": "MEDPAR PPS Indicator Code", 152 | "values": { 153 | "0": "Non PPS", 154 | "2": "PPS" 155 | } 156 | }, 157 | "prvdrnum": { 158 | "name": "MEDPAR Provider Number", 159 | "values": {} 160 | }, 161 | "spclunit": { 162 | "name": "MEDPAR Provider Number Special Unit Code", 163 | "values": { 164 | "M": "PPS-exempt psychiatric unit in CAH", 165 | "R": "PPS-exempt rehabilitation unit in CAH", 166 | "S": "PPS-exempt psychiatric unit", 167 | "T": "PPS-exempt rehabilitation unit", 168 | "U": "Swing-bed short-term/acute care hospital", 169 | "W": "Swing-bed long-term 
hospital", 170 | "Y": "Swing-bed rehabilitation hospital", 171 | "Z": "Swing-bed rural primary care hospital; eff 10/97 changed to critical access hospitals", 172 | "": "Not PPS-exempt or swing-bed designation" 173 | } 174 | }, 175 | "sslssnf": { 176 | "name": "MEDPAR Short Stay/Long Stay/SNF Indicator Code", 177 | "values": { 178 | "N": "SNF Stay", 179 | "S": "Short-Stay", 180 | "L": "Long-Stay" 181 | } 182 | }, 183 | "actv_xref_ind": { 184 | "name": "MEDPAR Active Cross Reference Indicator", 185 | "values": { 186 | "X": "Cross-Reference", 187 | "A": "Active" 188 | } 189 | }, 190 | "icuindcd": { 191 | "name": "MEDPAR Intensive Care Unit (ICU) Indicator Code", 192 | "values": { 193 | "0": "General", 194 | "1": "Surgical", 195 | "2": "Medical", 196 | "3": "Pediatric", 197 | "4": "Psychiatric", 198 | "6": "Intermediate IOU", 199 | "7": "Burn care", 200 | "8": "Trauma", 201 | "9": "Other intensive care" 202 | } 203 | }, 204 | "crnry_cd": { 205 | "name": "MEDPAR Coronary Care Indicator Code", 206 | "values": { 207 | "0": "General", 208 | "1": "Myocardial", 209 | "2": "Pulmonary care", 210 | "3": "Heart transplant", 211 | "4": "Intermediate CCU", 212 | "9": "Other Coronary Care", 213 | "": "No coronary care indication" 214 | } 215 | }, 216 | "phrmcycd": { 217 | "name": "MEDPAR Pharmacy Indicator Code", 218 | "values": { 219 | "0": "No drugs", 220 | "1": "General drugs and/pr IV therapy", 221 | "2": "Erythropoietin", 222 | "3": "Blood clotting drugs", 223 | "4": "General drugs and/or IV therapy; and epoetin", 224 | "5": "General drugs and/or IV therapy; and blood clotting drugs" 225 | } 226 | }, 227 | "trnsplnt": { 228 | "name": "MEDPAR Transplant Indicator Code", 229 | "values": { 230 | "0": "No organ or kidney transplant", 231 | "2": "Organ transplant other than kidney", 232 | "7": "Kidney transplant" 233 | } 234 | }, 235 | "onclgysw": { 236 | "name": "MEDPAR Radiology Oncology Indicator Switch", 237 | "values": { 238 | "0": "No radiology-oncology", 239 | "1": "Yes 
radiology-oncology" 240 | } 241 | }, 242 | "dgnstcsw": { 243 | "name": "MEDPAR Radiology Diagnostic Indicator Switch", 244 | "values": { 245 | "0": "No radiology-diagnostic", 246 | "1": "Yes radiology-diagnostic" 247 | } 248 | }, 249 | "thrptcsw": { 250 | "name": "MEDPAR Radiology Therapeutic Indicator Switch", 251 | "values": { 252 | "0": "No radiology-therapeutic", 253 | "1": "Yes radiology-therapeutic" 254 | } 255 | }, 256 | "nuclr_sw": { 257 | "name": "MEDPAR Radiology Nuclear Medicine Indicator Switch", 258 | "values": { 259 | "0": "No nuclear medicine", 260 | "1": "Yes nuclear medicine" 261 | } 262 | }, 263 | "ctscansw": { 264 | "name": "MEDPAR Radiology CT Scan Indicator Switch", 265 | "values": { 266 | "0": "No radiology CT scan", 267 | "1": "Yes radiology CT scan" 268 | } 269 | }, 270 | "imgng_sw": { 271 | "name": "MEDPAR Radiology Other Imaging Indicator Switch", 272 | "values": { 273 | "0": "No other imaging services", 274 | "1": "Yes other imaging services" 275 | } 276 | }, 277 | "opsrvccd": { 278 | "name": "MEDPAR Outpatient Services Indicator Code", 279 | "values": { 280 | "0": "No outpatient services/ambulatory surgical care", 281 | "1": "Outpatient services", 282 | "2": "Ambulatory surgical care", 283 | "3": "Outpatient services and ambulatory surgical care" 284 | } 285 | }, 286 | "orgncd": { 287 | "name": "MEDPAR Organ Acquisition Indicator Code", 288 | "values": { 289 | "K1": "General classification", 290 | "K2": "Living donor kidney", 291 | "K3": "Cadaver donor kidney", 292 | "K4": "Unknown donor kidney", 293 | "K5": "Other kidney acquisition", 294 | "H1": "Cadaver donor heart", 295 | "H2": "Other heart acquisition", 296 | "L1": "Donor liver", 297 | "01": "Other organ acquisition", 298 | "02": "General acquisition", 299 | "B1": "Bone donor bank", 300 | "03": "Organ donor bank other than kidney", 301 | "S1": "Skin donor bank", 302 | "04": "Other donor bank", 303 | "": "No organ acquisition indication" 304 | } 305 | }, 306 | "esrdstg{x}": { 307 | 
"name": "MEDPAR ESRD Setting Indicator Code", 308 | "values": { 309 | "00": "Ip renal dialysis-general", 310 | "01": "Ip renal dialysis-hemodialysis", 311 | "02": "Ip renal dialysis-peritoneal", 312 | "03": "Ip renal dialysis-capd", 313 | "04": "Ip renal dialysis-ccpd", 314 | "09": "Ip renal dialysis-other", 315 | "20": "Hemodialysis-op-general", 316 | "21": "Hemodialysis-op-hemodialysis/composite", 317 | "22": "Hemodialysis-op-home supplies", 318 | "23": "Hemodialysis-op-home equipment", 319 | "24": "Hemodialysis-op-maintenance/100%", 320 | "25": "Hemodialysis-op-support services", 321 | "29": "Hemodialysis-op-other", 322 | "30": "Peritoneal-op/home-general", 323 | "31": "Peritoneal-op/home-peritoneal/composite", 324 | "32": "Peritoneal-op/home-home supplies", 325 | "33": "Peritoneal-op/home-home equipment", 326 | "34": "Peritoneal-op/home-maintenance/100%", 327 | "35": "Peritoneal-op/home-support services", 328 | "39": "Peritoneal-op/home-other", 329 | "40": "Capd-op-capd/general", 330 | "41": "Capd-op-capd/composite", 331 | "42": "Capd-op-home supplies", 332 | "43": "Capd-op-home equipment", 333 | "44": "Capd-op-maintenance/100%", 334 | "45": "Capd-op-support services", 335 | "49": "Capd-op-other", 336 | "50": "Ccpd-op-ccpd/general", 337 | "51": "Ccpd-op-ccpd/composite", 338 | "52": "Ccpd-op-home supplies", 339 | "53": "Ccpd-op-home equipment", 340 | "54": "Ccpd-op-maintenance/100%", 341 | "55": "Ccpd-op-support services", 342 | "59": "Ccpd-op-other", 343 | "80": "Miscellaneous dialysis-general", 344 | "81": "Miscellaneous dialysis-ultrafiltration", 345 | "89": "Miscellaneous dialysis-other", 346 | "": "No ESRD setting indication" 347 | } 348 | }, 349 | "poa_dgns_e_{x}_ind_cd": { 350 | "name": "MEDPAR Diagnosis E Code Present on Admission Indicator", 351 | "values": { 352 | "Y": "Diagnosis was present at the time of inpatient admission. 
CMS will pay the CC/MCC DRG for those selected HACs that are coded as 'Y' for the POA Indicator.", 353 | "N": "Diagnosis was not present at the time of inpatient admission. CMS will not pay the CC/MCC DRG for those selected HACs that are coded as 'N' for the POA Indicator.", 354 | "U": "Documentation is insufficient to determine if the condition was present at the time of inpatient admission. CMS will not pay the CC/MCC DRG for those selected HACs that are coded as 'U' for the POA Indicator.", 355 | "W": "Clinically undetermined. Provider is unable to clinically determine whether condition was present at the time of inpatient admission. CMS will pay the CC/MCC DRG for those selected HACs that are coded as 'W' for the POA Indicator.", 356 | "1": "Unreported/not used -- exempt from POA reporting -- This code is equivalent to a blank on the UB-04, however, it was determined that blanks are undesirable when submitting this data via the 4010A. CMS will not pay the CC/MCC DRG for those selected HACs that are coded as '1' for the POA Indicator. The '1' POA Indicator should not be applied to any codes on the HAC list.", 357 | "Z": "Denotes the end of the POA indicators (terminated 1/2011).", 358 | "X": "Denotes the end of the POA indicators in special data processing situations that may be identified by CMS in the future (terminated 1/2011).", 359 | "": "Identifies diagnosis codes that are exempt from the POA reporting requirements (replaces the '1'). NOTE: NCH/NMUD will carry a '0' in place of a blank." 360 | } 361 | }, 362 | "poa_dgns_{x}_ind_cd": { 363 | "name": "MEDPAR Diagnosis Present on Admission Indicator Code", 364 | "values": { 365 | "Y": "Diagnosis was present at the time of inpatient admission. CMS will pay the CC/MCC DRG for those selected HACs that are coded as 'Y' for the POA Indicator.", 366 | "N": "Diagnosis was not present at the time of inpatient admission. 
CMS will not pay the CC/MCC DRG for those selected HACs that are coded as 'N' for the POA Indicator.", 367 | "U": "Documentation is insufficient to determine if the condition was present at the time of inpatient admission. CMS will not pay the CC/MCC DRG for those selected HACs that are coded as 'U' for the POA Indicator.", 368 | "W": "Clinically undetermined. Provider is unable to clinically determine whether condition was present at the time of inpatient admission. CMS will pay the CC/MCC DRG for those selected HACs that are coded as 'W' for the POA Indicator.", 369 | "1": "Unreported/not used -- exempt from POA reporting -- This code is equivalent to a blank on the UB-04, however, it was determined that blanks are undesirable when submitting this data via the 4010A. CMS will not pay the CC/MCC DRG for those selected HACs that are coded as '1' for the POA Indicator. The '1' POA Indicator should not be applied to any codes on the HAC list.", 370 | "Z": "Denotes the end of the POA indicators (terminated 1/2011).", 371 | "X": "Denotes the end of the POA indicators in special data processing situations that may be identified by CMS in the future (terminated 1/2011).", 372 | "": "Identifies diagnosis codes that are exempt from the POA reporting requirements (replaces the '1'). NOTE: NCH/NMUD will carry a '0' in place of a blank." 
373 | } 374 | }, 375 | "prcdrsw": { 376 | "name": "MEDPAR Surgical Procedure Indicator Switch", 377 | "values": { 378 | "0": "No surgery indicated", 379 | "1": "Yes surgery indicated" 380 | } 381 | }, 382 | "dstntncd": { 383 | "name": "MEDPAR Discharge Destination Code", 384 | "values": { 385 | "01": "Discharged to home/self care (routine charge).", 386 | "02": "Discharged/transferred to other short term general hospital for inpatient care.", 387 | "03": "Discharged/transferred to skilled nursing facility (SNF)", 388 | "04": "Discharged/transferred to intermediate care facility (ICF).", 389 | "05": "Discharged/transferred to another type of institution for inpatient care", 390 | "06": "Discharged/transferred to home care of organized home health service organization.", 391 | "07": "Left against medical advice or discontinued care.", 392 | "08": "Discharged/transferred to home under care of a home IV drug therapy provider.", 393 | "09": "Admitted as an inpatient to this hospital (effective 3/1/91). In situations where a patient is admitted before midnight of the third day following the day of an outpatient service, the outpatient services are considered inpatient.", 394 | "20": "Expired (did not recover - Christian Science patient).", 395 | "30": "Still patient.", 396 | "40": "Expired at home (hospice claims only)", 397 | "41": "Expired in a medical facility such as hospital, SNF, ICF, or freestanding hospice. 
(Hospice claims only)", 398 | "42": "Expired - place unknown (Hospice claims only)", 399 | "43": "Discharged/transferred to a federal hospital", 400 | "50": "Hospice - home", 401 | "51": "Hospice - medical facility", 402 | "61": "Discharged/transferred within this institution to a hospital-based swing bed", 403 | "62": "Discharged/transferred to an inpatient rehabilitation facility.", 404 | "63": "Discharged/transferred to a long term care hospitals.", 405 | "64": "Discharged/transferred to a nursing facility certified under Medicaid only", 406 | "65": "Discharged/Transferred to a psychiatric hospital or psychiatric distinct unit of a hospital.", 407 | "66": "Discharged/transferred to a Critical Access Hospital (CAH)", 408 | "70": "Other", 409 | "71": "Discharged/transferred/referred to another institution for outpatient services as specified by the discharge plan of care", 410 | "72": "Discharged/transferred/referred to this institution for outpatient services as specified by the discharge plan of care" 411 | } 412 | }, 413 | "outlr_cd": { 414 | "name": "MEDPAR DRG/Outlier Stay Code", 415 | "values": { 416 | "0": "No Outlier", 417 | "1": "Day Outlier", 418 | "2": "Cost Outlier", 419 | "6": "Valid DRG Received From Intermediary", 420 | "7": "HCFA-Developed DRG", 421 | "8": "HCFA-Developed DRG Using Claim Status Code", 422 | "9": "Not Groupable" 423 | } 424 | }, 425 | "prpay_cd": { 426 | "name": "MEDPAR Beneficiary Primary Payer Code", 427 | "values": { 428 | "A": "Working aged bene/spouse with eghp", 429 | "B": "ESRD bene in 18-month coordination period with eghp", 430 | "C": "Conditional Medicare payment; future reimbursement expected", 431 | "D": "Auto no-fault or any liability insurance", 432 | "E": "Worker's compensation", 433 | "F": "Phs or other federal agency (other than dept of veterans affairs)", 434 | "G": "Working disabled", 435 | "H": "Black lung", 436 | "I": "Dept of veterans affairs", 437 | "J": "Any liability insurance", 438 | "Z": "Medicare is 
primary payer", 439 | "": "Medicare is primary payer" 440 | } 441 | }, 442 | "src_adms": { 443 | "name": "MEDPAR Source Inpatient Admission Code", 444 | "values": { 445 | "0": "ANOMALY: invalid value, if present, translate to '9'", 446 | "1": "Non-Health Care Facility Point of Origin (Physician Referral) - The patient was admitted to this facility upon an order of a physician.", 447 | "2": "Clinical referral - The patient was admitted upon the recommendation of this facility's clinic physician.", 448 | "3": "HMO referral - Reserved for national assignment. (eff. 3/08) Prior to 3/08, HMO referral - The patient was admitted upon the recommendation of a health maintenance organization (HMO) physician.", 449 | "4": "Transfer from hospital (Different Facility) - The patient was admitted to this facility as a hospital transfer from an acute care facility where he or she was an inpatient.", 450 | "5": "Transfer from a skilled nursing facility (SNF) or Intermediate Care Facility (ICF) - The patient was admitted to this facility as a transfer from a SNF or ICF where he or she was a resident.", 451 | "6": "Transfer from another health care facility - The patient was admitted to this facility as a transfer from another type of health care facility not defined elsewhere in this code list where he or she was an inpatient.", 452 | "7": "Emergency room - The patient was admitted to this facility after receiving services in this facility's emergency room department. (Obsolete - eff. 7/1/10)", 453 | "8": "Court/law enforcement - The patient was admitted upon the direction of a court of law or upon the request of a law enforcement agency's representative. Includes transfers from incarceration facilities.", 454 | "9": "Information not available - The means by which the patient was admitted is not known.", 455 | "A": "Reserved for National Assignment. (eff. 
3/08) Prior to 3/08 defined as: Transfer from a Critical Access Hospital - patient was admitted/referred to this facility as a transfer from a Critical Access Hospital.", 456 | "B": "Transfer from Another Home Health Agency - The patient was admitted to this home health agency as a transfer from another home health agency. (Discontinued July 1, 2010 - See Condition Code 47)", 457 | "C": "Readmission to Same Home Health Agency - The patient was readmitted to this home health agency within the same home health episode period. (Discontinued July 1, 2010)", 458 | "D": "Transfer from hospital inpatient in the same facility resulting in a separate claim to the payer - The patient was admitted to this facility as a transfer from hospital inpatient within this facility resulting in a separate claim to the payer.", 459 | "E": "Transfer from Ambulatory Surgery Center - The patient was admitted to this facility as a transfer from an ambulatory surgery center. (eff. 10/1/2007)", 460 | "F": "Transfer from Hospice and is under a Hospice Plan of Care or Enrolled in a Hospice Program - The patient was admitted to this facility as a transfer from a hospice. (eff. 10/1/2007)" 461 | } 462 | }, 463 | "type_adm": { 464 | "name": "MEDPAR Inpatient Admission Type Code", 465 | "values": { 466 | "0": "Blank", 467 | "1": "Emergency - The patient required immediate medical intervention as a result of severe, life threatening, or potentially disabling conditions. Generally, the patient was admitted through the emergency room.", 468 | "2": "Urgent - The patient required immediate attention for the care and treatment of a physical or mental disorder. 
Generally, the patient was admitted to the first available and suitable accommodation.", 469 | "3": "Elective - The patient's condition permitted adequate time to schedule the availability of suitable accommodations.", 470 | "4": "Newborn - Necessitates the use of special source of admission codes.", 471 | "5": "Trauma Center - visits to a trauma center/hospital as licensed or designated by the State or local government authority authorized to do so, or as verified by the American College of Surgeons and involving a trauma activation.", 472 | "9": "Unknown - Information not available." 473 | } 474 | }, 475 | "wrngcd": { 476 | "name": "MEDPAR Warning Indicators Code", 477 | "values": {} 478 | } 479 | } -------------------------------------------------------------------------------- /medicare_utils/parquet.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | import re 3 | import math 4 | import json 5 | import inspect 6 | import pkg_resources 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from time import time 11 | from joblib import Parallel, delayed 12 | from typing import Any, Dict, List, Optional, Union 13 | from pathlib import Path 14 | from pkg_resources import resource_filename 15 | from pandas.api.types import CategoricalDtype 16 | 17 | from .utils import fpath, _mywrap 18 | pkg_resources.require("pandas>=0.21.0") 19 | 20 | 21 | def convert_med( 22 | pcts: Union[str, List[str]] = ['0001', '01', '05', '100'], 23 | years: Union[int, List[int]] = range(2001, 2013), 24 | data_types: Union[str, List[str]] = ['carc', 'opc', 'bsfab', 'med'], 25 | rg_size: float = 2.5, 26 | parquet_engine: str = 'pyarrow', 27 | compression_type: str = 'SNAPPY', 28 | manual_schema: bool = False, 29 | ehic_xw: bool = True, 30 | n_jobs: int = 6, 31 | med_dta: str = '/disk/aging/medicare/data', 32 | med_pq: 33 | str = '/disk/agebulk3/medicare.work/doyle-DUA51929/barronk-DUA51929/raw/pq' 34 | ) -> None: # yapf: 
disable 35 | """Convert Medicare Stata files to parquet 36 | 37 | Args: 38 | pcts: percent samples to convert 39 | years: file years to convert 40 | data_types: 41 | types of data files to convert 42 | 43 | - ``bsfab`` (`Beneficiary Summary File, Base segment`_) 44 | - ``bsfcc`` (`Beneficiary Summary File, Chronic Conditions segment`_) 45 | - ``bsfcu`` (`Beneficiary Summary File, Cost & Use segment`_) 46 | - ``bsfd`` (`Beneficiary Summary File, National Death Index segment`_) 47 | - ``carc`` (`Carrier File, Claims segment`_) 48 | - ``carl`` (`Carrier File, Line segment`_) 49 | - ``den`` (Denominator File) 50 | - ``dmec`` (`Durable Medical Equipment File, Claims segment`_) 51 | - ``dmel`` (`Durable Medical Equipment File, Line segment`_) 52 | - ``hhac`` (`Home Health Agency File, Claims segment`_) 53 | - ``hhar`` (`Home Health Agency File, Revenue Center segment`_) 54 | - ``hosc`` (`Hospice File, Claims segment`_) 55 | - ``hosr`` (`Hospice File, Revenue Center segment`_) 56 | - ``ipc`` (`Inpatient File, Claims segment`_) 57 | - ``ipr`` (`Inpatient File, Revenue Center segment`_) 58 | - ``med`` (`MedPAR File`_) 59 | - ``opc`` (`Outpatient File, Claims segment`_) 60 | - ``opr`` (`Outpatient File, Revenue Center segment`_) 61 | - ``snfc`` (`Skilled Nursing Facility File, Claims segment`_) 62 | - ``snfr`` (`Skilled Nursing Facility File, Revenue Center segment`_) 63 | - ``xw`` (Crosswalks files for ``ehic`` - ``bene_id``) 64 | 65 | .. _`Beneficiary Summary File, Base segment`: https://kylebarron.github.io/medicare-documentation/resdac/mbsf/#base-abcd-segment_2 66 | .. _`Beneficiary Summary File, Chronic Conditions segment`: https://kylebarron.github.io/medicare-documentation/resdac/mbsf/#chronic-conditions-segment_2 67 | .. _`Beneficiary Summary File, Cost & Use segment`: https://kylebarron.github.io/medicare-documentation/resdac/mbsf/#cost-and-use-segment_1 68 | .. 
_`Beneficiary Summary File, National Death Index segment`: https://kylebarron.github.io/medicare-documentation/resdac/mbsf/#national-death-index-segment_1 69 | .. _`Carrier File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/carrier-rif/#carrier-rif_1 70 | .. _`Carrier File, Line segment`: https://kylebarron.github.io/medicare-documentation/resdac/carrier-rif/#line-file 71 | .. _`Durable Medical Equipment File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/dme-rif/#durable-medical-equipment-rif_1 72 | .. _`Durable Medical Equipment File, Line segment`: https://kylebarron.github.io/medicare-documentation/resdac/dme-rif/#line-file 73 | .. _`Home Health Agency File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/hha-rif/#home-health-agency-rif_1 74 | .. _`Home Health Agency File, Revenue Center segment`: https://kylebarron.github.io/medicare-documentation/resdac/hha-rif/#revenue-center-file 75 | .. _`Hospice File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/hospice-rif/#hospice-rif_1 76 | .. _`Hospice File, Revenue Center segment`: https://kylebarron.github.io/medicare-documentation/resdac/hospice-rif/#revenue-center-file 77 | .. _`Inpatient File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/ip-rif/#inpatient-rif_1 78 | .. _`Inpatient File, Revenue Center segment`: https://kylebarron.github.io/medicare-documentation/resdac/ip-rif/#revenue-center-file 79 | .. _`MedPAR File`: https://kylebarron.github.io/medicare-documentation/resdac/medpar-rif/#medpar-rif_1 80 | .. _`Outpatient File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/op-rif/#outpatient-rif_1 81 | .. _`Outpatient File, Revenue Center segment`: https://kylebarron.github.io/medicare-documentation/resdac/op-rif/#revenue-center-file 82 | .. 
_`Skilled Nursing Facility File, Claims segment`: https://kylebarron.github.io/medicare-documentation/resdac/snf-rif/#skilled-nursing-facility-rif_1 83 | .. _`Skilled Nursing Facility File, Revenue Center segment`: https://kylebarron.github.io/medicare-documentation/resdac/snf-rif/#revenue-center-file 84 | 85 | rg_size: size in GB of each Parquet row group 86 | parquet_engine: either 'fastparquet' or 'pyarrow' 87 | compression_type: 'SNAPPY' or 'GZIP' 88 | manual_schema: whether to create manual parquet schema. Doesn't 89 | always work. 90 | ehic_xw: Merge bene_id onto old files with ehic 91 | n_jobs: number of processes to use 92 | med_dta: top of tree for medicare stata files 93 | med_pq: top of tree to output new parquet files 94 | """ 95 | 96 | if type(pcts) is str: 97 | pcts = [pcts] 98 | elif type(pcts) is list: 99 | pass 100 | else: 101 | raise TypeError('pcts must be string or list of strings') 102 | 103 | if type(years) is int: 104 | years = [years] 105 | elif type(years) is list: 106 | pass 107 | elif type(years) is range: 108 | pass 109 | else: 110 | raise TypeError('years must be int, range, or list of ints') 111 | 112 | if type(data_types) is str: 113 | data_types = [data_types] 114 | elif type(data_types) is list: 115 | pass 116 | else: 117 | raise TypeError('data_types must be string or list of strings') 118 | 119 | data_list = [[x, y, z] for x in pcts for y in years for z in data_types] 120 | 121 | # Drop 100% carrier: 122 | # data_list = [ 123 | # x for x in data_list if not (x[2] == 'carc') & (x[0] == '100')] 124 | 125 | # Or: 126 | # Replace 100% carrier with 20% carrier: 127 | data_list = [['20', x[1], x[2]] 128 | if ((x[2] == 'carc') & (x[0] == '100')) else x 129 | for x in data_list] 130 | 131 | # Make sure list is unique: 132 | data_list = sorted([list(x) for x in set(tuple(y) for y in data_list)]) 133 | 134 | Parallel(n_jobs=n_jobs)( 135 | delayed(_convert_med)( 136 | *i, 137 | rg_size=rg_size, 138 | parquet_engine=parquet_engine, 139 | 
compression_type=compression_type, 140 | manual_schema=manual_schema, 141 | ehic_xw=ehic_xw, 142 | med_dta=med_dta, 143 | med_pq=med_pq) for i in data_list) 144 | 145 | 146 | def _convert_med( 147 | pct: str, 148 | year: int, 149 | data_type: Union[str, List[str]], 150 | rg_size: float = 2.5, 151 | parquet_engine: str = 'pyarrow', 152 | compression_type: str = 'SNAPPY', 153 | manual_schema: bool = False, 154 | ehic_xw: bool = True, 155 | med_dta: str = '/disk/aging/medicare/data', 156 | med_pq: 157 | str = '/disk/agebulk3/medicare.work/doyle-DUA51929/barronk-DUA51929/raw/pq' 158 | ) -> None: # yapf: disable 159 | """Convert a single Medicare file to parquet format. 160 | 161 | Args: 162 | pct: percent sample to convert 163 | year: year of data to convert 164 | data_type: 165 | type of data files to convert 166 | 167 | - ``bsfab`` Beneficiary Summary File, Base segment 168 | - ``bsfcc`` Beneficiary Summary File, Chronic Conditions segment 169 | - ``bsfcu`` Beneficiary Summary File, Cost & Use segment 170 | - ``bsfd`` Beneficiary Summary File, National Death Index segment 171 | - ``carc`` Carrier File, Claims segment 172 | - ``carl`` Carrier File, Line segment 173 | - ``den`` Denominator File 174 | - ``dmec`` Durable Medical Equipment File, Claims segment 175 | - ``dmel`` Durable Medical Equipment File, Line segment 176 | - ``hhac`` Home Health Agency File, Claims segment 177 | - ``hhar`` Home Health Agency File, Revenue Center segment 178 | - ``hosc`` Hospice File, Claims segment 179 | - ``hosr`` Hospice File, Revenue Center segment 180 | - ``ipc`` Inpatient File, Claims segment 181 | - ``ipr`` Inpatient File, Revenue Center segment 182 | - ``med`` MedPAR File 183 | - ``opc`` Outpatient File, Claims segment 184 | - ``opr`` Outpatient File, Revenue Center segment 185 | - ``snfc`` Skilled Nursing Facility File, Claims segment 186 | - ``snfr`` Skilled Nursing Facility File, Revenue Center segment 187 | - ``xw`` Crosswalks files for ``ehic`` - ``bene_id`` 188 | rg_size: 
size in GB of each Parquet row group 189 | parquet_engine: either 'fastparquet' or 'pyarrow' 190 | compression_type: 'SNAPPY' or 'GZIP' 191 | manual_schema: whether to create manual parquet schema. Doesn't 192 | always work. 193 | med_dta: canonical path for raw medicare dta files 194 | med_pq: top of tree to output new parquet files 195 | ehic_xw: Merge bene_id onto old files with ehic 196 | Returns: 197 | nothing. Writes parquet file to disk. 198 | Raises: 199 | NameError if data_type is not one of the above 200 | """ 201 | 202 | if type(pct) != str: 203 | raise TypeError('pct must be str') 204 | if type(year) != int: 205 | raise TypeError('year must be int') 206 | 207 | infile = fpath(percent=pct, year=year, data_type=data_type, dta=True) 208 | outfile = fpath( 209 | percent=pct, year=year, data_type=data_type, dta=False, pq_path=med_pq) 210 | 211 | if not data_type.startswith('bsf'): 212 | # TODO Refactor this into separate function. 213 | path = resource_filename( 214 | 'medicare_utils', f'metadata/xw/{data_type}.json') 215 | try: 216 | with open(path) as f: 217 | varnames = json.load(f) 218 | except OSError: 219 | varnames = {} 220 | 221 | rename_dict = {} 222 | for varname, names in varnames.items(): 223 | n = {k: v for k, v in names.items() if k == str(year)} 224 | if n: 225 | rename_dict[n[str(year)]['name']] = varname 226 | 227 | if rename_dict: 228 | # Remove items from dict that map to duplicate values 229 | # Can't save a parquet file where multiple cols have same name 230 | rev_rename_dict = {} 231 | for key, value in rename_dict.items(): 232 | rev_rename_dict.setdefault(value, set()).add(key) 233 | dups = [key for key, val in rev_rename_dict.items() if len(val) > 1] 234 | 235 | for k, v in rename_dict.copy().items(): 236 | if v in dups: 237 | rename_dict.pop(k) 238 | else: 239 | print(f'Year not in variable dictionary: {year}') 240 | rename_dict = None 241 | else: 242 | rename_dict = None 243 | 244 | # Make folder path if it doesn't exist 245 | 
folder = Path(outfile).parents[0] 246 | folder.mkdir(exist_ok=True, parents=True) 247 | 248 | msg = f"""\ 249 | Starting {data_type} conversion 250 | - Percent: {pct} 251 | - Year {year} 252 | """ 253 | print(_mywrap(msg)) 254 | 255 | if ehic_xw and (year <= 2005) and not (data_type.startswith('bsf')): 256 | ehic_xw = fpath(pct, year, 'xw_bsf', pq_path=med_pq) 257 | if not Path(ehic_xw).is_file(): 258 | ehic_xw = fpath(pct, year, 'xw_bsf', dta=True, dta_path=med_dta) 259 | else: 260 | ehic_xw = None 261 | 262 | try: 263 | convert_file( 264 | infile=infile, 265 | outfile=outfile, 266 | rename_dict=rename_dict, 267 | rg_size=rg_size, 268 | parquet_engine=parquet_engine, 269 | compression_type=compression_type, 270 | manual_schema=manual_schema, 271 | ehic_xw=ehic_xw) 272 | except: 273 | pass 274 | 275 | 276 | def convert_file( 277 | infile: str, 278 | outfile: str, 279 | rename_dict: Dict[str, str] = None, 280 | rg_size: float = 2.5, 281 | parquet_engine: str = 'pyarrow', 282 | compression_type: str = 'SNAPPY', 283 | manual_schema: bool = False, 284 | ehic_xw: Optional[str] = None) -> None: 285 | """Convert arbitrary Stata file to Parquet format 286 | 287 | Args: 288 | infile: path of file to read from 289 | outfile: path of file to export to 290 | rename_dict: keys should be initial variable names; values should 291 | be new variable names 292 | rg_size: Size in GB of the individual row groups 293 | parquet_engine: either ``pyarrow`` or ``fastparquet`` 294 | compression_type: Compression algorithm to use. Can be ``SNAPPY`` or 295 | ``GZIP``. 296 | manual_schema: Create parquet schema manually. For use with 297 | pyarrow; doesn't always work 298 | ehic_xw: Merge bene_id onto old files with ehic 299 | Returns: 300 | Writes .parquet file to disk. 
301 | """ 302 | if parquet_engine == 'pyarrow': 303 | import pyarrow as pa 304 | import pyarrow.parquet as pq 305 | elif parquet_engine == 'fastparquet': 306 | import fastparquet as fp 307 | 308 | t0 = time() 309 | 310 | infile = Path(infile) 311 | # File name without suffix 312 | infile_stub = infile.stem 313 | # Extension 314 | infile_type = infile.suffix[1:] 315 | 316 | # Set row group size. The following makes an even multiple of row groups 317 | # as close as possible to the given `rg_size` 318 | file_size = infile.stat().st_size / (1024 ** 3) 319 | n_rg = round(file_size / rg_size) 320 | if n_rg == 0: 321 | n_rg += 1 322 | 323 | nrow_total = pd.read_stata(infile, iterator=True).nobs 324 | nrow_rg = math.ceil(nrow_total / n_rg) 325 | gb_per_rg = file_size / n_rg 326 | 327 | msg = f"""\ 328 | Row groups: 329 | - {n_rg} of size {gb_per_rg:.2f} GB 330 | Beginning scanning dtypes of file: 331 | - infile: {infile_stub}.{infile_type} 332 | - time: {(time() - t0) / 60:.2f} minutes 333 | """ 334 | print(_mywrap(msg)) 335 | 336 | if parquet_engine == 'pyarrow': 337 | dtypes = _scan_file(infile, categorical=False) 338 | elif parquet_engine == 'fastparquet': 339 | dtypes = _scan_file(infile, categorical=True) 340 | 341 | if rename_dict is not None: 342 | for old_name, new_name in rename_dict.items(): 343 | try: 344 | dtypes[new_name] = dtypes.pop(old_name) 345 | except KeyError: 346 | pass 347 | 348 | msg = f"""\ 349 | Finished scanning dtypes of file 350 | - infile: {infile_stub}.{infile_type} 351 | - time: {(time() - t0) / 60:.2f} minutes 352 | """ 353 | print(_mywrap(msg)) 354 | 355 | if ehic_xw: 356 | ehic_xw = Path(ehic_xw) 357 | if ehic_xw.suffix == '.parquet': 358 | xw = pd.read_parquet(ehic_xw, columns=['ehic', 'bene_id']) 359 | elif ehic_xw.suffix == '.dta': 360 | xw = pd.read_stata(ehic_xw, columns=['ehic', 'bene_id']) 361 | xw = xw.set_index('ehic') 362 | 363 | itr = pd.read_stata(infile, chunksize=nrow_rg) 364 | i = 0 365 | for df in itr: 366 | i += 1 367 | 
msg = f"""\ 368 | Read from file: 369 | - Group {i} 370 | - infile: {infile_stub}.{infile_type} 371 | - time: {(time() - t0) / 60:.2f} minutes 372 | """ 373 | print(_mywrap(msg)) 374 | 375 | if rename_dict is not None: 376 | df = df.rename(columns=rename_dict) 377 | 378 | # Rename columns that aren't in XW with `x_` prefix 379 | non_xw_cols = set(df.columns).difference(rename_dict.values()) 380 | df = df.rename(columns={x: 'x_' + x for x in non_xw_cols}) 381 | for col in non_xw_cols: 382 | try: 383 | dtypes['x_' + col] = dtypes.pop(col) 384 | except KeyError: 385 | pass 386 | 387 | df = df.astype(dtypes) 388 | 389 | if ehic_xw: 390 | df = df.merge(xw, how='left', left_on='ehic', right_index=True) 391 | 392 | msg = f"""\ 393 | Cleaned file: 394 | - Group {i} 395 | - infile: {infile_stub}.{infile_type} 396 | - time: {(time() - t0) / 60:.2f} minutes 397 | """ 398 | print(_mywrap(msg)) 399 | 400 | if parquet_engine == 'pyarrow': 401 | if i == 1: 402 | if manual_schema: 403 | schema = _create_parquet_schema(df.dtypes) 404 | else: 405 | schema = pa.Table.from_pandas( 406 | df, preserve_index=False).schema 407 | writer = pq.ParquetWriter(outfile, schema, flavor='spark') 408 | 409 | writer.write_table(pa.Table.from_pandas(df, preserve_index=False)) 410 | elif parquet_engine == 'fastparquet': 411 | if i == 1: 412 | fp.write( 413 | outfile, 414 | df, 415 | compression=compression_type, 416 | has_nulls=False, 417 | write_index=False, 418 | object_encoding='utf8') 419 | else: 420 | fp.write( 421 | outfile, 422 | df, 423 | compression=compression_type, 424 | has_nulls=False, 425 | write_index=False, 426 | object_encoding='utf8', 427 | append=True) 428 | 429 | msg = f"""\ 430 | Wrote to parquet: 431 | - Group {i} 432 | - infile: {infile_stub}.{infile_type} 433 | - time: {(time() - t0) / 60:.2f} minutes 434 | """ 435 | print(_mywrap(msg)) 436 | 437 | if parquet_engine == 'pyarrow': 438 | writer.close() 439 | 440 | print('Wrote to .parquet:\n\tAll groups') 441 | 442 | 443 | def 
_convert_dates(df, datecols): 444 | for col in datecols: 445 | if not pd.core.dtypes.common.is_datetimelike(df.iloc[:, col]): 446 | if df[col].dtype == np.number: 447 | df.iloc[:, col] = pd.to_datetime( 448 | df.iloc[:, col], 449 | unit='D', 450 | origin=pd.Timestamp('1960-01-01'), 451 | errors='coerce') 452 | elif df[col].dtype == 'object': 453 | df.loc[:, 'from_dt'] = pd.to_datetime( 454 | df.loc[:, 'from_dt'], format='%Y-%m-%d', errors='coerce') 455 | return df 456 | 457 | 458 | def _scan_file( 459 | infile: Union[str, Path], 460 | categorical: bool = True, 461 | chunksize: int = 100000, 462 | cat_threshold: float = 0.1, 463 | unsigned: bool = False) -> Dict[str, Any]: 464 | """Scan dta file to find minimal dtypes to hold data in 465 | 466 | For each of the chunks of df: 467 | for string columns: hold all unique values if I want them categorical 468 | for float columns: do nothing 469 | for integer columns: search for missings, highest and lowest value 470 | for date columns: nothing 471 | 472 | Args: 473 | infile: dta file to scan 474 | categorical: whether to change strings to categorical 475 | chunksize: number of rows of infile to read at a time 476 | cat_threshold: maximum fraction of unique values in order 477 | to convert to categorical 478 | 479 | Returns: 480 | dictionary with variable names and dtyplist 481 | """ 482 | itr = pd.read_stata(infile, iterator=True) 483 | varlist_df = pd.DataFrame({ 484 | 'format': itr.fmtlist, 485 | 'name': itr.varlist, 486 | 'col_size': itr.col_sizes, 487 | 'dtype': itr.dtyplist, 488 | 'label': list(itr.variable_labels().values())}) 489 | 490 | start_cols = {} 491 | 492 | date_fmts = ('%tc', '%tC', '%td', '%d', '%tw', '%tm', '%tq', '%th', '%ty') 493 | date_cols = varlist_df['format'].apply(lambda x: x.startswith(date_fmts)) 494 | date_cols = varlist_df[date_cols]['name'].values.tolist() 495 | start_cols['date_cols'] = date_cols 496 | 497 | int_cols = varlist_df['dtype'].apply( 498 | lambda x: np.issubdtype(x, np.integer) 
if inspect.isclass(x) else False) 499 | int_cols = varlist_df[int_cols]['name'].values.tolist() 500 | int_cols = sorted(list(set(int_cols) - set(date_cols))) 501 | start_cols['int_cols'] = int_cols 502 | 503 | regex = r'%.+s' 504 | str_cols = varlist_df['format'].apply(lambda x: bool(re.search(regex, x))) 505 | str_cols = varlist_df[str_cols]['name'].values.tolist() 506 | start_cols['str_cols'] = str_cols 507 | 508 | float_cols = varlist_df['dtype'].apply( 509 | lambda x: np.issubdtype(x, np.floating) if inspect.isclass(x) else False 510 | ) 511 | float_cols = varlist_df[float_cols]['name'].values.tolist() 512 | start_cols['float_cols'] = float_cols 513 | 514 | end_cols = { 515 | 'date_cols': start_cols['date_cols'], 516 | 'int_cols': { 517 | 'names': start_cols['int_cols'], 518 | 'min': {key: None 519 | for key in start_cols['int_cols']}, 520 | 'max': {key: None 521 | for key in start_cols['int_cols']}}, 522 | 'float_cols': start_cols['float_cols']} 523 | if categorical: 524 | end_cols['cat_cols'] = { 525 | 'names': start_cols['str_cols'], 526 | 'cats': {key: set() 527 | for key in start_cols['str_cols']}} 528 | end_cols['str_cols'] = [] 529 | else: 530 | end_cols['cat_cols'] = {} 531 | end_cols['str_cols'] = start_cols['str_cols'] 532 | 533 | tokeep = [] 534 | tokeep.extend(start_cols['int_cols']) 535 | if categorical: 536 | tokeep.extend(start_cols['str_cols']) 537 | itr = pd.read_stata(infile, columns=tokeep, chunksize=chunksize) 538 | 539 | i = 0 540 | for df in itr: 541 | i += 1 542 | print(f'Scanning group {i} of data') 543 | # Integer vars: 544 | int_cols = end_cols['int_cols']['names'].copy() 545 | for col in int_cols: 546 | # Check missings 547 | if df.loc[:, col].isnull().values.any(): 548 | # If missings, convert to float 549 | end_cols['float_cols'].append(col) 550 | end_cols['int_cols']['names'].remove(col) 551 | end_cols['int_cols']['max'].pop(col) 552 | end_cols['int_cols']['min'].pop(col) 553 | else: 554 | # Check minimum 555 | minval = 
min(df.loc[:, col]) 556 | if end_cols['int_cols']['min'][col] is None: 557 | end_cols['int_cols']['min'][col] = minval 558 | elif minval < end_cols['int_cols']['min'][col]: 559 | end_cols['int_cols']['min'][col] = minval 560 | 561 | # Check maximum 562 | maxval = max(df.loc[:, col]) 563 | if end_cols['int_cols']['max'][col] is None: 564 | end_cols['int_cols']['max'][col] = maxval 565 | elif maxval > end_cols['int_cols']['max'][col]: 566 | end_cols['int_cols']['max'][col] = maxval 567 | 568 | if categorical: 569 | # Scan str vars for categories 570 | cat_cols = end_cols['cat_cols']['names'].copy() 571 | for col in cat_cols: 572 | num_unique_values = len(df[col].unique()) 573 | num_total_values = len(df[col]) 574 | 575 | if num_unique_values / num_total_values < cat_threshold: 576 | # Then stays as category 577 | # Add category values 578 | unique_vals = df[col].unique().tolist() 579 | end_cols['cat_cols']['cats'][col].update(unique_vals) 580 | else: 581 | print(f'{col} is now a string') 582 | # Becomes regular string column 583 | end_cols['str_cols'].append(col) 584 | end_cols['cat_cols']['cats'].pop(col) 585 | end_cols['cat_cols']['names'].remove(col) 586 | 587 | # Not currently scanning date or float vars 588 | 589 | dtypes_dict = {} 590 | 591 | # Int dtypes: 592 | for col in end_cols['int_cols']['names']: 593 | if unsigned and (end_cols['int_cols']['min'][col] >= 0): 594 | if end_cols['int_cols']['max'][col] <= np.iinfo(np.uint8).max: 595 | dtypes_dict[col] = np.uint8 596 | elif end_cols['int_cols']['max'][col] <= np.iinfo(np.uint16).max: 597 | dtypes_dict[col] = np.uint16 598 | elif end_cols['int_cols']['max'][col] <= np.iinfo(np.uint32).max: 599 | dtypes_dict[col] = np.uint32 600 | elif end_cols['int_cols']['max'][col] <= np.iinfo(np.uint64).max: 601 | dtypes_dict[col] = np.uint64 602 | else: 603 | if False: 604 | pass 605 | elif ((end_cols['int_cols']['max'][col] <= np.iinfo(np.int8).max) & 606 | (end_cols['int_cols']['min'][col] >= np.iinfo(np.int8).min)): 
607 | dtypes_dict[col] = np.int8 608 | elif ((end_cols['int_cols']['max'][col] <= np.iinfo(np.int16).max) & 609 | (end_cols['int_cols']['min'][col] >= np.iinfo(np.int16).min)): 610 | dtypes_dict[col] = np.int16 611 | elif ((end_cols['int_cols']['max'][col] <= np.iinfo(np.int32).max) & 612 | (end_cols['int_cols']['min'][col] >= np.iinfo(np.int32).min)): 613 | dtypes_dict[col] = np.int32 614 | elif ((end_cols['int_cols']['max'][col] <= np.iinfo(np.int64).max) & 615 | (end_cols['int_cols']['min'][col] >= np.iinfo(np.int64).min)): 616 | dtypes_dict[col] = np.int64 617 | 618 | for col in end_cols['float_cols']: 619 | dtypes_dict[col] = np.float64 620 | 621 | if categorical: 622 | for col in end_cols['cat_cols']['names']: 623 | dtypes_dict[col] = CategoricalDtype( 624 | end_cols['cat_cols']['cats'][col]) 625 | 626 | return dtypes_dict 627 | 628 | 629 | def _create_parquet_schema(dtypes): 630 | """Create parquet schema from Pandas dtypes 631 | 632 | Args: 633 | dtypes: A dict or Series of dtypes 634 | Returns: 635 | pyarrow.Schema 636 | """ 637 | import pyarrow as pa 638 | 639 | dtypes = dict(dtypes) 640 | fields = [] 641 | for varname, vartype in dtypes.items(): 642 | if vartype == np.float16: 643 | fields.append(pa.field(varname, pa.float16())) 644 | elif vartype == np.float32: 645 | fields.append(pa.field(varname, pa.float32())) 646 | elif vartype == np.float64: 647 | fields.append(pa.field(varname, pa.float64())) 648 | elif vartype == np.int8: 649 | fields.append(pa.field(varname, pa.int8())) 650 | elif vartype == np.int16: 651 | fields.append(pa.field(varname, pa.int16())) 652 | elif vartype == np.int32: 653 | fields.append(pa.field(varname, pa.int32())) 654 | elif vartype == np.int64: 655 | fields.append(pa.field(varname, pa.int64())) 656 | elif vartype == np.uint8: 657 | fields.append(pa.field(varname, pa.uint8())) 658 | elif vartype == np.uint16: 659 | fields.append(pa.field(varname, pa.uint16())) 660 | elif vartype == np.uint32: 661 | 
fields.append(pa.field(varname, pa.uint32())) 662 | elif vartype == np.uint64: 663 | fields.append(pa.field(varname, pa.uint64())) 664 | elif vartype == np.bool_: 665 | fields.append(pa.field(varname, pa.bool_())) 666 | elif (vartype == object) | (vartype.name == 'category'): 667 | fields.append(pa.field(varname, pa.string())) 668 | elif np.issubdtype(vartype, np.datetime64): 669 | fields.append(pa.field(varname, pa.timestamp('ns'))) 670 | 671 | assert len(dtypes) == len(fields) 672 | schema = pa.schema(fields) 673 | return schema 674 | 675 | 676 | if __name__ == '__main__': 677 | convert_med() 678 | -------------------------------------------------------------------------------- /tests/test_medicare_df.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pytest 3 | import pandas as pd 4 | import medicare_utils as med 5 | 6 | 7 | class TestInit(object): 8 | # All the non-default arguments 9 | @pytest.fixture 10 | def init(self): 11 | return {'percent': '01', 'years': 2012} 12 | 13 | @pytest.mark.parametrize( 14 | 'pct,pct_act', 15 | [('0001', '0001'), 16 | ('01', '01'), 17 | ('05', '05'), 18 | ('20', '20'), 19 | ('100', '100'), 20 | (0.01, '0001'), 21 | (1, '01'), 22 | (5, '05'), 23 | (20, '20'), 24 | (100, '100')]) # yapf: disable 25 | def test_percents(self, init, pct, pct_act): 26 | init['percent'] = pct 27 | mdf = med.MedicareDF(**init) 28 | assert mdf.percent == pct_act 29 | 30 | @pytest.mark.parametrize('pct', ['02', '45', 2, 56]) 31 | def test_invalid_percents(self, init, pct): 32 | init['percent'] = pct 33 | with pytest.raises(ValueError): 34 | med.MedicareDF(**init) 35 | 36 | @pytest.mark.parametrize( 37 | 'years,years_act', 38 | [(2005, [2005]), 39 | (range(2010, 2013), range(2010, 2013)), 40 | ([2010, 2011, 2012], [2010, 2011, 2012])]) # yapf: disable 41 | def test_years(self, init, years, years_act): 42 | init['years'] = years 43 | mdf = med.MedicareDF(**init) 44 | assert mdf.years == years_act 45 
    @pytest.mark.parametrize('years', ['2012', 2012.0])
    def test_invalid_years(self, init, years):
        init['years'] = years
        with pytest.raises(TypeError):
            med.MedicareDF(**init)

    @pytest.mark.parametrize('year_type', ['calendar', 'age'])
    def test_year_type(self, year_type):
        mdf = med.MedicareDF('01', [2011, 2012], year_type=year_type)
        # NOTE(review): this comparison is missing an `assert`, so the
        # check is a no-op and the test always passes.
        mdf.year_type == year_type

    @pytest.mark.parametrize(
        'years', [2012, [2012], range(2012, 2013), [2010, 2012]])
    def test_invalid_age_years(self, init, years):
        init['year_type'] = 'age'
        init['years'] = years
        with pytest.raises(ValueError):
            med.MedicareDF(**init)


class TestGetCohortTypeCheck(object):
    """Argument validation tests for MedicareDF._get_cohort_type_check."""

    # All keyword arguments with their default values
    @pytest.fixture
    def init(self):
        return {
            'gender': None,
            'ages': None,
            'races': None,
            'rti_race': False,
            'buyin_val': None,
            'hmo_val': None,
            'join': 'left',
            'keep_vars': None,
            'dask': False,
            'verbose': True}

    @pytest.fixture
    def mdf(self):
        return med.MedicareDF('01', 2012)

    @pytest.mark.parametrize(
        'gender,expected',
        [(None, None),
         ('unknown', '0'),
         ('male', '1'),
         ('female', '2'),
         ('u', '0'),
         ('m', '1'),
         ('f', '2'),
         ('UNKNOWN', '0'),
         ('MALE', '1'),
         ('FEMALE', '2'),
         ('U', '0'),
         ('M', '1'),
         ('F', '2'),
         ('0', '0'),
         ('1', '1'),
         ('2', '2')])  # yapf: disable
    def test_gender(self, mdf, init, gender, expected):
        init['gender'] = gender
        result = mdf._get_cohort_type_check(**init)
        assert result.gender == expected

    @pytest.mark.parametrize(
        'gender,error', [
            (['string_in_list'], TypeError),
            ([1], TypeError),
            (1, TypeError),
            (2, TypeError),
            (0.1, TypeError),
            ('ma', ValueError),
            ('mal', ValueError),
            ('fem', ValueError),
            ('femal', ValueError),
            ('3', ValueError),
            ('-1', ValueError),
            ('unkn', ValueError), ])
    def test_gender_type_error(self, mdf, init, gender, error):
        init['gender'] = gender
        with pytest.raises(error):
            mdf._get_cohort_type_check(**init)

    @pytest.mark.parametrize('ages', ['65', 65.5, ['65'], [65, '66'], True])
    def test_ages_type_error(self, mdf, init, ages):
        init['ages'] = ages
        with pytest.raises(TypeError):
            mdf._get_cohort_type_check(**init)

    @pytest.mark.parametrize('rti_race', ['1', '0', 0, 1])
    def test_rti_race(self, mdf, init, rti_race):
        init['rti_race'] = rti_race
        with pytest.raises(TypeError):
            mdf._get_cohort_type_check(**init)

    @pytest.mark.parametrize(
        'races,expected',
        [(None, None),
         ('unknown', ['0']),
         ('white', ['1']),
         ('black (or african-american)', ['2']),
         ('black', ['2']),
         ('african-american', ['2']),
         ('other', ['3']),
         ('asian pacific islander', ['4']),
         ('asian', ['4']),
         ('hispanic', ['5']),
         ('american indian alaska native', ['6']),
         ('american indian', ['6']),
         ('UNKNOWN', ['0']),
         ('WHITE', ['1']),
         ('BLACK (OR AFRICAN-AMERICAN)', ['2']),
         ('BLACK', ['2']),
         ('AFRICAN-AMERICAN', ['2']),
         ('OTHER', ['3']),
         ('ASIAN PACIFIC ISLANDER', ['4']),
         ('ASIAN', ['4']),
         ('HISPANIC', ['5']),
         ('AMERICAN INDIAN ALASKA NATIVE', ['6']),
         ('AMERICAN INDIAN', ['6']),
         (['white', 'black'], ['1', '2']),
         (['white', 'black', 'asian'], ['1', '2', '4']),
         (['white', 'asian'], ['1', '4']),
         (['black', 'asian'], ['2', '4']),
         (['0', '1', '2'], ['0', '1', '2']),
         ('0', ['0']),
         ('1', ['1']),
         ('2', ['2']),
         ('3', ['3']),
         ('4', ['4']),
         ('5', ['5']),
         ('6', ['6'])])  # yapf: disable
    def test_races_rti_true(self, mdf, init, races, expected):
        init['rti_race'] = True
        init['races'] = races
        result = mdf._get_cohort_type_check(**init)
        assert result.races == expected

    @pytest.mark.parametrize(
        'races,expected',
        [(None, None),
         ('unknown', ['0']),
         ('white', ['1']),
         ('black', ['2']),
         ('other', ['3']),
         ('asian', ['4']),
         ('hispanic', ['5']),
         ('north american native', ['6']),
         ('UNKNOWN', ['0']),
         ('WHITE', ['1']),
         ('BLACK', ['2']),
         ('OTHER', ['3']),
         ('ASIAN', ['4']),
         ('HISPANIC', ['5']),
         ('NORTH AMERICAN NATIVE', ['6']),
         (['white', 'black'], ['1', '2']),
         (['white', 'black', 'asian'], ['1', '2', '4']),
         (['white', 'asian'], ['1', '4']),
         (['black', 'asian'], ['2', '4']),
         (['0', '1', '2'], ['0', '1', '2']),
         ('0', ['0']),
         ('1', ['1']),
         ('2', ['2']),
         ('3', ['3']),
         ('4', ['4']),
         ('5', ['5']),
         ('6', ['6'])
         ])  # yapf: disable
    def test_races_rti_false(self, mdf, init, races, expected):
        init['rti_race'] = False
        init['races'] = races
        result = mdf._get_cohort_type_check(**init)
        assert result.races == expected

    @pytest.mark.parametrize(
        'buyin_val,expected', [('3', ['3']), (['3'], ['3'])])
    def test_buyin_val(self, mdf, init, buyin_val, expected):
        init['buyin_val'] = buyin_val
        result = mdf._get_cohort_type_check(**init)
        assert result.buyin_val == expected

    @pytest.mark.parametrize('hmo_val,expected', [('3', ['3']), (['3'], ['3'])])
    def test_hmo_val(self, mdf, init, hmo_val, expected):
        init['hmo_val'] = hmo_val
        result = mdf._get_cohort_type_check(**init)
        assert result.hmo_val == expected

    @pytest.mark.parametrize(
        'keep_vars,expected',
        [
            ('3', ['3']),
            (['3'], ['3']),
            (['3', re.compile('a')], ['3', re.compile('a')]),
            (re.compile('a'), [re.compile('a')]),
        ])  # yapf: disable
    def test_keep_vars(self, mdf, init, keep_vars, expected):
        init['keep_vars'] = keep_vars
        result = mdf._get_cohort_type_check(**init)
        assert result.keep_vars == expected

    @pytest.mark.parametrize('join,expected',
                             [('left', 'left'),
                              ('right', 'right'),
                              ('inner', 'inner'),
                              ('outer', 'outer')])  # yapf: disable
    def test_allowed_join(self, mdf, init, join, expected):
        init['join'] = join
        result = mdf._get_cohort_type_check(**init)
        assert result.join == expected

    def test_allowed_join_value_error(self, mdf, init):
        init['join'] = 'invalid_string'
        with pytest.raises(ValueError):
            mdf._get_cohort_type_check(**init)

    def test_dask_type_error(self, mdf, init):
        init['dask'] = 'string'
        with pytest.raises(TypeError):
            mdf._get_cohort_type_check(**init)

    def test_verbose_type_error(self, mdf, init):
        init['verbose'] = 'string'
        with pytest.raises(TypeError):
            mdf._get_cohort_type_check(**init)


class TestGetCohortMonthFilter(object):
    """Tests for MedicareDF._get_cohort_month_filter with age year_type."""

    @pytest.fixture
    def mdf(self):
        return med.MedicareDF('01', [2010, 2011, 2012], year_type='age')

    # Synthetic monthly enrollment data: one row per dob_month, one
    # column per calendar month
    @pytest.fixture
    def pl(self):
        # yapf: disable
        data = [
            [1, '2','2','1','1','2','2','1','2','1','2','2','2'],
            [2, '2','2','2','1','2','2','1','1','2','2','2','1'],
            [3, '2','2','2','1','2','2','1','1','1','2','2','2'],
            [4, '1','2','1','1','1','1','2','2','2','1','2','2'],
            [5, '2','2','2','1','1','2','2','1','2','1','2','1'],
            [6, '2','1','1','1','2','1','1','1','1','2','2','2'],
            [7, '2','2','1','1','2','2','1','2','1','2','2','2'],
            [8, '2','2','2','1','2','2','1','1','2','2','2','2'],
            [9, '2','2','1','1','2','2','1','1','2','2','2','2'],
            [10, '1','2','1','1','1','1','2','2','2','2','2','2'],
            [11, '2','2','2','1','1','2','2','1','2','2','2','2'],
            [12, '2','1','1','1','2','1','1','1','1','2','2','2']]
        # yapf: enable
        cols = [
            'dob_month', 'var01', 'var02', 'var03', 'var04', 'var05', 'var06',
            'var07', 'var08', 'var09', 'var10', 'var11', 'var12']
        return pd.DataFrame.from_records(data, columns=cols)

    # Expected result shared by the three tests below
    @pytest.fixture
    def exp(self):
        return pd.DataFrame({
            'dob_month': [1, 2, 3, 9, 10, 11, 12],
            'var_younger': [True, True, True, False, False, False, False],
            'var_older': [False, False, False, True, True, True, True]},
            index=[0, 1, 2, 8, 9, 10, 11])

    def test_month_filter_mid(self, mdf, pl, exp):
        df = mdf._get_cohort_month_filter(
            pl=pl, var='var', values=['2'], year=2011, keep_vars=[])
        assert df.equals(exp)

    def test_month_filter_first(self, mdf, pl, exp):
        df = mdf._get_cohort_month_filter(
            pl=pl, var='var', values=['2'], year=2010, keep_vars=[])
        exp = exp.loc[exp['var_older'], ['dob_month', 'var_older']]
        assert df.equals(exp)

    def test_month_filter_last(self, mdf, pl, exp):
        df = mdf._get_cohort_month_filter(
            pl=pl, var='var', values=['2'], year=2012, keep_vars=[])
        exp = exp.loc[exp['var_younger'], ['dob_month', 'var_younger']]
        assert df.equals(exp)


class TestStrInKeepVars(object):
    """Tests for MedicareDF._str_in_keep_vars string/regex matching."""

    @pytest.fixture
    def mdf(self):
        return med.MedicareDF('01', 2012)

    @pytest.mark.parametrize(
        'instr,keep_vars,res',
        [('a', ['a', 'b', 'c'], True),
         ('d', ['a', 'b', 'c'], False),
         ('a', ['a', re.compile(r'b')], True),
         ('d', ['a', re.compile(r'b')], False),
         ('a', [re.compile(r'a')], True),
         ('a', [re.compile(r'b')], False)])  # yapf: disable
    def test_str_in_keep_vars(self, mdf, instr, keep_vars, res):
        assert res == mdf._str_in_keep_vars(instr, keep_vars)


class TestGetPattern(object):
    """Tests for MedicareDF._get_pattern normalization of str/regex."""

    @pytest.fixture
    def mdf(self):
        return med.MedicareDF('01', 2012)

    def test_get_pattern_str(self, mdf):
        assert mdf._get_pattern('string') == 'string'

    def test_get_pattern_regex(self, mdf):
        regex = re.compile('regex_match')
        assert mdf._get_pattern(regex) == 'regex_match'

    @pytest.mark.parametrize(
        'obj', [True, 1, 1.0, ['string'], [re.compile('regex')]])
    def test_get_pattern_invalid_type(self, mdf, obj):
        with pytest.raises(TypeError):
            mdf._get_pattern(obj)


class TestCreateRenameDict(object):
    """Tests for MedicareDF._create_rename_dict."""

    @pytest.fixture
    def mdf(self):
        return med.MedicareDF('01', 2012)

    @pytest.mark.parametrize('hcpcs,icd9_dx,icd9_sg,rename,expected', [
        (None, None, None, {}, {}),
        ('a', 'b', 'c',
         {'hcpcs': '1', 'icd9_dx': '2', 'icd9_sg': '3'},
         {'a': '1', 'b': '2', 'c': '3'}),
        ('a', 'b', 'c',
         {'hcpcs': ['1'], 'icd9_dx': ['2'], 'icd9_sg': ['3']},
         {'a': '1', 'b': '2', 'c': '3'}),
        (['a'], ['b'], ['c'],
         {'hcpcs': ['1'], 'icd9_dx': ['2'], 'icd9_sg': ['3']},
         {'a': '1', 'b': '2', 'c': '3'}),
        (['a'], ['b'], ['c'],
         {'hcpcs': {'a': '1'}, 'icd9_dx': {'b': '2'}, 'icd9_sg': {'c': '3'}},
         {'a': '1', 'b': '2', 'c': '3'}),
        (['a', 'd'], ['b', 'e'], ['c', 'f'],
         {'hcpcs': ['1', '4'], 'icd9_dx': ['2', '5'], 'icd9_sg': ['3', '6']},
         {'a': '1', 'b': '2', 'c': '3', 'd': '4', 'e': '5', 'f': '6'}),
        (['a', 'b'], ['c', 'd'], ['e', 'f'],
         {'hcpcs': {'a': '1'}, 'icd9_dx': {'c': '2'}, 'icd9_sg': {'e': '3'}},
         {'a': '1', 'c': '2', 'e': '3'}),
        ])  # yapf: disable
    def test_rename_dict_noerror(
            self, mdf, hcpcs, icd9_dx, icd9_sg, rename, expected):
        codes = {'hcpcs': hcpcs, 'icd9_dx': icd9_dx, 'icd9_sg': icd9_sg}
        result = mdf._create_rename_dict(codes=codes, rename=rename)
        assert result == expected

    @pytest.mark.parametrize('hcpcs,icd9_dx,icd9_sg,rename', [
        (None, None, None,
         {'hcpcs': ['1', '2'],
          'icd9_dx': ['2', '3'],
          'icd9_sg': ['3', '4']}),
        ('a',
'b', 'c', 397 | {'hcpcs': ['1', '2'], 398 | 'icd9_dx': ['2', '3'], 399 | 'icd9_sg': ['3', '4']}), 400 | ('a', 'b', 'c', {'hcpcs': [], 'icd9_dx': [], 'icd9_sg': []}), 401 | (['a', 'b'], ['c', 'd'], ['e', 'f'], 402 | {'hcpcs': [], 'icd9_dx': [], 'icd9_sg': []}), 403 | (['a', 'b'], ['c', 'd'], ['e', 'f'], 404 | {'hcpcs': '1', 'icd9_dx': '2', 'icd9_sg': '3'}), 405 | ]) # yapf: disable 406 | def test_rename_dict_wrong_list_len( 407 | self, mdf, hcpcs, icd9_dx, icd9_sg, rename): 408 | codes = {'hcpcs': hcpcs, 'icd9_dx': icd9_dx, 'icd9_sg': icd9_sg} 409 | with pytest.raises(AssertionError): 410 | mdf._create_rename_dict(codes=codes, rename=rename) 411 | 412 | @pytest.mark.parametrize('hcpcs,icd9_dx,icd9_sg,rename', [ 413 | (None, None, None, 414 | {'hcpcs': '1', 'icd9_dx': '2', 'icd9_sg': '3'}), 415 | ('a', 'b', 'c', 416 | {'hcpcs': {'a': '1', 'x': '5'}, 417 | 'icd9_dx': {'b': '2', 'y': '6'}, 418 | 'icd9_sg': {'c': '3', 'z': '7'}}), 419 | ]) # yapf: disable 420 | def test_rename_dict_wrong_dict_length( 421 | self, mdf, hcpcs, icd9_dx, icd9_sg, rename): 422 | codes = {'hcpcs': hcpcs, 'icd9_dx': icd9_dx, 'icd9_sg': icd9_sg} 423 | with pytest.raises(AssertionError): 424 | mdf._create_rename_dict(codes=codes, rename=rename) 425 | 426 | 427 | class TestSearchForCodesTypeCheck(object): 428 | @pytest.fixture 429 | def init(self): 430 | return { 431 | 'data_types': 'med', 432 | 'pl': None, 433 | 'hcpcs': None, 434 | 'icd9_dx': None, 435 | 'icd9_dx_max_cols': None, 436 | 'icd9_sg': None, 437 | 'icd9_sg_max_cols': None, 438 | 'keep_vars': {}, 439 | 'collapse_codes': True, 440 | 'rename': { 441 | 'hcpcs': None, 442 | 'icd9_dx': None, 443 | 'icd9_sg': None}, 444 | 'convert_ehic': True, 445 | 'dask': False, 446 | 'verbose': False} 447 | 448 | @pytest.fixture 449 | def mdf(self): 450 | return med.MedicareDF('01', 2012) 451 | 452 | @pytest.mark.parametrize( 453 | 'data_types,expected', 454 | [('carc', ['carc']), 455 | (['carc'], ['carc']), 456 | (['carc', 'carl', 'ipc', 'ipr', 'med', 
'opc', 'opr'], 457 | ['carc', 'carl', 'ipc', 'ipr', 'med', 'opc', 'opr']), 458 | ]) # yapf: disable 459 | def test_data_types(self, mdf, init, data_types, expected): 460 | init['data_types'] = data_types 461 | result = mdf._search_for_codes_type_check(**init) 462 | assert result.data_types == expected 463 | 464 | @pytest.mark.parametrize( 465 | 'data_types,error', 466 | [(None, TypeError), 467 | ('a', ValueError), 468 | ('sdb', ValueError), 469 | (1, TypeError)]) # yapf: disable 470 | def test_wrong_data_types(self, mdf, init, data_types, error): 471 | init['data_types'] = data_types 472 | with pytest.raises(error): 473 | mdf._search_for_codes_type_check(**init) 474 | 475 | @pytest.mark.parametrize( 476 | 'pl,error', 477 | [('a', TypeError), 478 | (1, TypeError), 479 | (pd.DataFrame({'a': [1, 2, 3, 4]}), ValueError)]) # yapf: disable 480 | def test_pl_error(self, mdf, init, pl, error): 481 | init['pl'] = pl 482 | with pytest.raises(error): 483 | mdf._search_for_codes_type_check(**init) 484 | 485 | @pytest.mark.parametrize( 486 | 'pl,res', 487 | [( 488 | pd.DataFrame({'bene_id': [1, 2, 3, 4]}), 489 | pd.DataFrame({'bene_id': [1, 2, 3, 4]}) 490 | ), ( 491 | pd.DataFrame({'bene_id': [1, 2, 3, 4], 'other': ['a', 'b', 'c', 'd']}), 492 | pd.DataFrame({'bene_id': [1, 2, 3, 4]}) 493 | ), ( 494 | pd.DataFrame({'bene_id': [1, 2, 3, 4], 'ehic': ['a', 'b', 'c', 'd']}), 495 | pd.DataFrame({'bene_id': [1, 2, 3, 4], 'ehic': ['a', 'b', 'c', 'd']}), 496 | ), ( 497 | pd.DataFrame({'bene_id': [1, 2, 3, 4], 'ehic': ['a', 'b', 'c', 'd'], 'other': ['a', 'b', 'c', 'd']}), 498 | pd.DataFrame({'bene_id': [1, 2, 3, 4], 'ehic': ['a', 'b', 'c', 'd']}), 499 | ), ( 500 | pd.DataFrame({'ehic': [1, 2, 3, 4], 'other': ['a', 'b', 'c', 'd']}), 501 | pd.DataFrame({'ehic': [1, 2, 3, 4]}) 502 | )]) # yapf: disable 503 | def test_pl(self, mdf, init, pl, res): 504 | init['pl'] = pl 505 | obj = mdf._search_for_codes_type_check(**init) 506 | pl = obj.pl_ids_to_filter 507 | for col in pl.columns: 508 | 
assert pl[col].equals(res[col]) 509 | 510 | @pytest.mark.parametrize( 511 | 'codes,error', 512 | [(1, TypeError), 513 | (1.1, TypeError), 514 | ([1], TypeError), 515 | ([1.1], TypeError), 516 | ([['a']], TypeError), 517 | (['a', ['b']], TypeError), 518 | ([[re.compile('a')]], TypeError), 519 | ([re.compile('a'), [re.compile('b')]], TypeError), 520 | ]) # yapf: disable 521 | def test_codes_error(self, mdf, init, codes, error): 522 | for x in ['hcpcs', 'icd9_dx', 'icd9_sg']: 523 | init[x] = codes 524 | with pytest.raises(error): 525 | mdf._search_for_codes_type_check(**init) 526 | 527 | @pytest.mark.parametrize( 528 | 'hcpcs,icd9_dx,icd9_sg,expected', 529 | [ 530 | (None, None, None, 531 | {'hcpcs': [], 532 | 'icd9_dx': [], 533 | 'icd9_sg': []}), 534 | ('a', 'a', 'a', 535 | {'hcpcs': ['a'], 536 | 'icd9_dx': ['a'], 537 | 'icd9_sg': ['a']}), 538 | (['a'], ['a'], ['a'], 539 | {'hcpcs': ['a'], 540 | 'icd9_dx': ['a'], 541 | 'icd9_sg': ['a']}), 542 | ('a', 'b', 'c', 543 | {'hcpcs': ['a'], 544 | 'icd9_dx': ['b'], 545 | 'icd9_sg': ['c']}), 546 | (['a'], ['b'], ['c'], 547 | {'hcpcs': ['a'], 548 | 'icd9_dx': ['b'], 549 | 'icd9_sg': ['c']}), 550 | ('', '', '', 551 | {'hcpcs': [''], 552 | 'icd9_dx': [''], 553 | 'icd9_sg': ['']}), 554 | ([''], [''], [''], 555 | {'hcpcs': [''], 556 | 'icd9_dx': [''], 557 | 'icd9_sg': ['']}), 558 | ('a', re.compile('b'), 'c', 559 | {'hcpcs': ['a'], 560 | 'icd9_dx': [re.compile('b')], 561 | 'icd9_sg': ['c']}), 562 | (re.compile('a'), re.compile('a'), re.compile('a'), 563 | {'hcpcs': [re.compile('a')], 564 | 'icd9_dx': [re.compile('a')], 565 | 'icd9_sg': [re.compile('a')]}), 566 | ([re.compile('a')], [re.compile('a')], [re.compile('a')], 567 | {'hcpcs': [re.compile('a')], 568 | 'icd9_dx': [re.compile('a')], 569 | 'icd9_sg': [re.compile('a')]}), 570 | ]) # yapf: disable 571 | def test_codes(self, mdf, init, hcpcs, icd9_dx, icd9_sg, expected): 572 | init['collapse_codes'] = True 573 | init['hcpcs'] = hcpcs 574 | init['icd9_dx'] = icd9_dx 575 | 
init['icd9_sg'] = icd9_sg 576 | result = mdf._search_for_codes_type_check(**init) 577 | assert result.codes == expected 578 | 579 | @pytest.mark.parametrize( 580 | 'hcpcs,icd9_dx,icd9_sg,error', 581 | [ 582 | ('a', 'a', None, ValueError), 583 | (None, 'a', 'a', ValueError), 584 | ('a', None, 'a', ValueError), 585 | (re.compile('a'), re.compile('a'), None, ValueError), 586 | (None, re.compile('a'), re.compile('a'), ValueError), 587 | (re.compile('a'), None, re.compile('a'), ValueError), 588 | (re.compile('a'), 'a', None, ValueError), 589 | (None, re.compile('a'), 'a', ValueError), 590 | (re.compile('a'), None, 'a', ValueError), 591 | ]) # yapf: disable 592 | def test_dup_code_patterns(self, mdf, init, hcpcs, icd9_dx, icd9_sg, error): 593 | init['collapse_codes'] = False 594 | init['hcpcs'] = hcpcs 595 | init['icd9_dx'] = icd9_dx 596 | init['icd9_sg'] = icd9_sg 597 | with pytest.raises(error): 598 | mdf._search_for_codes_type_check(**init) 599 | 600 | def test_icd9_dx_max_cols(self, mdf, init): 601 | init['icd9_dx'] = None 602 | init['icd9_dx_max_cols'] = 5 603 | with pytest.raises(ValueError): 604 | mdf._search_for_codes_type_check(**init) 605 | 606 | def test_icd9_sg_max_cols(self, mdf, init): 607 | init['icd9_sg'] = None 608 | init['icd9_sg_max_cols'] = 5 609 | with pytest.raises(ValueError): 610 | mdf._search_for_codes_type_check(**init) 611 | 612 | @pytest.mark.parametrize( 613 | 'value,error', 614 | [(1, TypeError), 615 | ('a', TypeError), 616 | ([1], TypeError), 617 | (True, TypeError), 618 | ({'invalid_key': 'string'}, ValueError), 619 | ({'med': 1}, TypeError), 620 | ({'med': True}, TypeError), 621 | ]) # yapf: disable 622 | def test_keep_vars_error(self, mdf, init, value, error): 623 | init['keep_vars'] = value 624 | with pytest.raises(error): 625 | mdf._search_for_codes_type_check(**init) 626 | 627 | @pytest.mark.parametrize( 628 | 'value,expected', 629 | [({'med': 'string'}, {'med': ['string']}), 630 | ({'med': ['string']}, {'med': ['string']})]) # yapf: 
disable 631 | def test_keep_vars(self, mdf, init, value, expected): 632 | init['keep_vars'] = value 633 | result = mdf._search_for_codes_type_check(**init) 634 | assert result.keep_vars == expected 635 | 636 | @pytest.mark.parametrize( 637 | 'hcpcs,icd9_dx,icd9_sg,rename', 638 | [(None, None, None, { 639 | 'wrongkey': ['new_name']}), 640 | ('a', 'b', 'c', { 641 | 'wrongkey': ['new_name']})]) 642 | # More `rename` tests in TestCreateRenameDict class 643 | def test_rename_dict_wrong_dict_key( 644 | self, mdf, init, hcpcs, icd9_dx, icd9_sg, rename): 645 | init['hcpcs'] = hcpcs 646 | init['icd9_dx'] = icd9_dx 647 | init['icd9_sg'] = icd9_sg 648 | init['rename'] = rename 649 | with pytest.raises(ValueError): 650 | mdf._search_for_codes_type_check(**init) 651 | 652 | @pytest.mark.parametrize( 653 | 'rename,error', 654 | [({'hcpcs': ['somevalue']}, ValueError), 655 | ({'icd9_dx': 'string'}, ValueError)]) # yapf: disable 656 | # Rename argument not allowed when collapse_codes is True 657 | def test_rename_collapse_codes_error(self, mdf, init, rename, error): 658 | init['collapse_codes'] = True 659 | init['rename'] = rename 660 | with pytest.raises(error): 661 | mdf._search_for_codes_type_check(**init) 662 | 663 | @pytest.mark.parametrize( 664 | 'value,var,error', 665 | [(1, 'collapse_codes', TypeError), 666 | ('a', 'collapse_codes', TypeError), 667 | ([True], 'collapse_codes', TypeError), 668 | (None, 'collapse_codes', TypeError), 669 | (1, 'convert_ehic', TypeError), 670 | ('a', 'convert_ehic', TypeError), 671 | ([True], 'convert_ehic', TypeError), 672 | (None, 'convert_ehic', TypeError), 673 | (1, 'verbose', TypeError), 674 | ('a', 'verbose', TypeError), 675 | ([True], 'verbose', TypeError), 676 | (None, 'verbose', TypeError), 677 | ]) # yapf: disable 678 | def test_bool_input_type_error(self, mdf, init, value, var, error): 679 | init[var] = value 680 | with pytest.raises(error): 681 | mdf._search_for_codes_type_check(**init) 682 | 683 | 684 | class 
TestSearchForCodesDfInner(object): 685 | @pytest.fixture 686 | def init(self): 687 | # yapf: disable 688 | # strings of random numbers between 10000 and 20000 689 | cl = [ 690 | ['a', 'a1', '12330', '11561', '16595', '19645', '12857'], 691 | ['a', 'a2', '19119', '15046', '11443', '10912', '12049'], 692 | ['a', 'a3', '11970', '11287', '15761', '18922', '17237'], 693 | ['d', 'd1', '12339', '13261', '16721', '16916', '14030'], 694 | ['d', 'd2', '17472', '12268', '16866', '19018', '15955'], 695 | ['d', 'd3', '19984', '12176', '15422', '17639', '15978'], 696 | ['g', 'g1', '14664', '16756', '17961', '11753', '14142'], 697 | ['h', 'h1', '17978', '17134', '19126', '15506', '19840'], 698 | ['h', 'h2', '19970', '14396', '10766', '13759', '16496'], 699 | ['h', 'h3', '10135', '19787', '15254', '16429', '19755'], 700 | ['k', 'k1', '14184', '14980', '11988', '19129', '15954'], 701 | ['l', 'l1', '18656', '16262', '17277', '14809', '13158'], 702 | ['l', 'l2', '12183', '17934', '14647', '16925', '10645'], 703 | ['l', 'l3', '16389', '15936', '15057', '11984', '16037'], 704 | ['o', 'o1', '17409', '13543', '10463', '12570', '14592'], 705 | ['p', 'p1', '14828', '11101', '18290', '15968', '10171'], 706 | ['q', 'q1', '15680', '10538', '16378', '18132', '15117'], 707 | ['r', 'r1', '19623', '17485', '11370', '18089', '14946'], 708 | ['r', 'r2', '12488', '13445', '16946', '11697', '17000'], 709 | ['r', 'r3', '12433', '15126', '16657', '10305', '13371'], 710 | ['r', 'r4', '14366', '11205', '18033', '15486', '10191'], 711 | ['v', 'v1', '19662', '16793', '18033', '10708', '17447'], 712 | ['v', 'v2', '13856', '16934', '19373', '13596', '19218'], 713 | ['v', 'v3', '18428', '12335', '14074', '15931', '12287'], 714 | ['v', 'v4', '11527', '16453', '15934', '11127', '19378'], 715 | ['z', 'z1', '13459', '17823', '15864', '19867', '11651'], 716 | ] 717 | # yapf: enable 718 | cols_toload = [ 719 | 'bene_id', 'medparid', 'dgnscd1', 'dgnscd2', 'dgnscd3', 'dgnscd4', 720 | 'dgnscd5'] 721 | cl = 
pd.DataFrame.from_records(cl, columns=cols_toload) 722 | cl = cl.set_index('bene_id') 723 | 724 | codes = {'icd9_dx': [re.compile(r'^12'), '18428']} 725 | 726 | cols = { 727 | 'cl_id': 'medparid', 728 | 'pl_id': 'bene_id', 729 | 'keep_vars': [], 730 | 'icd9_dx': ['dgnscd1', 'dgnscd2', 'dgnscd3', 'dgnscd4', 'dgnscd5']} 731 | 732 | return { 733 | 'cl': cl, 734 | 'codes': codes, 735 | 'cols': cols, 736 | 'year': 2012, 737 | 'keep_vars': [], 738 | 'rename': {}, 739 | 'collapse_codes': True, 740 | 'pl_ids_to_filter': None} 741 | 742 | @pytest.fixture 743 | def mdf(self): 744 | return med.MedicareDF('01', 2012) 745 | 746 | # yapf: disable 747 | @pytest.mark.parametrize( 748 | 'args,exp', 749 | [( 750 | {}, 751 | pd.Series( 752 | True, 753 | index=['a1', 'a2', 'd1', 'd2', 'd3', 'l2', 'o1', 'r2', 'r3', 754 | 'v3']) 755 | ), ( 756 | {'cols': {'cl_id': 'medparid', 757 | 'pl_id': 'bene_id', 758 | 'keep_vars': [], 759 | 'icd9_dx': ['dgnscd1']}}, 760 | pd.Series(True, index=['a1', 'd1', 'l2', 'r2', 'r3', 'v3']) 761 | ) 762 | ]) 763 | # yapf: enable 764 | def test_output_collapse_codes(self, mdf, init, args, exp): 765 | args = {**init, **args} 766 | 767 | df = mdf._search_for_codes_df_inner(**args) 768 | df = df[df['match']] 769 | assert df.set_index('medparid')['match'].equals(exp) 770 | 771 | # verbose 772 | --------------------------------------------------------------------------------