├── MIMIC3py ├── __init__.py ├── tests │ └── test_utilities.py ├── readme.md ├── Example_Analyses │ └── ICD9_PCA.py ├── config.py ├── download.py ├── ICD9_One_Hot_Encoded.py ├── utilities.py └── load_to_pandas.py ├── setup.cfg ├── MANIFEST.in ├── .vscode └── settings.json ├── Project_Management ├── Notes.md └── Project_Charter.md ├── Pipfile ├── MANIFEST ├── WIP.md ├── README.rst ├── README.md ├── LICENSE ├── .gitignore ├── setup.py └── Pipfile.lock /MIMIC3py/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the license file 2 | include LICENSE -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "C:\\Users\\peep\\.virtualenvs\\MIMIC3py-nnraHvdA\\Scripts\\python.exe" 3 | } -------------------------------------------------------------------------------- /Project_Management/Notes.md: -------------------------------------------------------------------------------- 1 | #Notes 2 | Project Management templates are from: https://github.com/documize/document-templates 3 | 4 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | requests = "*" 10 | tensorflow = "*" 11 | pandas = "*" 12 | 13 | [requires] 14 | 
python_version = "3.6" 15 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | LICENSE 3 | setup.cfg 4 | setup.py 5 | MIMIC3py\ICD9_One_Hot_Encoded.py 6 | MIMIC3py\__init__.py 7 | MIMIC3py\config.py 8 | MIMIC3py\download.py 9 | MIMIC3py\load_to_pandas.py 10 | MIMIC3py\sandbox.py 11 | MIMIC3py\utilities.py 12 | -------------------------------------------------------------------------------- /MIMIC3py/tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | # Spiro Ganas 2 | # 3 | # Test script to make sure my utility functions work properly. 4 | 5 | 6 | import pytest 7 | from MIMIC3.utilities import * 8 | 9 | 10 | def test_D_CPT_Zipped(): 11 | assert(verifyData('D_CPT.csv.gz')==True) 12 | 13 | 14 | def test_Bad_Input(): 15 | with pytest.raises(FileNotFoundError): 16 | md5("BadInput") 17 | 18 | -------------------------------------------------------------------------------- /WIP.md: -------------------------------------------------------------------------------- 1 | # Work-in-Progress MIMIC3py 2 | 3 | ### Data Engineering 4 | 1. Add a function to check if the user has entered a valid username and password. I need to add some error handling to take care of bad values. 5 | 2. Add a wrapper to the download function so it can be easily used from the command line 6 | 3. Finish the function to load the data into Pandas data frames. Maybe store all the dataframes in a class or dictionary? 7 | 4. Add code to create a MySQL database and load all the data. 8 | 5. 9 | 10 | 11 | 12 | ### Analysis 13 | 1. Create a sparse matrix storing the ICD codes for each admission ID. Figure out how to "One-Hot Encode" it for predictive modeling purposes. 14 | 2. Try using PCA or Tensor Factorization to identify major disease groups. 
15 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | MIMIC3py 2 | A Python library to load and analyze the MIMIC III Critical Care Database 3 | 4 | "MIMIC-III (Medical Information Mart for Intensive Care III) is a large, freely-available database comprising deidentified health-related data associated with over forty thousand patients who stayed in critical care units of the Beth Israel Deaconess Medical Center between 2001 and 2012." It includes demographics, vital signs, laboratory tests, medications, clinical notes, and more. 5 | 6 | More details are available at: 7 | * https://mimic.physionet.org/about/mimic/ 8 | * http://www.nature.com/articles/sdata201635 9 | 10 | 11 | 12 | This repository contains Python 3 code that will help you: 13 | * Download the data. 14 | * Load the data into Pandas data frames. 15 | -------------------------------------------------------------------------------- /MIMIC3py/readme.md: -------------------------------------------------------------------------------- 1 | # Downloading MIMIC III 2 | 3 | The code in this library will help you: 4 | 1. Download the MIMIC III files. 5 | 2. Validate the downloaded files by verifying their MD5 hashes. 6 | 3. Load the data into Pandas data frames. 7 | 8 | 9 | Here's what you need to do: 10 | 1. Get a username and password from the physionet website: https://mimic.physionet.org/gettingstarted/access/ 11 | 2. Open config.py. 12 | 1. Fill in your username and password 13 | 2. Enter the folder where you want the downloaded files to be stored. 14 | 3. Make sure you un-comment all of the files that you want to download 15 | * NOTE: some of the files can be HUGE (33 GB!!!!). 16 | 4. Save the filled-in config.py file 17 | 3. Run the download.py file. 
18 | * NOTE: you may need to first: pip install requests 19 | * See requirements.txt for a list of all the libraries you may need/want to install. 20 | 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MIMIC3py 2 | A Python library to load and analyze the MIMIC III Critical Care Database 3 | 4 | "MIMIC-III (**M**edical **I**nformation **M**art for **I**ntensive **C**are III) is a large, freely-available database comprising deidentified health-related data associated with over forty thousand patients who stayed in critical care units of the Beth Israel Deaconess Medical Center between 2001 and 2012." It includes demographics, vital signs, laboratory tests, medications, clinical notes, and more. 5 | 6 | More details are available at: 7 | * https://mimic.physionet.org/about/mimic/ 8 | * http://www.nature.com/articles/sdata201635 9 | 10 | 11 | 12 | This repository contains Python 3 code that will help you: 13 | * Download the data. 14 | * Load the data into Pandas data frames. 
15 | 16 | The package is hosted on pypi at: https://pypi.python.org/pypi/MIMIC3py/0.11 17 | 18 | You can install it using `pip install MIMIC3py` 19 | 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Spiro Ganas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MIMIC3py/Example_Analyses/ICD9_PCA.py: -------------------------------------------------------------------------------- 1 | # Spiro Ganas 2 | # 11/16/17 3 | # 4 | # This script one-hot encodes the ICD9 data and then applies a Pincipal Components Analysis 5 | 6 | 7 | 8 | import pandas as pd 9 | from sklearn.decomposition import PCA 10 | 11 | from MIMIC3py.ICD9_One_Hot_Encoded import ICD9_One_HotEncoded 12 | 13 | 14 | 15 | def ICD9_PCA(numer_of_components = 100, print_details = True, save_as_csv_filename = None): 16 | """Applies a principal components analysis to the MIMIC III ICD9 Data.""" 17 | 18 | # Convert the MIMIC III ICD9 data to a one-hot encoded matrix using the function I created. 19 | ICD9_data = ICD9_One_HotEncoded() 20 | 21 | # Set up the PCA model 22 | MyPCA = PCA(n_components=numer_of_components, svd_solver='randomized') 23 | 24 | # trains the PCA using all of the one-hot encoded data 25 | MyPCA.fit(ICD9_data) 26 | 27 | # Convert the output of the model to a pandas Dataframe 28 | MyPCA_df = pd.DataFrame(MyPCA.components_) 29 | 30 | # Add column names to the DataFrame 31 | MyPCA_df.columns = list(ICD9_data.axes[1]) 32 | 33 | 34 | 35 | # Transpose the data to make it easy to sort in microsoft excel 36 | MyResults = MyPCA_df.transpose() 37 | 38 | # Export the results to a csv file 39 | if save_as_csv_filename: 40 | MyResults.to_csv(save_as_csv_filename) 41 | #MyResults.to_csv("D:\\MIMIC-III Clinical Database\\Data\\output\\PCA_Analysis.csv") 42 | 43 | if print_details: 44 | #print(MyPCA.n_components_) 45 | print(MyPCA.explained_variance_ratio_) 46 | print("Total Variance Captured: ", sum(MyPCA.explained_variance_ratio_)) 47 | 48 | 49 | return MyResults 50 | 51 | 52 | if __name__ == "__main__": 53 | df = ICD9_PCA(numer_of_components = 100, print_details = True, save_as_csv_filename = None) -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | 104 | # PyCharm files 105 | .idea/ 106 | 107 | 108 | # PyPi file that contains passwords 109 | .pypirc 110 | MIMIC3py/WIP.py 111 | MIMIC3py/sandbox.py 112 | -------------------------------------------------------------------------------- /MIMIC3py/config.py: 
-------------------------------------------------------------------------------- 1 | # Spiro Ganas 2 | # 3 | # This file contains all the configuration data (username, password, etc.) 4 | 5 | 6 | # This is the folder where the files will located 7 | local_save_folder = "D:\\MIMIC-III Clinical Database\\Data\\" 8 | 9 | # ENTER YOUR PHYSIONET USERNAME AND PASSWORD HERE ############################## 10 | physionet_USERNAME = "" 11 | physionet_PASSWORD = "" 12 | ################################################################################ 13 | 14 | # You may need to update some of these variables if MIMIC III gets updated 15 | physionet_BASE_URL = "https://physionet.org/works/MIMICIIIClinicalDatabase/files/version_1_4/" 16 | 17 | # Comment out any file you don't want to download. 18 | # Note that some of the files are very, very big. 19 | physionet_FILENAMES = [ 20 | "ADMISSIONS.csv.gz", # 12 MB 21 | "CALLOUT.csv.gz", # 6.1 MB 22 | "CAREGIVERS.csv.gz", # 199 KB 23 | "CHARTEVENTS.csv.gz", # 33 GB ------BIG!!! 24 | "CPTEVENTS.csv.gz", # 56 MB 25 | "DATETIMEEVENTS.csv.gz", # 502 MB 26 | "D_CPT.csv.gz", # 14 KB 27 | "DIAGNOSES_ICD.csv.gz", # 19 MB 28 | "D_ICD_DIAGNOSES.csv.gz", # 1.4 KB 29 | "D_ICD_PROCEDURES.csv.gz", # 305 KB 30 | "D_ITEMS.csv.gz", # 933 KB 31 | "D_LABITEMS.csv.gz", # 43 KB 32 | "DRGCODES.csv.gz", # 11 MB 33 | "ICUSTAYS.csv.gz", # 6.1 MB 34 | "INPUTEVENTS_CV.csv.gz", # 2.3 GB ------BIG!!! 35 | "INPUTEVENTS_MV.csv.gz", # 931 MB 36 | "LABEVENTS.csv.gz", # 1.8GB ------BIG!!! 37 | "MICROBIOLOGYEVENTS.csv.gz", # 70 MB 38 | "NOTEEVENTS.csv.gz", # 3.8 GB ------BIG!!! 
39 | "OUTPUTEVENTS.csv.gz", # 379 MB 40 | "PATIENTS.csv.gz", # 2.6 MB 41 | "PRESCRIPTIONS.csv.gz", # 735 MB 42 | "PROCEDUREEVENTS_MV.csv.gz", # 47 MB 43 | "PROCEDURES_ICD.csv.gz", # 6.5 MB 44 | "SERVICES.csv.gz", # 3.4 MB 45 | "TRANSFERS.csv.gz", # 24 MB 46 | ] 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Notes on releasing the software to pypi 2 | # 3 | # Step 1: Update this file with the new version number. 4 | # Step 2: Make sure everything has been commited and pushed to GitHub. 5 | # Step 3: Create a new release on GitHub. 6 | # Step 4: package the source file by running: python setup.py sdist 7 | # Step 5: Create an installation "wheel" by running: python setup.py bdist_wheel 8 | # Step 6: Delete any old files in the dist folder 9 | # Step 7: Do an upload to pypi Test Server by running this in the terminal: 10 | # C:\ProgramData\Anaconda3\Scripts\twine.exe upload --repository-url https://test.pypi.org/legacy/ dist/* 11 | # Step 8: Upload to the production pypi server by running this in the terminal: 12 | # C:\ProgramData\Anaconda3\Scripts\twine.exe upload --repository-url https://upload.pypi.org/legacy/ dist/* 13 | # Step 9: Test everything by running: pip install MIMIC3py 14 | 15 | 16 | 17 | 18 | 19 | import setuptools #This import allows me to create the wheel file 20 | from distutils.core import setup 21 | 22 | setup( 23 | name = 'MIMIC3py', 24 | packages = ['MIMIC3py'], 25 | version = '0.11', # Ideally should be same as your GitHub release tag varsion 26 | description = 'A Python library to load and analyze the MIMIC III Critical Care Database.', 27 | author = 'Spiro Ganas', 28 | author_email = 'spiroganas@gatech.edu', 29 | url = 'https://github.com/SpiroGanas/MIMIC3py', 30 | download_url = 'https://github.com/SpiroGanas/MIMIC3py/archive/v0.11.tar.gz', 31 | keywords = ['MIMIC', 'Critical Care', 'Informatics'], 32 | 
classifiers = [ 33 | 'Development Status :: 2 - Pre-Alpha', 34 | 'License :: OSI Approved :: MIT License', 35 | 'Programming Language :: Python :: 3', 36 | 'Intended Audience :: Healthcare Industry', 37 | 'Intended Audience :: Science/Research' 38 | ], 39 | install_requires=['pandas', 'numpy', 'scikit-learn', 'requests'], 40 | python_requires='>=3', 41 | ) 42 | 43 | -------------------------------------------------------------------------------- /MIMIC3py/download.py: -------------------------------------------------------------------------------- 1 | # Spiro Ganas 2 | # 3 | # Python program to download MIMIC III csv files from 4 | 5 | import requests 6 | import urllib.parse 7 | 8 | from MIMIC3py.config import * 9 | from MIMIC3py.utilities import verify_data 10 | 11 | 12 | def download_mimic3_files(physionet_filenames=physionet_FILENAMES): 13 | """ Downloads the MIMIC 3 critical care database csv files. 14 | The CVS file are zipped in the .gz format. 15 | You can upzip the files, or just load the .gz file 16 | directly into a Pandas data frame. 17 | """ 18 | 19 | for file in physionet_filenames: 20 | url = urllib.parse.urljoin(physionet_BASE_URL, file) 21 | success = download_file(url) 22 | print(success) 23 | 24 | 25 | def download_file(url, save_folder=local_save_folder, username=physionet_USERNAME, password=physionet_PASSWORD): 26 | """ This code downloads files from a password protected http site. 27 | https://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py 28 | """ 29 | 30 | if not verify_physionet_credentials(username, password): return "Error: Username/Password invalid." 
31 | 32 | 33 | 34 | local_filename = url.split('/')[-1] 35 | 36 | # If the file already exists, don't download it again 37 | if verify_data(local_filename, save_folder): 38 | return "Verified existing file: " + local_filename 39 | 40 | # NOTE the stream=True parameter 41 | r = requests.get(url, auth=(username, password), stream=True) 42 | with open(save_folder + local_filename, 'wb') as f: 43 | for chunk in r.iter_content(chunk_size=1024): 44 | if chunk: # filter out keep-alive new chunks 45 | f.write(chunk) 46 | # If the file already exists, don't download it again 47 | if verify_data(local_filename, save_folder): 48 | return "Downloaded and successfully verified: " + local_filename 49 | else: 50 | return "Downloaded but FAILED TO VERIFY: " + local_filename 51 | 52 | 53 | 54 | 55 | 56 | def verify_physionet_credentials(username, password): 57 | """Returns True if the username and password are valid for the physionet.org website.""" 58 | url = "https://physionet.org/works/MIMICIIIClinicalDatabase/files/" 59 | r = requests.get(url, auth=(username, password)) 60 | return True if r.status_code == 200 else False 61 | 62 | 63 | 64 | 65 | if __name__ == "__main__": 66 | print("Verifying existing files and downloading any missing files.") 67 | print("Please Wait...") 68 | print() 69 | download_mimic3_files(physionet_FILENAMES) 70 | -------------------------------------------------------------------------------- /MIMIC3py/ICD9_One_Hot_Encoded.py: -------------------------------------------------------------------------------- 1 | # Spiro Ganas 2 | # 11/15/17 3 | # 4 | # One-hot encodes the ICD9 data at a patient level 5 | 6 | from MIMIC3py.load_to_pandas import load_mimic_to_pandas 7 | import pandas as pd 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | def one_hot_encoder(dictionary_of_Dictionaries: dict)-> pd.DataFrame: 18 | '''Accepts a dictionary of dictionaries and returns a one-hot encoded pandas dataframe. 
19 | The outer key is the subject_id, the inner key is an ICD9 code, and the value is always 1. 20 | The returned DataFrame has subject_id as the rows and ICD9s as the columns, with 1 meaning 21 | the patient had that illness and 0 meaning they didn't 22 | ''' 23 | 24 | # Convert the sparse "dictionary of dictionaries" to a pandas dataframe. 25 | one_hot_df = pd.DataFrame(dictionary_of_Dictionaries, dtype=int) 26 | 27 | # Replace in all the NaN values with 0 28 | one_hot_df = one_hot_df.fillna(0) 29 | 30 | # Transpose the DataFrame so rows represent a single patient and each column represents the presence an ICD9 code. 31 | one_hot_df = one_hot_df.transpose() 32 | return one_hot_df 33 | 34 | 35 | 36 | def get_ICD9_descriptions(filename = None): 37 | '''Returns a pandas DataFrame containing ICD9 codes and their text descriptions. 38 | The file comes from CMS: https://www.cms.gov/Medicare/Coding/ICD9ProviderDiagnosticCodes/codes.html 39 | ''' 40 | # TODO: Implement this function and move it to a utility module 41 | pass 42 | 43 | 44 | 45 | def Rollup_ICD9_Code(IDC9_Code: str)-> str: 46 | '''Rolls up the ICD9 codes to the 3 character level''' 47 | if str(IDC9_Code)[0]== 'E' or str(IDC9_Code)[0]== 'V': 48 | return str(IDC9_Code)[:4] # E and V codes are rolled up to 4-character 49 | else: 50 | return str(IDC9_Code)[:3] 51 | 52 | 53 | 54 | 55 | def ICD9_One_HotEncoded(Location_of_the_CSV_Files = "D:\\MIMIC-III Clinical Database\\Data\\", print_details = False): 56 | '''Imports the Subject_IDs and ICD9 Codes from the DIAGNOSES_ICD.csv and outputs the 57 | data as a one-hot encoded matrix where each row is a patient and each column is 58 | a disease. 
59 | ''' 60 | 61 | df = load_mimic_to_pandas(CSV_Folder_Location=Location_of_the_CSV_Files, CSV_List=['DIAGNOSES_ICD'], gunzip=False) 62 | 63 | 64 | 65 | one_hot = {} 66 | for index, row in df['DIAGNOSES_ICD'].iterrows(): 67 | patient = row['SUBJECT_ID'] 68 | icd = Rollup_ICD9_Code(row['ICD9_CODE']) # I rill up the ICD 9 to the 3-character level 69 | 70 | if patient in one_hot: 71 | if icd not in one_hot[patient]: 72 | one_hot[patient][icd] = 1 73 | else: 74 | one_hot[patient] = {icd : 1} 75 | 76 | 77 | one_hot_df = one_hot_encoder(one_hot) 78 | 79 | # These are the icd9 column names 80 | My_column_names = list(one_hot_df.axes[1]) 81 | 82 | 83 | 84 | # Print a summary of the one-hot encoded dataframe 85 | if print_details: 86 | print("memory used: ", one_hot_df.memory_usage().sum()/1000/1000, " MB") 87 | print("Shape: ", one_hot_df.shape) 88 | print() 89 | print(one_hot_df.head()) 90 | 91 | return one_hot_df 92 | 93 | 94 | 95 | # import MIMIC3py.ICD9_One_Hot_Encoded 96 | # 97 | # x = MIMIC3py.ICD9_One_Hot_Encoded.MyPCA 98 | 99 | 100 | -------------------------------------------------------------------------------- /MIMIC3py/utilities.py: -------------------------------------------------------------------------------- 1 | # Spiro Ganas 2 | # 3 | # Utility code and other small functions 4 | 5 | 6 | import os 7 | import hashlib 8 | 9 | from MIMIC3py.config import * 10 | 11 | 12 | def verify_data(file_to_verify, local_save_folder=local_save_folder): 13 | """Calculates the MD5 value of a downloaded file and returns true if that value 14 | matches the value in the checksum file""" 15 | 16 | # From the files checksum_md5_unzipped.txt and checksum_md5_zipped.txt, available on physionet.org 17 | checksums = { 18 | 'ADMISSIONS.csv.gz': 'f301f427b38268ae9b3e129f49484b8d', 19 | 'CALLOUT.csv.gz': 'f9443602f929725d0d240edde9c37848', 20 | 'CAREGIVERS.csv.gz': '14e103974ac30c7e8ec92fa15966730a', 21 | 'CHARTEVENTS.csv.gz': '6322b029b09e75a8aa7c692eec2b656c', 22 | 
'CPTEVENTS.csv.gz': '88ec8e5aa63f3f50c49b67482a0a691d', 23 | 'DATETIMEEVENTS.csv.gz': '68b7b786469c66af182bc0d41a4d9e69', 24 | 'D_CPT.csv.gz': '4801e130461a147e647cd4c3e4579208', 25 | 'DIAGNOSES_ICD.csv.gz': '1d8007cc3115fd87a95321df33e1de86', 26 | 'D_ICD_DIAGNOSES.csv.gz': '9c5b6a1e0c6ebe7a96b99441ca9c0499', 27 | 'D_ICD_PROCEDURES.csv.gz': '7927c2e262485b69ae6dd31359faa664', 28 | 'D_ITEMS.csv.gz': '83413729fa45a4d594e9a530bfbad911', 29 | 'D_LABITEMS.csv.gz': '75e79ca14a2b8c02e3535b87498c4f9d', 30 | 'DRGCODES.csv.gz': '34f5b2327cda3ffa53089b6c23b53191', 31 | 'ICUSTAYS.csv.gz': '79f2b70cb07a6598642e04aaa364c27c', 32 | 'INPUTEVENTS_CV.csv.gz': '1d02246e9fd8d379a4e0c41623890c25', 33 | 'INPUTEVENTS_MV.csv.gz': 'f49043f52cd3ee121f36b66e5504a9c0', 34 | 'LABEVENTS.csv.gz': '9d78e309dc5b5fcf80e34803a688a354', 35 | 'MICROBIOLOGYEVENTS.csv.gz': 'b600550677d8dd8d13184f40c5162093', 36 | 'NOTEEVENTS.csv.gz': '90c05ebe5e7e631f0a55f34f568fbfc0', 37 | 'OUTPUTEVENTS.csv.gz': 'c98f08478614282571557017264468c6', 38 | 'PATIENTS.csv.gz': '4383677427d53def256367a7c94a6b31', 39 | 'PRESCRIPTIONS.csv.gz': '15c67d6d84e88ec57044064e170390f6', 40 | 'PROCEDUREEVENTS_MV.csv.gz': '118ab22f1d98ef21de8e75154574d617', 41 | 'PROCEDURES_ICD.csv.gz': 'be9d0a0ce5acf0bbda1b61fe66353c89', 42 | 'SERVICES.csv.gz': '48d2da2ab02aba284a7a10962b93b061', 43 | 'TRANSFERS.csv.gz': '4440b72090dc9207c88368cfb1ae2146', 44 | 'ADMISSIONS.csv': '57d940b69dd066da5ba57e008cc7f92c', 45 | 'CALLOUT.csv': 'cd4e416337e68f6678ff4e091938be58', 46 | 'CAREGIVERS.csv': '258e3c2bc11798c99ffc4f33aa1e9bcd', 47 | 'CHARTEVENTS.csv': '2b5211d1045ffac4c4e345ccb56ccc1b', 48 | 'CPTEVENTS.csv': 'bf32e07ecd2b946a675a0d7cea75d2e7', 49 | 'DATETIMEEVENTS.csv': '6245f2f4f581dce8e05e147228a51522', 50 | 'D_CPT.csv': '49fdda583f5e85e5e0a92686b3106fac', 51 | 'DIAGNOSES_ICD.csv': 'e2c0a05d768a6273038ae84e576186a7', 52 | 'D_ICD_DIAGNOSES.csv': 'd0a7026ca3618f4360d8329643d9041d', 53 | 'D_ICD_PROCEDURES.csv': 'a9707e5361f939f45ca2bc2eb8bd5652', 54 | 
'D_ITEMS.csv': '749e350c22531ec8589b7391b4e7b660', 55 | 'D_LABITEMS.csv': '2f77db8fc2f2a21e4fad1a1781d98709', 56 | 'DRGCODES.csv': 'ea61c7dfe180c6d6a1273f220b5e70c5', 57 | 'ICUSTAYS.csv': 'b2a57affcda3c60fa38a022b2df7fcf2', 58 | 'INPUTEVENTS_CV.csv': '5fce8501d6723a470c74affcda32e52b', 59 | 'INPUTEVENTS_MV.csv': '4d57864670f51e7230c0fef52d206049', 60 | 'LABEVENTS.csv': 'bc2fe94983576207758635924c047dbe', 61 | 'MICROBIOLOGYEVENTS.csv': '0a35833252bdecb32b9d1348adec2085', 62 | 'NOTEEVENTS.csv': 'df33ab9764256b34bfc146828f440c2b', 63 | 'OUTPUTEVENTS.csv': '6e137e00d7a7fd14c291d015573ed375', 64 | 'PATIENTS.csv': '3b06f45153c66e2b7c49b35805971145', 65 | 'PRESCRIPTIONS.csv': '43f469da09c65bae252d02a51787ad85', 66 | 'PROCEDUREEVENTS_MV.csv': '8a73340cf7a09d8f42ddd64f4863fff3', 67 | 'PROCEDURES_ICD.csv': 'ed7e6f1efa7e334404f6fb26e4c3c7d2', 68 | 'SERVICES.csv': '52259c657c4fd4bc43bcb35ec5c96d98', 69 | 'TRANSFERS.csv': '136b75c89bec2aab588ca64cd4b582bd' 70 | } 71 | try: 72 | return checksums[file_to_verify] == md5(os.path.join(local_save_folder, file_to_verify)) 73 | except FileNotFoundError: 74 | # If the file doesn't exist, return false 75 | return False 76 | except KeyError: 77 | # If I don't have an MD5 hash in the dictionary, return false 78 | return False 79 | 80 | 81 | def md5(fname): 82 | """Calculates the MD5 checksum of the file. 
83 | SOURCE: https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file 84 | """ 85 | try: 86 | hash_md5 = hashlib.md5() 87 | with open(fname, "rb") as f: 88 | for chunk in iter(lambda: f.read(4096), b""): 89 | hash_md5.update(chunk) 90 | return hash_md5.hexdigest() 91 | except FileNotFoundError: 92 | raise FileNotFoundError 93 | 94 | 95 | def verify_all(local_save_folder=local_save_folder): 96 | """Loops over every file in the folder and determines if it's a valid MIMIC III file""" 97 | # TODO: Implement verifyAll function 98 | pass 99 | 100 | 101 | if __name__ == "__main__": 102 | print(verify_data('D_CPT.csv.gz')) 103 | print(verify_data('D_CPT.csv.gzsdsdsd')) 104 | print(verify_data('ADMISSIONS.csv.gz')) 105 | -------------------------------------------------------------------------------- /Project_Management/Project_Charter.md: -------------------------------------------------------------------------------- 1 | # Project Charter 2 | 3 | **Project Charter For:** MIMIC3py 4 | 5 | **Document Created:** 2017-11-23 6 | 7 | **Author:** Spiro Ganas 8 | 9 | **Distribution:** N/A 10 | 11 | **Document Revision History** 12 | 13 | | Revised By | Date | Comment | 14 | | -----------------|:--------------:|:---------------------:| 15 | | Spiro Ganas | 2017-11-23 | Initial Version | 16 | 17 | 18 | | Start Date | End Date | Project Manager | Client | 19 | | :---------:|:-----------:| :--------------:| ------------| 20 | |2017-11-23 |2018-03-01 | Spiro Ganas| MIMIC3py Community | 21 | 22 | # Introduction 23 | 24 | ## Purpose of the document 25 | 26 | 27 | This document provides the information necessary to frame the project in the context of its scope, purpose, funding, resource requirements and approval. 28 | 29 | ## Project Overview 30 | MIMIC3py is a Python library. It makes the MIMIC III Critical Care Database easily accessible to healthcare data scientists. 
31 | 32 | ## Business Case 33 | 34 | MIMIC is an openly available dataset developed by the MIT Lab for Computational Physiology, comprising deidentified health data associated with ~40,000 critical care patients. It includes demographics, vital signs, laboratory tests, medications, and more. 35 | 36 | The MIMIC3py Python library is designed to help healthcare data scientists analyze this data. It includes tools to download, verify and load the data. 37 | 38 | MIMIC3py also demonstrates how to perform "feature engineering", converting the data into a format that can be analyzed using machine learning algorithms. Sample scripts show how sk-learn and TensorFlow models can be applied to the MIMIC III data. 39 | 40 | The business case for this project is the belief that MIMIC III data contains hidden knowledge that can improve healthcare. This project contributes to the discovery of that knowledge by automating routine tasks, allowing healthcare data scientist to "focus on the fun stuff". 41 | ## Project Scope 42 | 43 | ### Objectives 44 | 45 | * Allow authorized users to easily download the MIMIC III data. 46 | * Provide tools to load the MIMIC III data into pandas DataFrames. 47 | * Provide tools to convert the data into formats that can be analyzed using machine learning algorithms. 48 | * Provide sample code demonstrating the analysis of MIMIC III data. 49 | 50 | ### High-Level Requirements 51 | List the requirements that must be satisfied in order for the project’s goals to be realised. 
52 | 53 | | Requirement | Comment | 54 | | ------------- |---------------------------------| 55 | |Verify physionet.org authorization |Required to dowload data files| 56 | |Dowload data | | 57 | |Load data into pandas DataFrames | | 58 | |Convert data into machine learning format | | 59 | |Analyze data with machine learning algorithm |Examples using sk-learn and TensorFlow | 60 | 61 | 62 |   63 | ### Milestones and Deliverables 64 | 65 | 66 | | Milestone | Deliverable | 67 | | ------------- |---------------------------------------------------------------------| 68 | |Version 0.1 |Ability to download data | 69 | |Version 0.2 |Ability to load data into pandas Dataframe | 70 | |Version 0.3 |One data set in a Machine Learning format | 71 | |Version 0.4 |One functioning machine learning example using sk-learn | 72 | |Version 0.5 |One functioning machine learning example using TensorFlow | 73 | 74 | ### Project Plan 75 | 76 | #### Timeline 77 | | Milestone | Target Date | 78 | | ------------- |---------------------------------------------------------------------| 79 | |Version 0.1 |2017-12-01 | 80 | |Version 0.2 |2017-12-15 | 81 | |Version 0.3 |2017-12-31 | 82 | |Version 0.4 |2017-01-15 | 83 | |Version 0.5 |2017-02-15 | 84 | 85 | ### Financial Estimates 86 | #### Estimate 87 | This project has no expected financial cost. 88 | 89 | ## Risks and Assumptions 90 | ### Risk Analysis 91 | N/A 92 | 93 | ### Assumptions 94 | * MIMIC III data will continue to be publically availible on physionet.org. 95 | * Developers will contribute to this project pro-bono. 96 | * There is no critical need to complete this project by the stated deadlines. 97 | 98 | 99 | 100 |   101 | ## Project Organization 102 | Describe the key roles in the project, who fills them, and the responsibilities of each role. 
103 | 104 | | Name | Role | Responsibilities | 105 | | --------- |-------------------|--------------------------| 106 | |Spiro Ganas | Project Manager | PM Responsibilities | 107 | 108 | 109 | 110 | 111 | 112 | ### Project Advisory Board/Steering Committee 113 | N/A 114 | ### Project Stakeholders 115 | * MIMIC3py library developers 116 | * Healthcare data scientists 117 | * Students studying healthcare data analytics 118 | 119 | ## Project Approval 120 | 121 | *This project has been reviewed and the Project Charter accepted by the following people, as indicated by signature below:* 122 | 123 | 124 | **Full Name** Spiro Ganas 125 | 126 | **Title** Project Manager 127 | 128 | **Signature** Spiro Ganas 129 | 130 | **Date**2017-11-23 131 | 132 | 133 | 134 | ## APPENDIX A - REFERENCES 135 | | Reference | Comments | 136 | | ------------------------|----------------------------------| 137 | |MIMIC III Website|https://mimic.physionet.org/| 138 | 139 | 140 | ## APPENDIX B - GLOSSARY 141 | | Term | Definition | 142 | | ------------------------|----------------------------------| 143 | |MIMIC III Critical Care Database|A database containing electronic health records derived from critical care unit EHR systems. 
| 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "3efefafc536d6c1c595d045217ead811a273d7cf695fb6773a8c292ad0aa57ee" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "absl-py": { 20 | "hashes": [ 21 | "sha256:b943d1c567743ed0455878fcd60bc28ac9fae38d129d1ccfad58079da00b8951" 22 | ], 23 | "version": "==0.7.1" 24 | }, 25 | "astor": { 26 | "hashes": [ 27 | "sha256:95c30d87a6c2cf89aa628b87398466840f0ad8652f88eb173125a6df8533fb8d", 28 | "sha256:fb503b9e2fdd05609fbf557b916b4a7824171203701660f0c55bbf5a7a68713e" 29 | ], 30 | "version": "==0.7.1" 31 | }, 32 | "certifi": { 33 | "hashes": [ 34 | "sha256:59b7658e26ca9c7339e00f8f4636cdfe59d34fa37b9b04f6f9e9926b3cece1a5", 35 | "sha256:b26104d6835d1f5e49452a26eb2ff87fe7090b89dfcaee5ea2212697e1e1d7ae" 36 | ], 37 | "version": "==2019.3.9" 38 | }, 39 | "chardet": { 40 | "hashes": [ 41 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 42 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 43 | ], 44 | "version": "==3.0.4" 45 | }, 46 | "gast": { 47 | "hashes": [ 48 | "sha256:fe939df4583692f0512161ec1c880e0a10e71e6a232da045ab8edd3756fbadf0" 49 | ], 50 | "version": "==0.2.2" 51 | }, 52 | "grpcio": { 53 | "hashes": [ 54 | "sha256:177fb72c6c8f89788d8065ee5dd239af7a7a26962a4d81777c156c4813ae87dc", 55 | "sha256:181c0066d0005f81f384cbeade9d5a41dee1e97dc9e6ba005122d4a753b1f056", 56 | "sha256:26860ffbb5830d3dca76955699996e59703a6b2d4f71c33617178817a6fea303", 57 | "sha256:277ab0a2c0eb503491300c89a34093caa33dbc5c05c5f49e3f01c65c5fbbeb83", 58 | 
"sha256:36719664bc1bf8db4027b576b0f65f36a6cbcea69fe522ea29ee9d02fb4191a5", 59 | "sha256:43caff66c30fcfcb59424ec121bac0db6b0831da1310e7ec7ec8ad847ecef30b", 60 | "sha256:4666caf3e642ece8119597482734ab16f14b5d842334b45267182aace0fb2dae", 61 | "sha256:48bb0526c612c0e5ae39d6e1f5d12eb75ee5e588560a86347dfc381717ed091d", 62 | "sha256:69d5bb14479be7d6c7097715c54eebbda2c0ba6a8e228aabc69f18e1fd6eaed4", 63 | "sha256:6aaf92894f12bc7d7c467e0b7f665a319a20b48d41297a3a77de95da8d03a73b", 64 | "sha256:739e97e7d25b5fb4ed3dc7daedd725a04e1caccce7bf7f2cbc6fa4efef975b05", 65 | "sha256:79977b7fd02f83ca1eb2489415509f13bf2ee16b49f53e1893547fa54a057242", 66 | "sha256:7ac4c298579015b84d612ac70ec7b7aeba44db7e87ac075402a3b20a4402553a", 67 | "sha256:7c0ebcbe5c52ca179fe3e47c3f12bb74d62227d762bb62a42d0fb6178a8b5845", 68 | "sha256:9207a8339078dc5558e5ae15a8e95012c31dcc317029224a159292c1cae382b2", 69 | "sha256:953a72d9f4f4f011789c6147fbe5e93419fdf3891d8ee5840fd9c7bd02960739", 70 | "sha256:9b93e74fceec8808e74a7cfde9f852d35f34283cf6441cdecbfbfb39611bd143", 71 | "sha256:9fe8d4d694b2ccb272473ac4b8813b3a649ea53448235b43759913c99fe5f0d0", 72 | "sha256:a4bde6b6800d82f772f72669d1772cf740d1aec5ca98c8edffa3dfd531ecf7c6", 73 | "sha256:a9d8f04e820dc20fa120e0af4f8e995155b8dcc475c34fb31be105c8eeac74c3", 74 | "sha256:bc26186dca4ee75dbe33a367217e7318f4fc135c5f02437e7f84fa853e9a877d", 75 | "sha256:ceaa0e4ea3259bea36d08f332e49bb28120669b651b1a50d64e553ac4add3a86", 76 | "sha256:d4fea86ed6c1585ac51cb15f2ae8d58feeebcdf11cb6de38807fffedb2e019b2", 77 | "sha256:d51a486a1cfdedb19d3f50e12c030841384f46fcb1ec0de216fc55a85e752321", 78 | "sha256:dc28b9ea5e6c2c75a54cd194c15a8a742a7ad710c1ccfca8e84d50420a2a1aef", 79 | "sha256:dd55e694ddd1c439fb933ac21c297a7e019c7db7b44178579822129cd28b8c44", 80 | "sha256:e20804844b4ce2db0f700762c352170a3957a80080703ee3a53cce2e72849d36", 81 | "sha256:ecbd9900d4237cdb166ab923a9c76ad16880d166494e04df58ae7f8e2a14a9dd", 82 | "sha256:f8c5c455e4a843500f236231fa250eb4e2e81c1e8f2280ae56f1ed3f25f8c332", 83 | 
"sha256:f9119f484a04ea1a006f4d7b4a9ca4a77ff91c5afd166a2aa0417e6bd5b4eb3b", 84 | "sha256:f936b811ecb44cf8a83c8b8fea7b9191e04f96d168681b2744a4b7c1b3812b34", 85 | "sha256:fa61afe6258f16efc4c6793a3a39bcd5a7eec43ea1789fa2d9e437a2e4dbeaf4" 86 | ], 87 | "version": "==1.20.0" 88 | }, 89 | "h5py": { 90 | "hashes": [ 91 | "sha256:05750b91640273c69989c657eaac34b091abdd75efc8c4824c82aaf898a2da0a", 92 | "sha256:082a27208aa3a2286e7272e998e7e225b2a7d4b7821bd840aebf96d50977abbb", 93 | "sha256:08e2e8297195f9e813e894b6c63f79372582787795bba2014a2db6a2de95f713", 94 | "sha256:0dd2adeb2e9de5081eb8dcec88874e7fd35dae9a21557be3a55a3c7d491842a4", 95 | "sha256:0f94de7a10562b991967a66bbe6dda9808e18088676834c0a4dcec3fdd3bcc6f", 96 | "sha256:106e42e2e01e486a3d32eeb9ba0e3a7f65c12fa8998d63625fa41fb8bdc44cdb", 97 | "sha256:1606c66015f04719c41a9863c156fc0e6b992150de21c067444bcb82e7d75579", 98 | "sha256:1854c4beff9961e477e133143c5e5e355dac0b3ebf19c52cf7cc1b1ef757703c", 99 | "sha256:1e9fb6f1746500ea91a00193ce2361803c70c6b13f10aae9a33ad7b5bd28e800", 100 | "sha256:2cca17e80ddb151894333377675db90cd0279fa454776e0a4f74308376afd050", 101 | "sha256:30e365e8408759db3778c361f1e4e0fe8e98a875185ae46c795a85e9bafb9cdf", 102 | "sha256:3206bac900e16eda81687d787086f4ffd4f3854980d798e191a9868a6510c3ae", 103 | "sha256:3c23d72058647cee19b30452acc7895621e2de0a0bd5b8a1e34204b9ea9ed43c", 104 | "sha256:407b5f911a83daa285bbf1ef78a9909ee5957f257d3524b8606be37e8643c5f0", 105 | "sha256:4162953714a9212d373ac953c10e3329f1e830d3c7473f2a2e4f25dd6241eef0", 106 | "sha256:5fc7aba72a51b2c80605eba1c50dbf84224dcd206279d30a75c154e5652e1fe4", 107 | "sha256:713ac19307e11de4d9833af0c4bd6778bde0a3d967cafd2f0f347223711c1e31", 108 | "sha256:71b946d80ef3c3f12db157d7778b1fe74a517ca85e94809358b15580983c2ce2", 109 | "sha256:8cc4aed71e20d87e0a6f02094d718a95252f11f8ed143bc112d22167f08d4040", 110 | "sha256:9d41ca62daf36d6b6515ab8765e4c8c4388ee18e2a665701fef2b41563821002", 111 | 
"sha256:a744e13b000f234cd5a5b2a1f95816b819027c57f385da54ad2b7da1adace2f3", 112 | "sha256:b087ee01396c4b34e9dc41e3a6a0442158206d383c19c7d0396d52067b17c1cb", 113 | "sha256:b0f03af381d33306ce67d18275b61acb4ca111ced645381387a02c8a5ee1b796", 114 | "sha256:b9e4b8dfd587365bdd719ae178fa1b6c1231f81280b1375eef8626dfd8761bf3", 115 | "sha256:c5dd4ec75985b99166c045909e10f0534704d102848b1d9f0992720e908928e7", 116 | "sha256:d2b82f23cd862a9d05108fe99967e9edfa95c136f532a71cb3d28dc252771f50", 117 | "sha256:e58a25764472af07b7e1c4b10b0179c8ea726446c7141076286e41891bf3a563", 118 | "sha256:f3b49107fbfc77333fc2b1ef4d5de2abcd57e7ea3a1482455229494cf2da56ce" 119 | ], 120 | "version": "==2.9.0" 121 | }, 122 | "idna": { 123 | "hashes": [ 124 | "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 125 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 126 | ], 127 | "version": "==2.8" 128 | }, 129 | "keras-applications": { 130 | "hashes": [ 131 | "sha256:60607b2b98868983e5153bf1cc6aa468ba73adc93bc977a90edaa4bc595e69fa", 132 | "sha256:94b8acc84fb8b1e3d752e20ed4cafa8377c9ecf6e6c1aa09942d959dc02e439a" 133 | ], 134 | "version": "==1.0.7" 135 | }, 136 | "keras-preprocessing": { 137 | "hashes": [ 138 | "sha256:0170b799a7562f80ad7931d22d56de22cf4bdd502e11c48f31a46380137a70a8", 139 | "sha256:5e3700117981c2db762e512ed6586638124fac5842170701628088a11aeb51ac" 140 | ], 141 | "version": "==1.0.9" 142 | }, 143 | "markdown": { 144 | "hashes": [ 145 | "sha256:fc4a6f69a656b8d858d7503bda633f4dd63c2d70cf80abdc6eafa64c4ae8c250", 146 | "sha256:fe463ff51e679377e3624984c829022e2cfb3be5518726b06f608a07a3aad680" 147 | ], 148 | "version": "==3.1" 149 | }, 150 | "mock": { 151 | "hashes": [ 152 | "sha256:5ce3c71c5545b472da17b72268978914d0252980348636840bd34a00b5cc96c1", 153 | "sha256:b158b6df76edd239b8208d481dc46b6afd45a846b7812ff0ce58971cf5bc8bba" 154 | ], 155 | "version": "==2.0.0" 156 | }, 157 | "numpy": { 158 | "hashes": [ 159 | 
"sha256:1980f8d84548d74921685f68096911585fee393975f53797614b34d4f409b6da", 160 | "sha256:22752cd809272671b273bb86df0f505f505a12368a3a5fc0aa811c7ece4dfd5c", 161 | "sha256:23cc40313036cffd5d1873ef3ce2e949bdee0646c5d6f375bf7ee4f368db2511", 162 | "sha256:2b0b118ff547fecabc247a2668f48f48b3b1f7d63676ebc5be7352a5fd9e85a5", 163 | "sha256:3a0bd1edf64f6a911427b608a894111f9fcdb25284f724016f34a84c9a3a6ea9", 164 | "sha256:3f25f6c7b0d000017e5ac55977a3999b0b1a74491eacb3c1aa716f0e01f6dcd1", 165 | "sha256:4061c79ac2230594a7419151028e808239450e676c39e58302ad296232e3c2e8", 166 | "sha256:560ceaa24f971ab37dede7ba030fc5d8fa173305d94365f814d9523ffd5d5916", 167 | "sha256:62be044cd58da2a947b7e7b2252a10b42920df9520fc3d39f5c4c70d5460b8ba", 168 | "sha256:6c692e3879dde0b67a9dc78f9bfb6f61c666b4562fd8619632d7043fb5b691b0", 169 | "sha256:6f65e37b5a331df950ef6ff03bd4136b3c0bbcf44d4b8e99135d68a537711b5a", 170 | "sha256:7a78cc4ddb253a55971115f8320a7ce28fd23a065fc33166d601f51760eecfa9", 171 | "sha256:80a41edf64a3626e729a62df7dd278474fc1726836552b67a8c6396fd7e86760", 172 | "sha256:893f4d75255f25a7b8516feb5766c6b63c54780323b9bd4bc51cdd7efc943c73", 173 | "sha256:972ea92f9c1b54cc1c1a3d8508e326c0114aaf0f34996772a30f3f52b73b942f", 174 | "sha256:9f1d4865436f794accdabadc57a8395bd3faa755449b4f65b88b7df65ae05f89", 175 | "sha256:9f4cd7832b35e736b739be03b55875706c8c3e5fe334a06210f1a61e5c2c8ca5", 176 | "sha256:adab43bf657488300d3aeeb8030d7f024fcc86e3a9b8848741ea2ea903e56610", 177 | "sha256:bd2834d496ba9b1bdda3a6cf3de4dc0d4a0e7be306335940402ec95132ad063d", 178 | "sha256:d20c0360940f30003a23c0adae2fe50a0a04f3e48dc05c298493b51fd6280197", 179 | "sha256:d3b3ed87061d2314ff3659bb73896e622252da52558f2380f12c421fbdee3d89", 180 | "sha256:dc235bf29a406dfda5790d01b998a1c01d7d37f449128c0b1b7d1c89a84fae8b", 181 | "sha256:fb3c83554f39f48f3fa3123b9c24aecf681b1c289f9334f8215c1d3c8e2f6e5b" 182 | ], 183 | "version": "==1.16.2" 184 | }, 185 | "pandas": { 186 | "hashes": [ 187 | 
"sha256:071e42b89b57baa17031af8c6b6bbd2e9a5c68c595bc6bf9adabd7a9ed125d3b", 188 | "sha256:17450e25ae69e2e6b303817bdf26b2cd57f69595d8550a77c308be0cd0fd58fa", 189 | "sha256:17916d818592c9ec891cbef2e90f98cc85e0f1e89ed0924c9b5220dc3209c846", 190 | "sha256:2538f099ab0e9f9c9d09bbcd94b47fd889bad06dc7ae96b1ed583f1dc1a7a822", 191 | "sha256:366f30710172cb45a6b4f43b66c220653b1ea50303fbbd94e50571637ffb9167", 192 | "sha256:42e5ad741a0d09232efbc7fc648226ed93306551772fc8aecc6dce9f0e676794", 193 | "sha256:4e718e7f395ba5bfe8b6f6aaf2ff1c65a09bb77a36af6394621434e7cc813204", 194 | "sha256:4f919f409c433577a501e023943e582c57355d50a724c589e78bc1d551a535a2", 195 | "sha256:4fe0d7e6438212e839fc5010c78b822664f1a824c0d263fd858f44131d9166e2", 196 | "sha256:5149a6db3e74f23dc3f5a216c2c9ae2e12920aa2d4a5b77e44e5b804a5f93248", 197 | "sha256:627594338d6dd995cfc0bacd8e654cd9e1252d2a7c959449228df6740d737eb8", 198 | "sha256:83c702615052f2a0a7fb1dd289726e29ec87a27272d775cb77affe749cca28f8", 199 | "sha256:8c872f7fdf3018b7891e1e3e86c55b190e6c5cee70cab771e8f246c855001296", 200 | "sha256:90f116086063934afd51e61a802a943826d2aac572b2f7d55caaac51c13db5b5", 201 | "sha256:a3352bacac12e1fc646213b998bce586f965c9d431773d9e91db27c7c48a1f7d", 202 | "sha256:bcdd06007cca02d51350f96debe51331dec429ac8f93930a43eb8fb5639e3eb5", 203 | "sha256:c1bd07ebc15285535f61ddd8c0c75d0d6293e80e1ee6d9a8d73f3f36954342d0", 204 | "sha256:c9a4b7c55115eb278c19aa14b34fcf5920c8fe7797a09b7b053ddd6195ea89b3", 205 | "sha256:cc8fc0c7a8d5951dc738f1c1447f71c43734244453616f32b8aa0ef6013a5dfb", 206 | "sha256:d7b460bc316064540ce0c41c1438c416a40746fd8a4fb2999668bf18f3c4acf1" 207 | ], 208 | "index": "pypi", 209 | "version": "==0.24.2" 210 | }, 211 | "pbr": { 212 | "hashes": [ 213 | "sha256:8257baf496c8522437e8a6cfe0f15e00aedc6c0e0e7c9d55eeeeab31e0853843", 214 | "sha256:8c361cc353d988e4f5b998555c88098b9d5964c2e11acf7b0d21925a66bb5824" 215 | ], 216 | "version": "==5.1.3" 217 | }, 218 | "protobuf": { 219 | "hashes": [ 220 | 
"sha256:21e395d7959551e759d604940a115c51c6347d90a475c9baf471a1a86b5604a9", 221 | "sha256:57e05e16955aee9e6a0389fcbd58d8289dd2420e47df1a1096b3a232c26eb2dd", 222 | "sha256:67819e8e48a74c68d87f25cad9f40edfe2faf278cdba5ca73173211b9213b8c9", 223 | "sha256:75da7d43a2c8a13b0bc7238ab3c8ae217cbfd5979d33b01e98e1f78defb2d060", 224 | "sha256:78e08371e236f193ce947712c072542ff19d0043ab5318c2ea46bbc2aaebdca6", 225 | "sha256:7ee5b595db5abb0096e8c4755e69c20dfad38b2d0bcc9bc7bafc652d2496b471", 226 | "sha256:86260ecfe7a66c0e9d82d2c61f86a14aa974d340d159b829b26f35f710f615db", 227 | "sha256:92c77db4bd33ea4ee5f15152a835273f2338a5246b2cbb84bab5d0d7f6e9ba94", 228 | "sha256:9c7b90943e0e188394b4f068926a759e3b4f63738190d1ab3d500d53b9ce7614", 229 | "sha256:a77f217ea50b2542bae5b318f7acee50d9fc8c95dd6d3656eaeff646f7cab5ee", 230 | "sha256:ad589ed1d1f83db22df867b10e01fe445516a5a4d7cfa37fe3590a5f6cfc508b", 231 | "sha256:b06a794901bf573f4b2af87e6139e5cd36ac7c91ac85d7ae3fe5b5f6fc317513", 232 | "sha256:bd8592cc5f8b4371d0bad92543370d4658dc41a5ccaaf105597eb5524c616291", 233 | "sha256:be48e5a6248a928ec43adf2bea037073e5da692c0b3c10b34f9904793bd63138", 234 | "sha256:cc5eb13f5ccc4b1b642cc147c2cdd121a34278b341c7a4d79e91182fff425836", 235 | "sha256:cd3b0e0ad69b74ee55e7c321f52a98effed2b4f4cc9a10f3683d869de00590d5", 236 | "sha256:d6e88c4920660aa75c0c2c4b53407aef5efd9a6e0ca7d2fc84d79aba2ccbda3a", 237 | "sha256:ec3c49b6d247152e19110c3a53d9bb4cf917747882017f70796460728b02722e" 238 | ], 239 | "version": "==3.7.1" 240 | }, 241 | "python-dateutil": { 242 | "hashes": [ 243 | "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", 244 | "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e" 245 | ], 246 | "version": "==2.8.0" 247 | }, 248 | "pytz": { 249 | "hashes": [ 250 | "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda", 251 | "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141" 252 | ], 253 | "version": "==2019.1" 254 | }, 255 | 
"requests": { 256 | "hashes": [ 257 | "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", 258 | "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b" 259 | ], 260 | "index": "pypi", 261 | "version": "==2.21.0" 262 | }, 263 | "six": { 264 | "hashes": [ 265 | "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", 266 | "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" 267 | ], 268 | "version": "==1.12.0" 269 | }, 270 | "tensorboard": { 271 | "hashes": [ 272 | "sha256:53d8f40589c903dae65f39a799c2bc49defae3703754984d90613d26ebd714a4", 273 | "sha256:b664fe7772be5670d8b04200342e681af7795a12cd752709aed565c06c0cc196" 274 | ], 275 | "version": "==1.13.1" 276 | }, 277 | "tensorflow": { 278 | "hashes": [ 279 | "sha256:0de5887495c20e1130ae4d9bcfaf80cec87f579a9c27a84141a588a46e5aa853", 280 | "sha256:0f305f3c461ed2ce5e0b65fccc7b7452f483c7935dd8a52a466d622e642fdea8", 281 | "sha256:4325f20b5a703b80a5f7a8807f07ad8735025bd2a947093ffff1c26fbdc7980b", 282 | "sha256:4c86be0e476b64cedf4ffa059d71b764e75b895effb697345687e3057929a7b5", 283 | "sha256:6b0a0a413390302ce7c22c98695983d6fb8406861cfb418b25536f57a96c0b89", 284 | "sha256:77eec2351d0a9b5312ea01ee4c78c13996f249cf1bead2e68256a65e533f45ef", 285 | "sha256:87bf719a564f11d63e4f614e933e5a612dd4e67c88266b774236e0982f5fcf69", 286 | "sha256:ba29e66331cd2a8f824e0fa937ce44bd624bc37739f2f083694e473051d89ace", 287 | "sha256:bc374f5a662b6e164cd1c4da61ccc752ec208a44893d2f9dcf47d2a0a2cef311", 288 | "sha256:bcf86966b7554e407bb7d73341f2e108df62a910d40b4cd2a914867f2a5de51c", 289 | "sha256:c3abffd51c168cfd62a557243c47a29ab48deb52a64465e6818060f20755ddb4", 290 | "sha256:c41862c65628261229db22e33f9e570d845eeb5cea66dcbaebe404405edaa69b", 291 | "sha256:d7341617aedd73c2c847755e87697e9c19eb625c73da26d6cd669220c5565119", 292 | "sha256:de0425b58cb34006e4500565239b4c3a3055b95bff132f097fa46c87d8e463c9", 293 | 
"sha256:f21fb65c8e874f40c654bc9b3ff3db3ec26f98f03fe64a541bc768f6f5c52ac2" 294 | ], 295 | "index": "pypi", 296 | "version": "==1.13.1" 297 | }, 298 | "tensorflow-estimator": { 299 | "hashes": [ 300 | "sha256:7cfdaa3e83e3532f31713713feb98be7ea9f3065722be4267e49b6c301271419" 301 | ], 302 | "version": "==1.13.0" 303 | }, 304 | "termcolor": { 305 | "hashes": [ 306 | "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b" 307 | ], 308 | "version": "==1.1.0" 309 | }, 310 | "urllib3": { 311 | "hashes": [ 312 | "sha256:4c291ca23bbb55c76518905869ef34bdd5f0e46af7afe6861e8375643ffee1a0", 313 | "sha256:9a247273df709c4fedb38c711e44292304f73f39ab01beda9f6b9fc375669ac3" 314 | ], 315 | "version": "==1.24.2" 316 | }, 317 | "werkzeug": { 318 | "hashes": [ 319 | "sha256:0a73e8bb2ff2feecfc5d56e6f458f5b99290ef34f565ffb2665801ff7de6af7a", 320 | "sha256:7fad9770a8778f9576693f0cc29c7dcc36964df916b83734f4431c0e612a7fbc" 321 | ], 322 | "version": "==0.15.2" 323 | }, 324 | "wheel": { 325 | "hashes": [ 326 | "sha256:66a8fd76f28977bb664b098372daef2b27f60dc4d1688cfab7b37a09448f0e9d", 327 | "sha256:8eb4a788b3aec8abf5ff68d4165441bc57420c9f64ca5f471f58c3969fe08668" 328 | ], 329 | "markers": "python_version >= '3'", 330 | "version": "==0.33.1" 331 | } 332 | }, 333 | "develop": {} 334 | } 335 | -------------------------------------------------------------------------------- /MIMIC3py/load_to_pandas.py: -------------------------------------------------------------------------------- 1 | # This function returns the MIMIC-III data as a dictionary of pandas data sets 2 | 3 | 4 | import pandas as pd 5 | import os.path 6 | 7 | 8 | def load_mimic_to_pandas(CSV_Folder_Location='/data/', CSV_List=None, gunzip=False): 9 | """Returns a dictionary where the key is the file name and the value is a Pandas 10 | dataframe containing the data. 
11 | """ 12 | MIMIC_df = {} 13 | 14 | # Load the dictionary of dtypes from the function below 15 | dtypes = Mimic_dtypes() 16 | 17 | # if the files are still .gz archive files, set gunzip=True 18 | # if the files have already been decompressed to .csv format, gunzip should be false 19 | fileNameExtension = '.csv.gz' if gunzip else '.csv' 20 | compressionType = 'gzip' if gunzip else None 21 | 22 | # If the user doesn't tell us what tables to load, we load all the tables. 23 | # Note that some of the tables are very large, so you may need 50+ GB of memory to load them. 24 | if CSV_List is None: 25 | CSV_List = ['ADMISSIONS', # 12 MB 26 | 'CALLOUT', # 6.1 MB 27 | 'CAREGIVERS', # 199 KB 28 | 'CHARTEVENTS', # 33 GB ------BIG!!! 29 | 'CPTEVENTS', # 56 MB 30 | 'DATETIMEEVENTS', # 502 MB 31 | 'DIAGNOSES_ICD', # 19 MB 32 | 'DRGCODES', # 11 MB 33 | 'D_CPT', # 14 KB 34 | 'D_ICD_DIAGNOSES', # 1.4 KB 35 | 'D_ICD_PROCEDURES', # 305 KB 36 | 'D_ITEMS', # 933 KB 37 | 'D_LABITEMS', # 43 KB 38 | 'ICUSTAYS', # 6.1 MB 39 | 'INPUTEVENTS_CV', # 2.3 GB ------BIG!!! 40 | 'INPUTEVENTS_MV', # 931 MB 41 | 'LABEVENTS', # 1.8GB ------BIG!!! 42 | 'MICROBIOLOGYEVENTS', # 70 MB 43 | 'NOTEEVENTS', # 3.8 GB ------BIG!!! 44 | 'OUTPUTEVENTS', # 379 MB 45 | 'PATIENTS', # 2.6 MB 46 | 'PRESCRIPTIONS', # 735 MB 47 | 'PROCEDUREEVENTS_MV', # 47 MB 48 | 'PROCEDURES_ICD', # 6.5 MB 49 | 'SERVICES', # 3.4 MB 50 | 'TRANSFERS', # 24 MB 51 | ] 52 | 53 | for MyFile in CSV_List: 54 | try: 55 | MIMIC_df[MyFile] = pd.read_csv(os.path.join(CSV_Folder_Location, (MyFile + fileNameExtension)), 56 | dtype=dtypes[MyFile], 57 | parse_dates=True, 58 | sep=',', 59 | index_col='ROW_ID', 60 | compression=compressionType 61 | ) 62 | except: 63 | print('Unable to load the file: ', MyFile) 64 | 65 | return MIMIC_df 66 | 67 | 68 | def Mimic_dtypes(): 69 | """Returns a dictionary of dictionaries. 70 | Each dictionary is a list of column names 71 | and their pandas data type. 
72 | Note that any numeric field that can contain 73 | a null must be declared as a float (its a pandas bug). 74 | """ 75 | dtypes = {} 76 | dtypes['ADMISSIONS'] = {'ROW_ID': int, 77 | 'SUBJECT_ID': int, 78 | 'HADM_ID': float, 79 | 'ADMITTIME': str, 80 | 'DISCHTIME': str, 81 | 'DEATHTIME': str, 82 | 'ADMISSION_TYPE': str, 83 | 'ADMISSION_LOCATION': str, 84 | 'DISCHARGE_LOCATION': str, 85 | 'INSURANCE': str, 86 | 'LANGUAGE': str, 87 | 'RELIGION': str, 88 | 'MARITAL_STATUS': str, 89 | 'ETHNICITY': str, 90 | 'EDREGTIME': str, 91 | 'EDOUTTIME': str, 92 | 'DIAGNOSIS': str, 93 | 'HOSPITAL_EXPIRE_FLAG': float, 94 | 'HAS_CHARTEVENTS_DATA': float, 95 | } 96 | dtypes['CALLOUT'] = {'ROW_ID': int, 97 | 'SUBJECT_ID': float, 98 | 'HADM_ID': float, 99 | 'SUBMIT_WARDID': float, 100 | 'SUBMIT_CAREUNIT': str, 101 | 'CURR_WARDID': float, 102 | 'CURR_CAREUNIT': str, 103 | 'CALLOUT_WARDID': float, 104 | 'CALLOUT_SERVICE': str, 105 | 'REQUEST_TELE': float, 106 | 'REQUEST_RESP': float, 107 | 'REQUEST_CDIFF': float, 108 | 'REQUEST_MRSA': float, 109 | 'REQUEST_VRE': float, 110 | 'CALLOUT_STATUS': str, 111 | 'CALLOUT_OUTCOME': str, 112 | 'DISCHARGE_WARDID': float, 113 | 'ACKNOWLEDGE_STATUS': str, 114 | 'CREATETIME': str, 115 | 'UPDATETIME': str, 116 | 'ACKNOWLEDGETIME': str, 117 | 'OUTCOMETIME': str, 118 | 'FIRSTRESERVATIONTIME': str, 119 | 'CURRENTRESERVATIONTIME': str, 120 | } 121 | dtypes['CAREGIVERS'] = {'ROW_ID': int, 122 | 'CGID': float, 123 | 'LABEL': str, 124 | 'DESCRIPTION': str, 125 | } 126 | dtypes['CHARTEVENTS'] = {'ROW_ID': int, 127 | 'SUBJECT_ID': float, 128 | 'HADM_ID': float, 129 | 'ICUSTAY_ID': float, 130 | 'ITEMID': float, 131 | 'CHARTTIME': str, 132 | 'STORETIME': str, 133 | 'CGID': float, 134 | 'VALUE': str, 135 | 'VALUENUM': float, 136 | 'VALUEUOM': str, 137 | 'WARNING': float, 138 | 'ERROR': float, 139 | 'RESULTSTATUS': str, 140 | 'STOPPED': str, 141 | } 142 | dtypes['CPTEVENTS'] = {'ROW_ID': float, 143 | 'SUBJECT_ID': float, 144 | 'HADM_ID': float, 145 | 'COSTCENTER': 
str, 146 | 'CHARTDATE': str, 147 | 'CPT_CD': str, 148 | 'CPT_NUMBER': float, 149 | 'CPT_SUFFIX': str, 150 | 'TICKET_ID_SEQ': float, 151 | 'SECTIONHEADER': str, 152 | 'SUBSECTIONHEADER': str, 153 | 'DESCRIPTION': str, 154 | } 155 | dtypes['DATETIMEEVENTS'] = {'ROW_ID': int, 156 | 'SUBJECT_ID': float, 157 | 'HADM_ID': float, 158 | 'ICUSTAY_ID': float, 159 | 'ITEMID': float, 160 | 'CHARTTIME': str, 161 | 'STORETIME': str, 162 | 'CGID': float, 163 | 'VALUE': str, 164 | 'VALUEUOM': str, 165 | 'WARNING': float, 166 | 'ERROR': float, 167 | 'RESULTSTATUS': str, 168 | 'STOPPED': str, 169 | } 170 | dtypes['DIAGNOSES_ICD'] = {'ROW_ID': int, 171 | 'SUBJECT_ID': int, 172 | 'HADM_ID': float, 173 | 'SEQ_NUM': float, 174 | 'ICD9_CODE': str, 175 | } 176 | dtypes['DRGCODES'] = {'ROW_ID': int, 177 | 'SUBJECT_ID': float, 178 | 'HADM_ID': float, 179 | 'DRG_TYPE': str, 180 | 'DRG_CODE': str, 181 | 'DESCRIPTION': str, 182 | 'DRG_SEVERITY': float, 183 | 'DRG_MORTALITY': float, 184 | } 185 | dtypes['D_CPT'] = {'ROW_ID': int, 186 | 'CATEGORY': float, 187 | 'SECTIONRANGE': str, 188 | 'SECTIONHEADER': str, 189 | 'SUBSECTIONRANGE': str, 190 | 'SUBSECTIONHEADER': str, 191 | 'CODESUFFIX': str, 192 | 'MINCODEINSUBSECTION': float, 193 | 'MAXCODEINSUBSECTION': float, 194 | } 195 | dtypes['D_ICD_DIAGNOSES'] = {'ROW_ID': int, 196 | 'ICD9_CODE': str, 197 | 'SHORT_TITLE': str, 198 | 'LONG_TITLE': str, 199 | } 200 | dtypes['D_ICD_PROCEDURES'] = {'ROW_ID': int, 201 | 'ICD9_CODE': str, 202 | 'SHORT_TITLE': str, 203 | 'LONG_TITLE': str, 204 | } 205 | dtypes['D_ITEMS'] = {'ROW_ID': int, 206 | 'ITEMID': float, 207 | 'LABEL': str, 208 | 'ABBREVIATION': str, 209 | 'DBSOURCE': str, 210 | 'LINKSTO': str, 211 | 'CATEGORY': str, 212 | 'UNITNAME': str, 213 | 'PARAM_TYPE': str, 214 | 'CONCEPTID': float, 215 | } 216 | dtypes['D_LABITEMS'] = {'ROW_ID': int, 217 | 'ITEMID': float, 218 | 'LABEL': str, 219 | 'FLUID': str, 220 | 'CATEGORY': str, 221 | 'LOINC_CODE': str, 222 | } 223 | dtypes['ICUSTAYS'] = {'ROW_ID': int, 
224 | 'SUBJECT_ID': float, 225 | 'HADM_ID': float, 226 | 'ICUSTAY_ID': float, 227 | 'DBSOURCE': str, 228 | 'FIRST_CAREUNIT': str, 229 | 'LAST_CAREUNIT': str, 230 | 'FIRST_WARDID': float, 231 | 'LAST_WARDID': float, 232 | 'INTIME': str, 233 | 'OUTTIME': str, 234 | 'LOS': float, 235 | } 236 | dtypes['INPUTEVENTS_CV'] = {'ROW_ID': int, 237 | 'SUBJECT_ID': float, 238 | 'HADM_ID': float, 239 | 'ICUSTAY_ID': float, 240 | 'CHARTTIME': str, 241 | 'ITEMID': float, 242 | 'AMOUNT': float, 243 | 'AMOUNTUOM': str, 244 | 'RATE': float, 245 | 'RATEUOM': str, 246 | 'STORETIME': str, 247 | 'CGID': float, 248 | 'ORDERID': float, 249 | 'LINKORDERID': float, 250 | 'STOPPED': str, 251 | 'NEWBOTTLE': float, 252 | 'ORIGINALAMOUNT': float, 253 | 'ORIGINALAMOUNTUOM': str, 254 | 'ORIGINALROUTE': str, 255 | 'ORIGINALRATE': float, 256 | 'ORIGINALRATEUOM': str, 257 | 'ORIGINALSITE': str, 258 | } 259 | dtypes['INPUTEVENTS_MV'] = {'ROW_ID': int, 260 | 'SUBJECT_ID': float, 261 | 'HADM_ID': float, 262 | 'ICUSTAY_ID': float, 263 | 'STARTTIME': str, 264 | 'ENDTIME': str, 265 | 'ITEMID': float, 266 | 'AMOUNT': float, 267 | 'AMOUNTUOM': str, 268 | 'RATE': float, 269 | 'RATEUOM': str, 270 | 'STORETIME': str, 271 | 'CGID': float, 272 | 'ORDERID': float, 273 | 'LINKORDERID': float, 274 | 'ORDERCATEGORYNAME': str, 275 | 'SECONDARYORDERCATEGORYNAME': str, 276 | 'ORDERCOMPONENTTYPEDESCRIPTION': str, 277 | 'ORDERCATEGORYDESCRIPTION': str, 278 | 'PATIENTWEIGHT': float, 279 | 'TOTALAMOUNT': float, 280 | 'TOTALAMOUNTUOM': str, 281 | 'ISOPENBAG': float, 282 | 'CONTINUEINNEXTDEPT': float, 283 | 'CANCELREASON': float, 284 | 'STATUSDESCRIPTION': str, 285 | 'COMMENTS_STATUS': str, 286 | 'COMMENTS_TITLE': str, 287 | 'COMMENTS_DATE': str, 288 | 'ORIGINALAMOUNT': float, 289 | 'ORIGINALRATE': float, 290 | } 291 | dtypes['LABEVENTS'] = {'ROW_ID': int, 292 | 'SUBJECT_ID': float, 293 | 'HADM_ID': float, 294 | 'ITEMID': float, 295 | 'CHARTTIME': str, 296 | 'VALUE': str, 297 | 'VALUENUM': float, 298 | 'VALUEUOM': str, 299 | 
'FLAG': str, 300 | } 301 | dtypes['MICROBIOLOGYEVENTS'] = {'ROW_ID': int, 302 | 'SUBJECT_ID': float, 303 | 'HADM_ID': float, 304 | 'CHARTDATE': str, 305 | 'CHARTTIME': str, 306 | 'SPEC_ITEMID': float, 307 | 'SPEC_TYPE_CD': str, 308 | 'SPEC_TYPE_DESC': str, 309 | 'ORG_ITEMID': float, 310 | 'ORG_CD': float, 311 | 'ORG_NAME': str, 312 | 'ISOLATE_NUM': float, 313 | 'AB_ITEMID': float, 314 | 'AB_CD': float, 315 | 'AB_NAME': str, 316 | 'DILUTION_TEXT': str, 317 | 'DILUTION_COMPARISON': str, 318 | 'DILUTION_VALUE': float, 319 | 'INTERPRETATION': str, 320 | } 321 | dtypes['NOTEEVENTS'] = {'ROW_ID': int, 322 | 'SUBJECT_ID': float, 323 | 'HADM_ID': float, 324 | 'CHARTDATE': str, 325 | 'CATEGORY': str, 326 | 'DESCRIPTION': str, 327 | 'CGID': float, 328 | 'ISERROR': str, 329 | 'TEXT': str, 330 | } 331 | dtypes['OUTPUTEVENTS'] = {'ROW_ID': int, 332 | 'SUBJECT_ID': float, 333 | 'HADM_ID': float, 334 | 'ICUSTAY_ID': float, 335 | 'CHARTTIME': str, 336 | 'ITEMID': float, 337 | 'VALUE': float, 338 | 'VALUEUOM': str, 339 | 'STORETIME': str, 340 | 'CGID': float, 341 | 'STOPPED': str, 342 | 'NEWBOTTLE': float, 343 | 'ISERROR': float, 344 | } 345 | dtypes['PATIENTS'] = {'ROW_ID': int, 346 | 'SUBJECT_ID': int, 347 | 'GENDER': str, 348 | 'DOB': str, 349 | 'DOD': str, 350 | 'DOD_HOSP': str, 351 | 'DOD_SSN': str, 352 | 'EXPIRE_FLAG': str, 353 | } 354 | dtypes['PRESCRIPTIONS'] = {'ROW_ID': int, 355 | 'SUBJECT_ID': float, 356 | 'HADM_ID': float, 357 | 'ICUSTAY_ID': float, 358 | 'STARTTIME': str, 359 | 'ENDTIME': str, 360 | 'DRUG_TYPE': str, 361 | 'DRUG': str, 362 | 'DRUG_NAME_POE': str, 363 | 'DRUG_NAME_GENERIC': str, 364 | 'FORMULARY_DRUG_CD': str, 365 | 'GSN': str, 366 | 'NDC': str, 367 | 'PROD_STRENGTH': str, 368 | 'DOSE_VAL_RX': str, 369 | 'DOSE_UNIT_RX': str, 370 | 'FORM_VAL_DISP': str, 371 | 'FORM_UNIT_DISP': str, 372 | 'ROUTE': str, 373 | } 374 | dtypes['PROCEDUREEVENTS_MV'] = {'ROW_ID': int, 375 | 'SUBJECT_ID': float, 376 | 'HADM_ID': float, 377 | 'ICUSTAY_ID': float, 378 | 
'STARTTIME': str, 379 | 'ENDTIME': str, 380 | 'ITEMID': float, 381 | 'VALUE': float, 382 | 'VALUEUOM': str, 383 | 'LOCATION': str, 384 | 'LOCATIONCATEGORY': str, 385 | 'STORETIME': str, 386 | 'CGID': float, 387 | 'ORDERID': float, 388 | 'LINKORDERID': float, 389 | 'ORDERCATEGORYNAME': str, 390 | 'SECONDARYORDERCATEGORYNAME': str, 391 | 'ORDERCATEGORYDESCRIPTION': str, 392 | 'ISOPENBAG': float, 393 | 'CONTINUEINNEXTDEPT': float, 394 | 'CANCELREASON': float, 395 | 'STATUSDESCRIPTION': str, 396 | 'COMMENTS_EDITEDBY': str, 397 | 'COMMENTS_CANCELEDBY': str, 398 | 'COMMENTS_DATE': str, 399 | } 400 | dtypes['PROCEDURES_ICD'] = {'ROW_ID': int, 401 | 'SUBJECT_ID': float, 402 | 'HADM_ID': float, 403 | 'SEQ_NUM': float, 404 | 'ICD9_CODE': str, 405 | } 406 | dtypes['SERVICES'] = {'ROW_ID': int, 407 | 'SUBJECT_ID': float, 408 | 'HADM_ID': float, 409 | 'TRANSFERTIME': str, 410 | 'PREV_SERVICE': str, 411 | 'CURR_SERVICE': str, 412 | } 413 | dtypes['TRANSFERS'] = {'ROW_ID': int, 414 | 'SUBJECT_ID': int, 415 | 'HADM_ID': float, 416 | 'ICUSTAY_ID': float, 417 | 'DBSOURCE': str, 418 | 'EVENTTYPE': str, 419 | 'PREV_CAREUNIT': str, 420 | 'CURR_CAREUNIT': str, 421 | 'PREV_WARDID': str, 422 | 'CURR_WARDID': str, 423 | 'INTIME': str, 424 | 'OUTTIME': str, 425 | 'LOS': float, 426 | } 427 | return dtypes 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | # TODO: Clean up this part of the MIMIC_Pandas code 440 | if __name__ == '__main__': 441 | '''This is an example showing how you can load the MIMIC csv files 442 | into a dictionary of pandas dataframs''' 443 | 444 | # Set this variable to the folder where the .gz files are located 445 | # On a Windows machine, be sure to use \\ instead of just \ in the file path. 446 | Location_of_the_CSV_Files = "D:\\MIMIC-III Clinical Database\\Data\\" 447 | 448 | # Comment out any files you don't need for your analysis. 449 | # I was able to load all the files to memory on a VM that had 64GB of RAM. 
450 | # Loading all the files from their compressed state took over 30 minutes. 451 | List_of_files_you_want_to_load = [ 452 | ########## one # means it has been tested and works, ## means I haven't tested it yet 453 | 'ADMISSIONS', # 12 MB 454 | 'CALLOUT', # 6.1 MB 455 | 'CAREGIVERS', # 199 KB 456 | # 'CHARTEVENTS', # 33 GB ------BIG!!! 457 | 'CPTEVENTS', # 56 MB 458 | # 'DATETIMEEVENTS', # 502 MB 459 | 'DIAGNOSES_ICD', # 19 MB 460 | 'DRGCODES', # 11 MB 461 | 'D_CPT', # 14 KB 462 | 'D_ICD_DIAGNOSES', # 1.4 KB 463 | 'D_ICD_PROCEDURES', # 305 KB 464 | 'D_ITEMS', # 933 KB 465 | 'D_LABITEMS', # 43 KB 466 | 'ICUSTAYS', # 6.1 MB 467 | # 'INPUTEVENTS_CV', # 2.3 GB ------BIG!!! 468 | # 'INPUTEVENTS_MV', # 931 MB ------BIG!!! 469 | # 'LABEVENTS', # 1.8GB ------BIG!!! 470 | # 'MICROBIOLOGYEVENTS', # 70 MB 471 | # 'NOTEEVENTS', # 3.8 GB ------BIG!!! 472 | 'OUTPUTEVENTS', # 379 MB 473 | 'PATIENTS', # 2.6 MB 474 | # 'PRESCRIPTIONS', # 735 MB 475 | 'PROCEDUREEVENTS_MV', # 47 MB 476 | 'PROCEDURES_ICD', # 6.5 MB 477 | 'SERVICES', # 3.4 MB 478 | 'TRANSFERS', # 24 MB 479 | ] 480 | 481 | df = load_mimic_to_pandas(CSV_Folder_Location=Location_of_the_CSV_Files, CSV_List=List_of_files_you_want_to_load, gunzip=True) 482 | 483 | # The output is a dictionary where the file names above are the keys and the values are a 484 | # pandas dataframe containing the data from the file. 485 | # So you can view the first few lines of a file using code like this: 486 | print(df['ADMISSIONS'].head()) 487 | 488 | 489 | 490 | 491 | --------------------------------------------------------------------------------