├── output ├── csv │ └── .gitignore ├── json │ └── .gitignore └── pickles │ └── .gitignore ├── .gitattributes ├── .images ├── code.png ├── db_data.png ├── db_demos.png ├── db_columns.png └── county_rates.png ├── .dockerignore ├── .gitignore ├── .github └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── lib ├── snake_case.py ├── c19us_combined.py ├── update.py ├── dump_csv_and_json.py ├── c19us_nyt.py ├── c19us_jhu.py ├── upload_to_firestore.py ├── c19all.py ├── county_charts.py └── notebooks │ └── all.ipynb ├── Dockerfile ├── .devcontainer └── devcontainer.json ├── LICENSE.md ├── requirements.txt └── README.md /output/csv/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /output/json/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-detectable=false 2 | 3 | -------------------------------------------------------------------------------- /output/pickles/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /.images/code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/willhaslett/covid-19-growth/HEAD/.images/code.png -------------------------------------------------------------------------------- /.images/db_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/willhaslett/covid-19-growth/HEAD/.images/db_data.png -------------------------------------------------------------------------------- /.images/db_demos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/willhaslett/covid-19-growth/HEAD/.images/db_demos.png -------------------------------------------------------------------------------- /.images/db_columns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/willhaslett/covid-19-growth/HEAD/.images/db_columns.png -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .dockerignore 3 | .byebug_history 4 | log/* 5 | tmp/* 6 | .gitignore 7 | *.swp 8 | *.pkl -------------------------------------------------------------------------------- /.images/county_rates.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/willhaslett/covid-19-growth/HEAD/.images/county_rates.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | .vscode/ 3 | *.pyc 4 | *.swp 5 | lib/__pycache__/ 6 | .google_service_account_key.json 7 | output/charts/* 8 | 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | -------------------------------------------------------------------------------- /lib/snake_case.py: -------------------------------------------------------------------------------- 1 | def convert(string): 2 | n = len(string) 3 | string = list(string) 4 | for i in range(n): 5 | if (string[i] == ' '): 6 | string[i] = '_' 7 | else: 8 | string[i] = string[i].lower() 9 | string = "".join(string) 10 | return string 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-buster 2 | RUN apt update -qq \ 3 | && apt install -y --no-install-recommends \ 4 | python-autopep8 \ 5 | libopenblas-dev \ 6 | liblapack-dev \ 7 | texlive-full \ 8 | gfortran 9 | RUN pip install --upgrade pip 10 | RUN mkdir /app 11 | WORKDIR /app 12 | COPY . /app 13 | RUN pip install -r requirements.txt 14 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Local Dockerfile", 3 | "context": "..", 4 | "dockerFile": "../Dockerfile", 5 | "settings": { 6 | "terminal.integrated.shell.linux": null 7 | }, 8 | "extensions": [ 9 | "ms-python.python", 10 | "ryuta46.multi-command", 11 | "himanoa.python-autopep8", 12 | "randomfractalsinc.vscode-data-preview", 13 | "yzhang.markdown-all-in-one" 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /lib/c19us_combined.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | from c19us_jhu import df_us as df_jhu 5 | from c19us_nyt import df_us as df_nyt 6 | 7 | ''' Merged Johns Hopkins and New York Times US county-level data. ''' 8 | 9 | df_jhu = df_jhu[['cases', 'deaths', 'recovered', 'active']] 10 | df_us = df_nyt.join(df_jhu, lsuffix='_nyt', rsuffix='_jhu') 11 | 12 | pickle_file = open('output/pickles/df_us_combined.p', 'wb') 13 | pickle.dump(df_us, pickle_file) 14 | print('Updated pickle file df_us_combined.p with Johns Hopkins and New York Times data') 15 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. When I '...' 16 | 2. The '....' 17 | 3. Doesn't '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 28 | -------------------------------------------------------------------------------- /lib/update.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | import c19us_nyt 5 | 6 | # For options, see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html 7 | JSON_ORIENT = 'table' 8 | 9 | # For not-minified JSON, set to > 0 10 | JSON_INDENT = 0 11 | 12 | DATAFRAMES = { 13 | 'df_us_nyt': c19us_nyt.df_us.reset_index(), 14 | } 15 | 16 | for filename in DATAFRAMES: 17 | DATAFRAMES[filename].to_csv(f'output/csv/{filename}.csv', index=False) 18 | DATAFRAMES[filename].to_json(f'output/json/{filename}.json', orient=JSON_ORIENT, indent=JSON_INDENT) 19 | 20 | print('Updated CSV files with New York Times data') 21 | print('Updated JSON files with New York Times data') 22 | -------------------------------------------------------------------------------- /lib/dump_csv_and_json.py: -------------------------------------------------------------------------------- 1 | import c19all 2 | import c19us_jhu 3 | import c19us_nyt 4 | import c19us_combined 5 | import pandas as pd 6 | 7 | """ Generates CSV and JSON files for all available dataframes """ 8 | 9 | # For options, see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html 10 | JSON_ORIENT = 'table' 11 | 12 | # For not-minified JSON, set to > 0 13 | JSON_INDENT = 0 14 | 15 | DATAFRAMES = { 16 | 'df_all_cases': c19all.df_all['cases'], 17 | 'df_all_deaths': c19all.df_all['deaths'], 18 | 'df_us_jhu': c19us_jhu.df_us.reset_index(), 19 | 'df_us_nyt': c19us_nyt.df_us.reset_index(), 20 | 'df_us_combined': c19us_combined.df_us.reset_index(), 21 | } 22 | 23 | for filename in DATAFRAMES: 24 | DATAFRAMES[filename].to_csv(f'output/csv/{filename}.csv', index=False) 25 | DATAFRAMES[filename].to_json(f'output/json/{filename}.json', orient=JSON_ORIENT, indent=JSON_INDENT) 26 | 27 | print('Updated CSV files') 28 | print('Updated JSON files') 29 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Will Haslett 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /lib/c19us_nyt.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import urllib 3 | import math 4 | import pickle 5 | import constants 6 | 7 | ''' US county-level data from the New York Times files. ''' 8 | 9 | counties = pd.DataFrame(constants.COUNTIES) 10 | fips = constants.COUNTIES.keys() 11 | county_columns = constants.US_COUNTY_COLUMNS['nyt'] 12 | output_columns = constants.US_OUTPUT_COLUMNS['nyt'] 13 | start_date = constants.START_DATE['nyt'] 14 | 15 | df = pd.read_csv(constants.DATA_URLS['us']['nyt']) 16 | df = df.loc[df.fips.isin(fips)] 17 | df = df.astype({'fips': 'int32'}) 18 | df.date = pd.to_datetime(df.date) 19 | df['start_date'] = pd.to_datetime(start_date) 20 | df['day'] = (df.date - df.start_date).astype('timedelta64[D]') 21 | df.day = df.day.astype('int') 22 | for column in county_columns: 23 | df[column] = df.apply( 24 | lambda row: counties.loc[column, str(row['fips'])], axis=1) 25 | df_us = df[output_columns].set_index(['date', 'fips']) 26 | 27 | try: 28 | get_ipython 29 | except: 30 | pickle_file = open('output/pickles/df_us_nyt.p', 'wb') 31 | pickle.dump(df_us, pickle_file) 32 | print('Updated pickle file df_us_nyt.p with New York Times data') 33 | -------------------------------------------------------------------------------- /lib/c19us_jhu.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from urllib import error, request 3 | import math 4 | import pickle 5 | import constants 6 | 7 | ''' US county-level data from the Johns Hopkins files. ''' 8 | 9 | counties = pd.DataFrame(constants.COUNTIES) 10 | fips = constants.COUNTIES.keys() 11 | county_columns = constants.US_COUNTY_COLUMNS['jhu'] 12 | output_columns = constants.US_OUTPUT_COLUMNS['jhu'] 13 | start_date = constants.START_DATE['jhu'] 14 | 15 | DATE_RANGE = pd.date_range( 16 | start=pd.to_datetime(constants.START_DATE['jhu']), 17 | end=pd.to_datetime('today') 18 | ).tolist() 19 | 20 | def df_from_daily_report(date, url): 21 | df = pd.read_csv( url)[['FIPS', 'Confirmed', 'Deaths', 'Recovered', 'Active']] 22 | df = df.rename(columns=constants.JHU_RENAMED_COLUMNS['daily_reports']) 23 | df = df.loc[df.fips.isin(fips)] 24 | df = df.astype({'fips': 'int32', 'deaths': 'int32', 'recovered': 'int32', 'cases': 'int32'}) 25 | df['date'] = date 26 | df['day'] = (date - pd.to_datetime(start_date)).days 27 | for column in county_columns: 28 | df[column] = df.apply( 29 | lambda row: counties.loc[column, str(row['fips'])], axis=1) 30 | return df[output_columns] 31 | 32 | dfs = [] 33 | for date in DATE_RANGE: 34 | url = constants.DATA_URLS['us']['jhu'].replace( 35 | '##-##-####', date.strftime('%m-%d-%Y')) 36 | try: 37 | response = request.urlopen(url) 38 | except error.HTTPError: 39 | break 40 | else: 41 | dfs.append(df_from_daily_report(date, url)) 42 | 43 | df_us = pd.concat(dfs).set_index(['date', 'fips']) 44 | 45 | try: 46 | get_ipython 47 | except: 48 | pickle_file = open('output/pickles/df_us_jhu.p', 'wb') 49 | pickle.dump(df_us, pickle_file) 50 | print('Updated pickle file df_us_jhu.p with Johns Hopkins data') -------------------------------------------------------------------------------- /lib/upload_to_firestore.py: -------------------------------------------------------------------------------- 1 | import firebase_admin 2 | from firebase_admin import credentials 3 | from firebase_admin import firestore 4 | import pandas as pd 5 | from c19us_combined import df_us 6 | 7 | # Be aware of Firestore pricing: https://firebase.google.com/docs/firestore/pricing 8 | # TODO: 9 | # Async 10 | # Incremental daily uploads 11 | 12 | # Create the database client 13 | cred = credentials.Certificate('.google_service_account_key.json') 14 | firebase_admin.initialize_app(cred, { 15 | 'projectId': 'covidlocal', 16 | }) 17 | db = firestore.client() 18 | 19 | # Turn date and fips into columns 20 | df_us = df_us.reset_index() 21 | 22 | # Create a Firestore document that holds the column names, to accompany values documents 23 | df_us['date_string'] = df_us.date.apply(lambda date: date.strftime('%Y-%m-%d')) 24 | df_us = df_us.drop(columns='date') 25 | columns = {} 26 | for i in range(0, len(df_us.columns)): 27 | columns[str(i)] = df_us.columns[i] 28 | db.collection(u'us-columns').document(u'us-combined').set(columns) 29 | 30 | 31 | # Break up df_us into json strings for each date 32 | date_list = df_us.date_string.unique().tolist() 33 | json_by_date = {} 34 | for date in date_list: 35 | df = df_us[df_us['date_string'] == date] 36 | date_json = df.to_json(orient='values') 37 | json_by_date[date] = date_json 38 | 39 | # Upload JSON string for each date as a document 40 | for date_string, json_string in json_by_date.items(): 41 | db.collection(u'us-data').document(date_string).set({'json': json_string}) 42 | 43 | 44 | def delete_collection(coll_ref, batch_size=100): 45 | docs = coll_ref.limit(batch_size).stream() 46 | deleted = 0 47 | for doc in docs: 48 | print(u'Deleting doc {}'.format(doc.id)) 49 | doc.reference.delete() 50 | deleted = deleted + 1 51 | if deleted >= batch_size: 52 | return delete_collection(coll_ref, batch_size) 53 | 54 | # Delete a collection. This does not work if there are sub-collections present 55 | # https://firebase.google.com/docs/firestore/solutions/delete-collections 56 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | asteval==0.9.18 3 | astroid==2.3.3 4 | attrs==19.3.0 5 | autopep8==1.5.4 6 | backcall==0.1.0 7 | bleach==3.3.0 8 | CacheControl==0.12.6 9 | cachetools==4.0.0 10 | certifi==2019.11.28 11 | chardet==3.0.4 12 | cycler==0.10.0 13 | data==0.4 14 | decorator==4.4.2 15 | defusedxml==0.6.0 16 | entrypoints==0.3 17 | firebase-admin==4.0.0 18 | funcsigs==1.0.2 19 | future==0.18.2 20 | google-api-core==1.16.0 21 | google-api-python-client==1.8.0 22 | google-auth==1.11.3 23 | google-auth-httplib2==0.0.3 24 | google-cloud-core==1.3.0 25 | google-cloud-firestore==1.6.2 26 | google-cloud-storage==1.26.0 27 | google-resumable-media==0.5.0 28 | googleapis-common-protos==1.51.0 29 | grpcio==1.27.2 30 | httplib2==0.18.0 31 | idna==2.9 32 | importlib-metadata==1.5.0 33 | ipykernel==5.1.4 34 | ipython==7.13.0 35 | ipython-genutils==0.2.0 36 | ipywidgets==7.5.1 37 | isort==4.3.21 38 | jedi==0.16.0 39 | Jinja2==2.11.1 40 | json5==0.9.5 41 | jsonschema==3.2.0 42 | jupyter==1.0.0 43 | jupyter-client==6.0.0 44 | jupyter-console==6.1.0 45 | jupyter-core==4.6.3 46 | jupyterlab==2.2.9 47 | jupyterlab-server==1.2.0 48 | kiwisolver==1.1.0 49 | latex==0.7.0 50 | lazy-object-proxy==1.4.3 51 | lmfit==1.0.0 52 | MarkupSafe==1.1.1 53 | matplotlib==3.2.0 54 | mccabe==0.6.1 55 | mistune==0.8.4 56 | msgpack==1.0.0 57 | nbconvert==5.6.1 58 | nbformat==5.0.4 59 | notebook==6.1.5 60 | numpy==1.19.2 61 | pandas==1.1.3 62 | pandocfilters==1.4.2 63 | parso==0.6.2 64 | pexpect==4.8.0 65 | pickleshare==0.7.5 66 | prometheus-client==0.7.1 67 | prompt-toolkit==3.0.4 68 | protobuf==3.11.3 69 | ptyprocess==0.6.0 70 | pyasn1==0.4.8 71 | pyasn1-modules==0.2.8 72 | pycodestyle==2.6.0 73 | Pygments==2.6.1 74 | pylint==2.4.4 75 | pyparsing==2.4.6 76 | pyrsistent==0.15.7 77 | python-dateutil==2.8.1 78 | pytz==2019.3 79 | pyzmq==19.0.0 80 | qtconsole==4.7.1 81 | QtPy==1.9.0 82 | requests==2.23.0 83 | rsa==4.0 84 | scipy==1.5.2 85 | Send2Trash==1.5.0 86 | shutilwhich==1.1.0 87 | six==1.14.0 88 | tempdir==0.7.1 89 | terminado==0.8.3 90 | testpath==0.4.4 91 | toml==0.10.1 92 | tornado==6.0.4 93 | traitlets==4.3.3 94 | typed-ast==1.4.1 95 | uncertainties==3.1.2 96 | uritemplate==3.0.1 97 | urllib3==1.25.8 98 | wcwidth==0.1.8 99 | webencodings==0.5.1 100 | widgetsnbextension==3.5.1 101 | wrapt==1.11.2 102 | zipp==3.1.0 103 | -------------------------------------------------------------------------------- /lib/c19all.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from operator import itemgetter 3 | import pickle 4 | import constants 5 | 6 | 7 | """ Exposes df_all, a dictionary with dataframes holding all global time series data 8 | df_all = { 9 | 'cases': , 10 | 'deaths': 11 | } 12 | 13 | Dataframe functions 14 | `filter(df, column, vlaue)` Generic filter 15 | `for_country(df, country)` Filter by country 16 | `for_province_state(df, province_state)` Filter by province_state 17 | `sum_by_date(df)` Group by date and sum case counts 18 | `date_to_day(date)` Convert a date to the number of days since the date of the first records 19 | `day_to_date(day)` Convert a number of days since the first records to a date 20 | """ 21 | 22 | renamed_columns = constants.JHU_RENAMED_COLUMNS['time_series'] 23 | 24 | # Perform ETL on a Johns Hopkins COVID-19 time series file, Returning a dataframe 25 | def df_from_csv(file_name): 26 | df = pd.read_csv(file_name) 27 | df = df.rename(columns=renamed_columns) 28 | date_cols = df.filter(regex=('^\d+/\d+/\d+$')).columns.array 29 | df = pd.melt(df, id_vars=['province_state', 'country', 'lat', 30 | 'long'], value_vars=date_cols, var_name='date', value_name='cases') 31 | df.date = pd.to_datetime(df.date, format='%m/%d/%y') 32 | df['day'] = (df.date - pd.to_datetime(df.date.iloc[0])).astype('timedelta64[D]') 33 | df.day = df.day.apply(lambda day: int(round(day))) 34 | return df[['date', 'day', 'cases', 'province_state', 'country', 'lat', 'long']] 35 | 36 | # General purpose filter 37 | def filter(df, column, value): 38 | return df[df[column] == value].reset_index() 39 | 40 | # Filter on country 41 | def for_country(df, country): 42 | return filter(df, 'country', country) 43 | 44 | # Filter on province_state. us.py has its own function for this 45 | def for_province_state(df, province_state): 46 | return filter(df, 'province_state', province_state) 47 | 48 | # Return input with all rows collapsed by date and cases summed 49 | def sum_by_date(df): 50 | return df.groupby('date').sum().reset_index() 51 | 52 | # Convert a date to the number of days since the date of the first records 53 | def date_to_day(date): 54 | return (date - pd.to_datetime('2020-01-21')).days 55 | 56 | # Convert a number of days since the first records to a date 57 | def day_to_date(day): 58 | pd.to_datetime('2020-03-21') + pd.DateOffset(days=day) 59 | 60 | _df_cases = df_from_csv(constants.DATA_URLS['global']['cases']) 61 | _df_deaths = df_from_csv(constants.DATA_URLS['global']['deaths']).rename(columns={'cases': 'deaths'}) 62 | 63 | # Dictionary containing dataframes for all global data 64 | df_all = { 65 | 'cases': _df_cases, 66 | 'deaths': _df_deaths 67 | } 68 | 69 | try: 70 | get_ipython 71 | except: 72 | pickle_file = open('output/pickles/df_all.p', 'wb') 73 | pickle.dump(df_all, pickle_file) 74 | print('Updated pickle file df_all.p with global data') 75 | -------------------------------------------------------------------------------- /lib/county_charts.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib as mpl 4 | import matplotlib.pyplot as plt 5 | import matplotlib.dates as dt 6 | import matplotlib.ticker as ticker 7 | import datetime 8 | import pickle 9 | import copy 10 | import snake_case 11 | 12 | YAXPARAMS = { 13 | 'cases': { 14 | 'total': { 15 | 'ymax': 90, 16 | 'yinterval':10 17 | }, 18 | 'adj': { 19 | 'ymax': 100, 20 | 'yinterval': 10 21 | } 22 | }, 23 | 'deaths': { 24 | 'total': { 25 | 'ymax': 5, 26 | 'yinterval': 1 27 | }, 28 | 'adj': { 29 | 'ymax': 5, 30 | 'yinterval': 1 31 | } 32 | }, 33 | } 34 | 35 | YINTERVAL_TOTAL = 5 36 | YINTERVAL_ADJ = 10 37 | 38 | SOURCE_LABELS = { 39 | 'nyt': 'New York Times', 40 | 'jhu': 'Johns Hopkins University' 41 | } 42 | 43 | STATE_COLORS = { 44 | 'Vermont': '#1f77b4', 45 | 'New Hampshire': '#871f78', 46 | } 47 | 48 | df_start = pickle.load( 49 | open('output/pickles/df_us_nyt.p', 'rb')).reset_index() 50 | 51 | # If you pass in a population, the output will be per 1,000 people 52 | # If you pass in an output filename, the plots will be written to ./images and not rendered to the screen 53 | def county_plot(county, state, metrics=['cases', 'deaths'], source='nyt', total_population=None): 54 | df = copy.deepcopy(df_start) 55 | start_date = pd.to_datetime('2020-03-01') 56 | location = { 57 | 'type': 'county', 58 | 'value': [county, state] 59 | } 60 | for metric in metrics: 61 | for population in [False, total_population]: 62 | count_of = f'{metric}' 63 | county = location['value'][0] 64 | state = location['value'][1] 65 | color = STATE_COLORS[state] 66 | df = df[df.county == county] 67 | df = df[df.state == state] 68 | df = df[df.date >= start_date] 69 | if population: 70 | df[count_of] = df[count_of].apply(lambda x: (x / population) * 100000) 71 | df['count_of_diff'] = df[count_of].diff() 72 | df['count_of_diff_7_day_mean'] = df.count_of_diff.rolling(7).mean() 73 | df = df.iloc[1:] 74 | fig = plt.figure(figsize=(7, 3)) 75 | ax = fig.add_subplot(111) 76 | ax.bar('date', 'count_of_diff', data=df, color=color, alpha=0.35) 77 | ax.plot('date', 'count_of_diff_7_day_mean', color=color, data=df) 78 | ax.xaxis.set_major_locator(dt.MonthLocator()) 79 | ax.xaxis.set_major_formatter(dt.DateFormatter('%b')) 80 | ax.set_ylim(ymin=0) 81 | yaxparams = YAXPARAMS[metric]['adj' if population else 'total'] 82 | ymax = yaxparams['ymax'] 83 | yinterval = yaxparams['yinterval'] 84 | # ax.set_ylim(ymax=yaxparams['ymax']) 85 | ax.yaxis.set_ticks(np.arange(0, ymax + yinterval, yinterval)) 86 | ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%0.0f')) 87 | ax.tick_params(axis='y', colors=color) 88 | ax.tick_params(axis='x', colors=color) 89 | ax.spines['top'].set_visible(False) 90 | ax.spines['right'].set_visible(False) 91 | ax.spines['left'].set_visible(False) 92 | ax.grid(axis='x') 93 | plt.style.use('seaborn-whitegrid') 94 | plt.text(df.date.iloc[-1] + datetime.timedelta(days=3), df.count_of_diff_7_day_mean.iloc[-1], 95 | "7-day\navg.", color=color, style='italic') 96 | filename = snake_case.convert(f'{county} {state} {metric}{" adjusted" if population else ""}.svg') 97 | plt.savefig(f'output/charts/{filename}') 98 | 99 | 100 | county_dicts = [ 101 | {'county': 'Orange', 'state': 'Vermont', 'total_population': 28892}, 102 | {'county': 'Orange', 'state': 'Vermont', 'total_population': 28892}, 103 | {'county': 'Windsor', 'state': 'Vermont', 'total_population': 55062}, 104 | {'county': 'Grafton', 'state': 'New Hampshire', 'total_population': 89886}, 105 | {'county': 'Sullivan', 'state': 'New Hampshire', 'total_population': 43146}, 106 | ] 107 | 108 | for county_dict in county_dicts: 109 | county_plot(**county_dict) 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # covid-19-growth 2 | 3 | The [New York Times](https://github.com/nytimes/covid-19-data) 4 | and the [Johns Hopkins University Center for Systems Science and Engineering](https://github.com/CSSEGISandData/COVID-19) 5 | are providing daily Covid-19 case and death data, and they are performing ad hoc revisions to existing data. This pipeline: 6 | - Imports the NYT and JHU data 7 | - Performs type conversions where needed 8 | - Applies consistent snake_case naming for all attributes, across sources, preserving fidelity to original meanings 9 | - For the US, facilitates aggregation at the region, sub-region, state, and county levels, and adds county-level population and lat/long 10 | - Outputs the resulting data structures as a set of long-format time series 11 | - Includes Jupyter Notebook stubs for working with the transformed data 12 | - Facilitiates uploading the tranformed data to Firebase for mobile/cloud use cases 13 | 14 | 15 | The latest NYT and JHU files are pulled from GitHub at runtime. All data operations are vectorized. All data from 2020-01-21 to the present are imported whenever you run `lib/update.py` to generate new CSV, JSON, or Pickle files. This ensures that all revisions to the JHU or NYT raw files will be included here. 16 | 17 | 18 | - [Installing](#installing) 19 | - [Virtualenv](#virtualenv) 20 | - [VSCode](#vscode) 21 | - [Usage](#usage) 22 | - [What do I get?](#what-do-i-get) 23 | - [CSV, JSON, and Pickle files for the transformed data](#csv-and-json) 24 | - [Global Data](#global-data) 25 | - [US Data](#us-data) 26 | - [Jupyter Notebooks](#jupyter-notebooks) 27 | - [Firebase](#firebase) 28 | - [License](#license) 29 | - [Acknowledgments](#acknowledgments) 30 | 31 | 32 | ## Installing 33 | ### Virtualenv 34 | Copy/paste 35 | ``` 36 | git clone https://github.com/willhaslett/covid-19-growth.git 37 | cd covid-19-growth 38 | virtualenv venv 39 | source venv/bin/activate 40 | pip install -q -r requirements.txt 41 | 42 | ``` 43 | Verify installation 44 | ``` 45 | $ python lib/update.py 46 | Updated pickle file df_all.p with global data 47 | Updated pickle file df_us_jhu.p with Johns Hopkins data 48 | Updated pickle file df_us_nyt.p with New York Times data 49 | Updated pickle file df_us_combined.p with Johns Hopkins and New York Times data 50 | Updated CSV files 51 | Updated JSON files 52 | Output Pickle, CSV and JSON files are up-to-date. For further work in Python, import the Pickles! 53 | $ 54 | ``` 55 | 56 | ### VSCode/Docker 57 | 58 | 59 | If you'd like to use Docker, or if you have Python environment reasons to use Docker, VSCode makes it easy to get up and running. 60 | 61 | 62 | - Have the [VSCode extension for Remote Development](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack) installed. Here, 'remote' means in a local Docker container (Debian). 63 | - In VSCode, [Open the project folder in a container](https://code.visualstudio.com/docs/remote/containers#_quick-start-open-an-existing-folder-in-a-container) 64 | - Verify the installation as above. 65 | 66 | ![](.images/code.png) 67 | 68 | 69 | ## Usage 70 | 71 | ### What do I get? 72 | Two sets of output data are constructed at runtime, one for all global data and one for all US data. 73 | The US data are parsed and demographic data are added. 74 | The NYT and JHU US data are available separately and as a combined time series. 75 | 76 | The three output formats, Pandas, CSV and JSON, all contain the same data, with the dataframes and CSV files 77 | having the same tabular format, and the JSON files structured by the 78 | [pandas.DataFrame.to_json](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html) function. 79 | 80 | - Global data (JHU) 81 | - df_all_cases 82 | - df_all_deaths 83 | - US data 84 | - df_us_jhu 85 | - df_us_nyt 86 | - df_us_combined 87 | 88 | ### Global Data 89 | `c19all.py` 90 | * **`df_all`** A dictionary containing dataframes with all global data for cases, deaths, and recoveries. `province_state` has mixed types, as it does upstream. 91 | ``` 92 | print(df_all['cases']) 93 | 94 | date day cases province_state country lat long 95 | 0 2020-01-22 0 0 NaN Afghanistan 33.000000 65.000000 96 | 1 2020-01-22 0 0 NaN Albania 41.153300 20.168300 97 | 2 2020-01-22 0 0 NaN Algeria 28.033900 1.659600 98 | 3 2020-01-22 0 0 NaN Andorra 42.506300 1.521800 99 | 4 2020-01-22 0 0 NaN Angola -11.202700 17.873900 100 | ... ... ... ... ... ... ... ... 101 | 16429 2020-03-27 65 2 NaN Saint Kitts and Nevis 17.357822 -62.782998 102 | 16430 2020-03-27 65 1 Northwest Territories Canada 64.825500 -124.845700 103 | 16431 2020-03-27 65 3 Yukon Canada 64.282300 -135.000000 104 | 16432 2020-03-27 65 86 NaN Kosovo 42.602636 20.902977 105 | 16433 2020-03-27 65 8 NaN Burma 21.916200 95.956000 106 | 107 | [16434 rows x 7 columns] 108 | ``` 109 | 110 | * **Functions** 111 | - `filter(df, column, vlaue)` Generic filter 112 | - `for_country(df, country)` Filter by country 113 | - `for_province_state(df, province_state)` Filter by province_state 114 | - `sum_by_date(df)` Group by date and sum case counts 115 | 116 | ### US Data 117 | The three output US data structures all have the same basic shape. 118 | Note however that whereas the NYT time series starts on 2020-01-21, the JHU time series 119 | starts on 2020-03-22, the date when JHU changed the format of their US data. 120 | `date` and `fips` are used as a multindex in Pandas, and these are added as columns 121 | in the CSV and JSON files. 122 | 123 | `c19us_jhu.df_us` and `c19us_nyt.df_us` are combined in `c19us_combined` as shown below. 124 | Here, the suffixes `_nyt`and `_jhu` are added to the case and death data. 125 | ``` 126 | >>> from c19us_combined import df_us 127 | >>> print(df_us) 128 | day county state sub_region region lat long cases_nyt deaths_nyt cases_jhu deaths_jhu recovered active 129 | date fips 130 | 2020-01-21 53061 0 Snohomish Washington pacific west 48.04615983 -121.7170703 1 0 NaN NaN NaN NaN 131 | 2020-01-22 53061 1 Snohomish Washington pacific west 48.04615983 -121.7170703 1 0 NaN NaN NaN NaN 132 | 2020-01-23 53061 2 Snohomish Washington pacific west 48.04615983 -121.7170703 1 0 NaN NaN NaN NaN 133 | 2020-01-24 17031 3 Cook Illinois east_north_central midwest 41.84144849 -87.81658794 1 0 NaN NaN NaN NaN 134 | 53061 3 Snohomish Washington pacific west 48.04615983 -121.7170703 1 0 NaN NaN NaN NaN 135 | ... . ... ... ... ... ... ... ... ... ... ... ... ... 136 | 2020-03-26 56025 65 Natrona Wyoming mountain west 42.96180148 -106.797885 6 0 6.0 0.0 0.0 0.0 137 | 56029 65 Park Wyoming mountain west 44.52157546 -109.5852825 1 0 1.0 0.0 0.0 0.0 138 | 56033 65 Sheridan Wyoming mountain west 44.79048913 -106.8862389 4 0 4.0 0.0 0.0 0.0 139 | 56037 65 Sweetwater Wyoming mountain west 41.65943896 -108.8827882 1 0 1.0 0.0 0.0 0.0 140 | 56039 65 Teton Wyoming mountain west 43.93522482 -110.5890801 8 0 7.0 0.0 0.0 0.0 141 | 142 | [13832 rows x 13 columns] 143 | >>> 144 | ``` 145 | 146 | ## CSV and JSON 147 | `dump_csv_and_json.py` gets executed when you run `lib/update.py`. 148 | It creates CSV and JSON files for the five Pandas dataframes and puts them in `output/csv` and `output/json`. 149 | 150 | - **CSV:** 151 | Comma-delimited files for each dataframe. The formats mirror the dataframes as described above. 152 | 153 | - **JSON:** 154 | JavaScript Object Notation files for each dataframe. Files are constructed using the `orient='table'` argument for 155 | [pandas.DataFrame.to_json](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html). 156 | Choose a different structure for the JSON files by setting `JSON_ORIENT` in `lib/dump_csv_and_json.py`. JSON files are minified by default. For not-minified JSON, set `JSON_INDENT` to > 0. 157 | 158 | ## Jupyter Notebooks 159 | 160 | `all.ipynb` and `us.ipynb` contain example starting points for work with global or US data. 161 | 162 | ## Firebase 163 | ### Prerequisites 164 | - [Create your Firebase project](https://firebase.google.com/) and add a Firestore database. 165 | - Create and download a private key JSON file for your project. (Project settings > Service accounts) 166 | - Rename the downladed file to `.google_service_account_key.json` and put it in the project root. This file will be ignored by Git. 167 | 168 | ### Usage 169 | `python lib/upload_to_firestore.py` 170 | 171 | This script uploads the combined US data structure to Firestore using the following scheme: 172 | 1. A schema document that defines the column names for the associated data documents 173 | ![](.images/db_columns.png) 174 | 1. A collection of data documents, split by date, with all data for a date stored in a single JSON string 175 | ![](.images/db_data.png) 176 | 3. A document containing the available additional data for all counties in the dataset, keyed by fips code 177 | ![](.images/db_demos.png) 178 | 179 | ## License 180 | 181 | This project is licensed under the MIT License. See the [LICENSE.md](LICENSE.md) file for details 182 | 183 | ## Acknowledgments 184 | 185 | The New York Times and the Johns Hopkins University Center for Systems Science and Engineering are doing a great public service by sharing these data. 186 | 187 | -------------------------------------------------------------------------------- /lib/notebooks/all.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": "" 11 | }, 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "output_type": "execute_result" 15 | }, 16 | { 17 | "data": { 18 | "image/png": "\n", 19 | "image/svg+xml": "\n\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", 20 | "text/plain": "
" 21 | }, 22 | "metadata": { 23 | "transient": {} 24 | }, 25 | "output_type": "display_data" 26 | } 27 | ], 28 | "source": [ 29 | "import pandas as pd\n", 30 | "import matplotlib as mpl\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "import pickle\n", 33 | "from lmfit import models\n", 34 | "import sys\n", 35 | "import os\n", 36 | "sys.path.insert(0, os.path.abspath('../'))\n", 37 | "import c19all\n", 38 | "\n", 39 | "df = pickle.load(open( '../../output/pickles/df_all.p', 'rb'))\n", 40 | "\n", 41 | "''' Define the analytic dataset\n", 42 | " count_of: required, `cases` or `deaths`\n", 43 | " start_date: required, >= 2020-01-02\n", 44 | " country: optional, comment out the declaration for global data\n", 45 | "'''\n", 46 | "count_of = 'cases'\n", 47 | "start_date = pd.to_datetime('2020-01-21')\n", 48 | "country = 'Japan'\n", 49 | "\n", 50 | "# See https://lmfit.github.io/lmfit-py/builtin_models.html for model options\n", 51 | "df = df[count_of]\n", 52 | "if 'country' in locals():\n", 53 | " df = c19all.for_country(df, country)\n", 54 | " ylabel = f'{count_of.capitalize()} in {country}' if country else f'Global {count_of}'\n", 55 | "else:\n", 56 | " ylabel = f'Global {count_of}'\n", 57 | "df = df[df.date >= start_date]\n", 58 | "df.day = df.day.apply(lambda day: (day - (c19all.date_to_day(start_date ) - 1)))\n", 59 | "df = df.groupby('day').sum().reset_index()\n", 60 | "model = models.PowerLawModel()\n", 61 | "params = model.make_params()\n", 62 | "result = model.fit(df[count_of], params, x=df.day.to_list())\n", 63 | "plt.style.use('ggplot')\n", 64 | "xlabel = f'Days since {start_date.strftime(\"%Y-%m-%d\")}'\n", 65 | "result.plot_fit(xlabel=xlabel, ylabel=ylabel, datafmt='og', fitfmt='r')" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "file_extension": ".py", 71 | "kernelspec": { 72 | "display_name": "Python 3.8.2 64-bit", 73 | "name": "python38264bit638d3b9becec457392c33150d78edef7" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 2 79 | }, 80 | "name": "python", 81 | "version": "3.8.2-final" 82 | }, 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "npconvert_exporter": "python", 86 | "orig_nbformat": 2, 87 | "pygments_lexer": "ipython2", 88 | "version": 2 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 2 92 | } --------------------------------------------------------------------------------