├── .gitignore ├── .projectile ├── README.md ├── a ├── estimate_hosp_capacity.do ├── impute_additional_fields.do └── predict_age_cfr.do ├── assets ├── covid_build_public.nomnoml ├── covid_build_public.svg ├── download_button.png ├── hospital-beds.html ├── hospital-beds.js ├── hospital-beds.png ├── market-volumes-livestock.html ├── market-volumes-livestock.js ├── market-volumes.html ├── market-volumes.js ├── mortality-pred.html ├── mortality-pred.js └── mortality-pred.png ├── b ├── agmark_scraper.py ├── clean_agmark.do ├── clean_ap_mort.do ├── clean_assam_mort.do ├── clean_bbmp_mort.do ├── clean_bihar_mort.do ├── clean_chennai_mort.do ├── clean_ghmc_mort.do ├── clean_haryana_mort.do ├── clean_hp_mort.do ├── clean_kmdc_mort.do ├── clean_mh_mort.do ├── clean_migration.do ├── clean_mp_mort.do ├── clean_odisha_mort.do ├── clean_rajasthan_mort.do ├── clean_state_mort.do ├── clean_up_mort.do ├── clean_wb_mort.do ├── copy_keys.do ├── create_dlhs4_pc11_district_key.do ├── create_google.do ├── ddl_nfhs_poll_hmis.do ├── gen_age_distribution.do ├── gen_lgd_pc11_demographics.do ├── gen_nss_district_key.do ├── gen_urbanization_subdist.do ├── get_case_data.do ├── get_vaccination_data.do ├── make_mortality.do ├── old │ ├── aggregate_case_data.do │ ├── gen_demographics.do │ ├── gen_pc11_states.do │ ├── get_case_data.do │ ├── get_lgd_keys.do │ ├── mse_simple.m │ └── predict_cts_uk_age_or_v2.m ├── pc11_lgd_metadata.csv ├── prep_bihar.do ├── prep_dlhs4_district.do ├── prep_ec_hosp.do ├── prep_ec_hosp_microdata.do ├── prep_hosp_pca_vd.do ├── prep_nss75.do ├── prep_pc_hosp.do ├── prep_secc.do ├── push_data.sh ├── retrieve_case_data.py ├── str │ ├── cov19india_district_fixes.txt │ ├── cov19india_vaccine_district_fixes.txt │ ├── covid_district_fixes.txt │ └── lgd_district_fixes.txt ├── update_case_cronjob.sh ├── update_case_vaccination_data.do └── vaccination_plot.py ├── build.md ├── como ├── a │ ├── analyze_mort_counts.do │ ├── app_age_hr_interpolation.do │ ├── app_joint_condition.do │ ├── app_table_age_bin_prev.do │ ├── app_table_nhs_vs_os.do │ ├── calc_hr_sensitivity.do │ ├── calc_prev_sensitivity.do │ ├── calc_prrs.do │ ├── covid_como_agerisks_tpl.tex │ ├── covid_como_oscompare_tpl.tex │ ├── covid_como_sumhr_tpl.tex │ ├── covid_como_sumstats_tpl.tex │ ├── examine_risk_factors_poverty.do │ ├── make_coef_plot.py │ ├── make_paper_figures.do │ ├── make_paper_tables.do │ ├── make_summary_tables.do │ ├── old │ │ └── analyze_age_mort_risk.do │ ├── prep_eng_india_prev_compare.do │ └── sumstats.do ├── b │ ├── clean_gbd_india.do │ ├── collapse_biomarkers_to_state.do │ ├── fit_cts_uk_age_hr.m │ ├── flatten_hr_data.py │ ├── old │ │ ├── prep_india_sim_prevalence.do │ │ ├── prep_populations.do │ │ ├── prep_uk_age_risks.do │ │ └── prep_uk_bmi.do │ ├── prep_age_level_data.do │ ├── prep_england_prevalence.do │ ├── prep_gbd.do │ ├── prep_health_data.do │ ├── prep_hrs.do │ ├── prep_india_comorbidities.do │ ├── prep_ny_mortality.do │ ├── prep_pop_sex.do │ └── prep_standard_errors.do ├── como_programs.do ├── csv │ ├── copd_mclean_rates.csv │ ├── england_gender_age.csv │ ├── india_condition_prevalence.csv │ ├── ny_cummings.csv │ ├── ny_hr.csv │ ├── nystate_age_comorbid_05082020.csv │ ├── nystate_or.csv │ ├── uk_condition_prevalence.csv │ ├── uk_condition_sd.csv │ ├── uk_demography.csv │ ├── uk_nhs_hazard_ratios.csv │ ├── uk_nhs_incidence.csv │ └── weighted_hrs.txt ├── e │ ├── examine_risk_factors.do │ ├── explore_gbd_vs_dlhs.do │ ├── hr_vs_or.do │ ├── summarize_india_conditions.do │ ├── test_cts_ors.do │ └── test_map.do ├── 
make_como.do ├── r │ └── covid_como_sumstats.csv └── tex │ ├── app_bootstrap.tex │ ├── appendix.tex │ ├── como_exhibits.tex │ ├── como_tables_figures.tex │ ├── covid-como.bib │ ├── front_matter_como.tex │ └── vancouver.bst ├── covid_progs.do ├── e ├── agmark_plot.ipynb ├── analyze_mortality.do ├── comoweb_plots.ipynb ├── compare_hosp_counts.do ├── covid_district_map.ipynb ├── create_agmark_plots.do ├── describe_migration.do ├── diff_dlsh4_doctor_definitions.do ├── dlhs.do ├── explore_agmark_by_state.do ├── explore_agmark_perishables.do ├── explore_ec_microdata.do ├── explore_idi_survey_r1.do ├── explore_idi_survey_r2.do ├── explore_migration.ipynb ├── explore_mortality.do ├── explore_pc_dlhs_doctors.do ├── explore_vacc_story.do ├── expolore_idi_survey_r2.do ├── figure_hmis.do ├── gen_map.py ├── gen_survey_map.py ├── get_vac_data.do ├── graphs_idi_r3.do ├── hmis_spatial_maps.py ├── idi_labor_ag.do ├── idi_r1_oped_graphs.do ├── idi_survey_r1_for_ppt.do ├── map_vacc_eligible.py ├── pop_estimates_21.csv ├── prep_ahs_data.do ├── prep_covid_dist_analysis.do ├── prep_dlhs_data.do ├── validate_table1.do └── validate_vacc_api.do ├── forecasting ├── README.md ├── Snakefile ├── b │ ├── create_vector_tileset.sh │ ├── data_to_geojson.py │ ├── merge_ddl_pred_data.do │ ├── old │ │ └── push_predicted_metadata.py │ ├── process_ddl_data.do │ ├── process_predicted_data.do │ ├── pull_predicted_data.sh │ ├── pull_predicted_data_helper.py │ ├── push_public_data.sh │ ├── push_vector_tileset.py │ └── test_merged_data.py ├── config │ ├── config.yaml │ ├── forecasting.yaml │ └── forecasting_spatial.yaml └── update_forecasts_cronjob.sh ├── make_covid.do └── str └── manual_covid_case_district_match.csv /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | __pycache__ 3 | forecasting/predictions_credential.json 4 | forecasting/.snakemake/ 5 | forecasting/*.log 6 | -------------------------------------------------------------------------------- /.projectile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devdatalab/covid/a86c2d00d81eee6d26c343e05ae9ba1087fad47f/.projectile -------------------------------------------------------------------------------- /a/impute_additional_fields.do: -------------------------------------------------------------------------------- 1 | /* combine results into a single export file, and impute some additional fields */ 2 | 3 | /* open hospital bed capacity */ 4 | use $covidpub/estimates/hospitals_dist, clear 5 | 6 | /* merge with age distribution and infection fatality rate file */ 7 | merge 1:1 pc11_state_id pc11_district_id using $covidpub/estimates/district_age_dist_cfr, nogen 8 | 9 | /* identify the districts that are least well prepared */ 10 | gen bottom_100_district_dlhs = rank_dlhs <= 100 11 | gen bottom_100_district_pc = rank_pc <= 100 12 | gen district_at_risk = bottom_100_district_pc == 1 & bottom_100_district_dlhs == 1 13 | 14 | /* create scenarios where 1%, 5%, 10% of the district population gets infected */ 15 | gen predicted_mort_01 = pc11_pca_tot_t * 0.01 * district_estimated_cfr_t 16 | gen predicted_hosp_01 = pc11_pca_tot_t * 0.01 * district_estimated_cfr_t * 5 17 | gen predicted_mort_10 = pc11_pca_tot_t * 0.10 * district_estimated_cfr_t 18 | gen predicted_hosp_10 = pc11_pca_tot_t * 0.10 * district_estimated_cfr_t * 5 19 | 20 | /* calculate number of beds according to DLHS in each district */ 21 | gen dlhs_beds = dlhs_perk_pubpriv_beds / 
1000 * pc11_pca_tot_t 22 | 23 | /* calculate extent over capacity under 1% infection rate */ 24 | gen capacity_01 = predicted_hosp_01 / dlhs_beds 25 | gen capacity_10 = predicted_hosp_10 / dlhs_beds 26 | 27 | save $tmp/district_age_dist_cfr_hospitals, replace 28 | export delimited $tmp/district_age_dist_cfr_hospitals, replace 29 | -------------------------------------------------------------------------------- /assets/covid_build_public.nomnoml: -------------------------------------------------------------------------------- 1 | #.instructions: 2 | #.1: navigate to nomnoml.com 3 | #.2: copy the following UML code into the window 4 | #.3: use the toolbar in the top left to export the diagram 5 | #.3a: if you make changes, export the source and push the changes to this file in github 6 | #.3b: download the png and push the updated file in github 7 | 8 | #.prog: fill=#D5F1FF visual=sender italic center 9 | #.data: visual=roundrect align=center 10 | #.web: visual=database italic 11 | #.finaldata: visual=roundrect align=center fill=#FC1 12 | #.key: fill=#ffffff bold 13 | 14 | [Key | 15 | [Initial Dataset]--[ Final Dataset] 16 | [ Final Dataset]--[Program] 17 | ] 18 | 19 | [COVID Case Data | 20 | [ https://covindia.com/]->[get_case_data.do] 21 | ]o->[raw/covindia_raw.dta;covid_infected_deaths.dta] 22 | 23 | [Population Census | 24 | [pc11r_hosp.dta | pc11u_hosp.dta]->[prep_pc_hosp.do] 25 | [prep_pc_hosp.do] 26 | ]o->[PC Hospital Data | [ pc_hospitals_subdist.dta|pc_hospitals_dist.dta]] 27 | 28 | [Economic Census | 29 | [pc11_district_key.dta]->[prep_ec_hosp.do] 30 | [ec13_hosp_microdata.dta]->[prep_ec_hosp.do] 31 | [prep_ec_hosp.do]->[ec_hospitals_tv.dta] 32 | ]o->[EC Hospital Data | [ec_hospitals_dist.dta]] 33 | 34 | [Age Bins from SECC| [secc_age_bins_district_t|secc_age_bins_subdistrict_t]]->[predict_age_cfr.do] 35 | [National CFR by Age | [ cfr_age_bins.dta]]->[predict_age_cfr.do] 36 | [predict_age_cfr.do]o->[CFR by Age Bins| [district_age_dist_cfr.dta|subdistrict_age_dist_cfr.dta]] 37 | 38 | [DLHS Data | [dlhs4_hospitals_dist.dta]]->[estimate_hosp_capacity.do] 39 | [EC Hospital Data]->[estimate_hosp_capacity.do] 40 | [PC Hospital Data ]->[estimate_hosp_capacity.do] 41 | [estimate_hosp_capacity.do]->[hospitals_dist.dta] 42 | 43 | [hospitals_dist.dta]->[impute_additional_fields.do] 44 | [CFR by Age Bins]->[impute_additional_fields.do] 45 | [impute_additional_fields.do]->[district_age_dist_cfr_hospitals.dta] 46 | -------------------------------------------------------------------------------- /assets/download_button.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devdatalab/covid/a86c2d00d81eee6d26c343e05ae9ba1087fad47f/assets/download_button.png -------------------------------------------------------------------------------- /assets/hospital-beds.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /assets/hospital-beds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devdatalab/covid/a86c2d00d81eee6d26c343e05ae9ba1087fad47f/assets/hospital-beds.png -------------------------------------------------------------------------------- /assets/market-volumes-livestock.html: -------------------------------------------------------------------------------- 1 | 2 | 
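/* Illustrative note (added for this write-up; not a file in the repo): the
   hospital-capacity arithmetic in a/impute_additional_fields.do above can be
   checked by hand with hypothetical inputs -- a district of 2,000,000 people,
   an estimated CFR of 1%, and 0.5 DLHS beds per 1,000 people. */
display "deaths at 1% infection:        " 2000000 * 0.01 * 0.01                                /* 200 */
display "hospitalizations (5 x deaths): " 2000000 * 0.01 * 0.01 * 5                            /* 1,000 */
display "DLHS beds in district:         " 0.5 / 1000 * 2000000                                 /* 1,000 */
display "capacity_01 (demand / beds):   " (2000000 * 0.01 * 0.01 * 5) / (0.5 / 1000 * 2000000) /* 1, i.e. demand fills the beds exactly */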
-------------------------------------------------------------------------------- /assets/market-volumes.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /assets/mortality-pred.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /assets/mortality-pred.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devdatalab/covid/a86c2d00d81eee6d26c343e05ae9ba1087fad47f/assets/mortality-pred.png -------------------------------------------------------------------------------- /b/clean_ap_mort.do: -------------------------------------------------------------------------------- 1 | /***************************************/ 2 | /* Clean Andhra Pradesh mortality data */ 3 | /***************************************/ 4 | 5 | /* load covid programs */ 6 | qui do $ddl/covid/covid_progs.do 7 | 8 | /* set globals for year and month */ 9 | global year "2018 2019 2020 2021" 10 | global month "01 02 03 04 05 06 07 08 09 10 11 12" 11 | 12 | foreach j in $year { 13 | 14 | /* conditional global for 2021 since data is only available till June */ 15 | if `j' == 2021 { 16 | global month = "01 02 03 04 05 06" 17 | } 18 | 19 | foreach i in $month { 20 | 21 | /* import raw data */ 22 | import excel "$covidpub/private/mortality/raw/andhra_pradesh/MonitoringReport-AP-`j'`i'-`j'`i'.xlsx", sheet("Monitoring Report") cellrange(A4:P20) clear 23 | 24 | /* basic cleaning - keep relevant vars and rename */ 25 | keep A B E F G 26 | drop if B == "" 27 | 28 | ren A id 29 | ren B district 30 | ren E death_male 31 | ren F death_female 32 | ren G death_trans 33 | 34 | /* generate vars for state, month, year */ 35 | gen state = "Andhra Pradesh" 36 | gen month = `i' 37 | gen year = `j' 38 | 39 | save $tmp/ap_`i'_`j', replace 40 | 41 | } 42 | } 43 | 44 | clear 45 | 46 | /* reset global for month */ 47 | global month "01 02 03 04 05 06 07 08 09 10 11 12" 48 | 49 | /* append all month-year data */ 50 | foreach j in $year { 51 | 52 | if `j' == 2021 { 53 | global month = "01 02 03 04 05 06" 54 | } 55 | 56 | foreach i in $month { 57 | 58 | append using $tmp/ap_`i'_`j'.dta 59 | 60 | } 61 | } 62 | 63 | /* convert months from float to string for consistency */ 64 | str_month, float(month) string(str_month) 65 | 66 | /* sum total deaths */ 67 | egen deaths = rowtotal(death_*) 68 | 69 | /* drop gender-wise deaths and id var and order vars */ 70 | drop death_* id 71 | order state district deaths month year 72 | 73 | /* save clean data to scratch */ 74 | save $tmp/mort_ap.dta, replace 75 | -------------------------------------------------------------------------------- /b/clean_assam_mort.do: -------------------------------------------------------------------------------- 1 | /******************************/ 2 | /* Clean Assam mortality data */ 3 | /******************************/ 4 | 5 | /* set globals for year */ 6 | global year "2018 2019 2020" 7 | 8 | /* process raw data for each year */ 9 | foreach j in $year { 10 | 11 | /* import raw data */ 12 | import excel "$covidpub/private/mortality/raw/assam/`j'.xlsx", sheet("D-4") cellrange(A5:P364) firstrow clear 13 | drop if C == "" 14 | 15 | /* fill missing values */ 16 | foreach i in A B { 17 | replace `i' = `i'[_n-1] if mi(`i') 18 | } 19 | 20 | /* drop redundant vars */ 21 | drop if C == 
"T" 22 | drop if A == 27 23 | drop P 24 | 25 | /* rename vars */ 26 | ren A id 27 | ren B district 28 | ren C sex 29 | ren D death_january 30 | ren E death_february 31 | ren F death_march 32 | ren G death_april 33 | ren H death_may 34 | ren I death_june 35 | ren J death_july 36 | ren K death_august 37 | ren L death_september 38 | ren M death_october 39 | ren N death_november 40 | ren O death_december 41 | 42 | /* reshape from wide to long */ 43 | reshape long death_, i(district sex) j(month) string 44 | 45 | /* drop id var and generate vars for state and year */ 46 | drop id 47 | gen state = "Assam" 48 | gen year = "`j'" 49 | 50 | /* rename and order variables */ 51 | ren death_ deaths 52 | order state district month year death sex 53 | 54 | save "$tmp/assam_`j'" , replace 55 | 56 | } 57 | 58 | clear 59 | 60 | /* append all month-year data */ 61 | foreach j in $year { 62 | 63 | append using $tmp/assam_`j' 64 | 65 | } 66 | 67 | /* collapse on district-month-year */ 68 | collapse (sum) deaths, by(state district month year) 69 | 70 | /* label and destring numeric vars */ 71 | la var state "State" 72 | la var district "District" 73 | la var month "Month" 74 | la var year "Year" 75 | la var deaths "Total Death" 76 | 77 | destring year, replace 78 | 79 | /* save clean dataset unique on district-month-year */ 80 | save $tmp/mort_assam.dta, replace 81 | -------------------------------------------------------------------------------- /b/clean_bbmp_mort.do: -------------------------------------------------------------------------------- 1 | /*****************************************/ 2 | /* Clean BBMP (Bangalore) mortality data */ 3 | /*****************************************/ 4 | 5 | /* import raw data */ 6 | import excel "$covidpub/private/mortality/raw/Karnataka, BBMP deaths data.xlsx", sheet("Sheet1") cellrange(A24:D37) clear 7 | 8 | /* drop redundant obs and rename vars for reshape */ 9 | drop in 1 10 | drop in 13 11 | ren A month 12 | ren B death_2019 13 | ren C death_2020 14 | ren D death_2021 15 | 16 | /* reshape wide to long on monthly deaths */ 17 | reshape long death_, i(month) j(year) 18 | 19 | /* ren deaths var and drop empty obs */ 20 | ren death deaths 21 | drop if deaths == . 
22 | 23 | /* rename months for consistency */ 24 | replace month = "january" if month == "Jan" 25 | replace month = "february" if month == "Feb" 26 | replace month = "march" if month == "Mar" 27 | replace month = "april" if month == "Apr" 28 | replace month = "may" if month == "May" 29 | replace month = "june" if month == "Jun" 30 | replace month = "july" if month == "Jul" 31 | replace month = "august" if month == "Aug" 32 | replace month = "september" if month == "Sep" 33 | replace month = "october" if month == "Oct" 34 | replace month = "november" if month == "Nov" 35 | replace month = "december" if month == "Dec" 36 | 37 | /* gen vars for state, district */ 38 | gen state = "Karnataka" 39 | gen district = "Bangalore (Urban)" 40 | 41 | order state district deaths 42 | 43 | /* save clean data to scratch */ 44 | save $tmp/mort_bbmp.dta, replace 45 | -------------------------------------------------------------------------------- /b/clean_bihar_mort.do: -------------------------------------------------------------------------------- 1 | /******************************/ 2 | /* Clean Bihar mortality data */ 3 | /******************************/ 4 | 5 | /* load covid programs */ 6 | qui do $ddl/covid/covid_progs.do 7 | 8 | /* set globals for year and month */ 9 | global year "2018 2019 2020 2021" 10 | global month "01 02 03 04 05 06 07 08 09 10 11 12" 11 | 12 | foreach j in $year { 13 | 14 | /* conditional global for 2021 since data is only available till May */ 15 | if `j' == 2021 { 16 | global month = "01 02 03 04 05" 17 | } 18 | 19 | foreach i in $month { 20 | 21 | /* import raw data */ 22 | import excel "$covidpub/private/mortality/raw/bihar/`j'_`i'.xlsx", sheet("Monitoring Report") cellrange(A4:P45) clear 23 | 24 | /* basic cleaning - keep relevant vars and rename */ 25 | keep A B E F G 26 | drop if B == "" 27 | 28 | ren A id 29 | ren B district 30 | ren E death_male 31 | ren F death_female 32 | ren G death_trans 33 | 34 | /* generate vars for state, month, year */ 35 | gen state = "Bihar" 36 | gen month = `i' 37 | gen year = `j' 38 | 39 | save $tmp/bihar_`i'_`j', replace 40 | 41 | } 42 | } 43 | 44 | clear 45 | 46 | /* reset global for month */ 47 | global month "01 02 03 04 05 06 07 08 09 10 11 12" 48 | 49 | /* append all month-year data */ 50 | foreach j in $year { 51 | 52 | if `j' == 2021 { 53 | global month = "01 02 03 04 05" 54 | } 55 | 56 | foreach i in $month { 57 | 58 | append using $tmp/bihar_`i'_`j'.dta 59 | 60 | } 61 | } 62 | 63 | /* convert months from float to string for consistency */ 64 | str_month, float(month) string(str_month) 65 | 66 | /* sum total deaths */ 67 | egen deaths = rowtotal(death_*) 68 | 69 | /* drop gender-wise deaths and id var and order vars */ 70 | drop death_* id 71 | order state district deaths month year 72 | 73 | /* save clean file to scratch */ 74 | save $tmp/mort_bihar.dta, replace 75 | -------------------------------------------------------------------------------- /b/clean_chennai_mort.do: -------------------------------------------------------------------------------- 1 | /********************************/ 2 | /* Clean Chennai mortality data */ 3 | /********************************/ 4 | 5 | /* import raw data */ 6 | import excel "$covidpub/private/mortality/raw/chennai.xlsx", sheet("Sheet2") cellrange(A1:M13) clear firstrow 7 | 8 | /* rename vars for reshape */ 9 | ren Month month 10 | ren B deaths2010 11 | ren C deaths2011 12 | ren D deaths2012 13 | ren E deaths2013 14 | ren F deaths2014 15 | ren G deaths2015 16 | ren H deaths2016 17 | ren I 
deaths2017 18 | ren J deaths2018 19 | ren K deaths2019 20 | ren L deaths2020 21 | ren M deaths2021 22 | 23 | /* convert months to lowercase for consistency */ 24 | replace month = lower(month) 25 | 26 | /* reshape from wide to long */ 27 | reshape long deaths, i(month) j(year) 28 | 29 | /* drop missing obs */ 30 | drop if deaths == . 31 | 32 | /* generate vars for state, district */ 33 | gen state = "Tamil Nadu" 34 | gen district = "Chennai" 35 | 36 | order state district deaths 37 | 38 | /* save clean data to scratch */ 39 | save $tmp/mort_chennai, replace 40 | -------------------------------------------------------------------------------- /b/clean_ghmc_mort.do: -------------------------------------------------------------------------------- 1 | /*****************************************/ 2 | /* Clean GHMC (Hyderabad) mortality data */ 3 | /*****************************************/ 4 | 5 | /* import raw data */ 6 | import excel "$covidpub/private/mortality/raw/ghmc_certificates.xlsx", sheet("Sheet1") cellrange(A3:G14) clear 7 | 8 | /* rename vars for reshape */ 9 | ren A month 10 | ren B deaths2016 11 | ren C deaths2017 12 | ren D deaths2018 13 | ren E deaths2019 14 | ren F deaths2020 15 | ren G deaths2021 16 | 17 | /* convert months to lowercase for consistency */ 18 | replace month = lower(month) 19 | 20 | /* reshape from wide to long */ 21 | reshape long deaths, i(month) j(year) 22 | 23 | /* drop missing data */ 24 | drop if deaths == . 25 | 26 | /* gen vars for state, district */ 27 | gen state = "Telangana" 28 | gen district = "Hyderabad" 29 | 30 | order state district deaths 31 | 32 | /* save clean data to scratch */ 33 | save $tmp/mort_ghmc.dta, replace 34 | -------------------------------------------------------------------------------- /b/clean_haryana_mort.do: -------------------------------------------------------------------------------- 1 | /********************************/ 2 | /* Clean Haryana mortality data */ 3 | /********************************/ 4 | 5 | /* import raw data from statsofindia repo */ 6 | import delimited "https://raw.githubusercontent.com/statsofindia/india-mortality/master/district-level/Haryana-districts.csv", clear 7 | 8 | /* create variables for month and day of death */ 9 | gen year = substr(date, 1, 4) 10 | gen month = substr(date, 6, 2) 11 | gen day = substr(date, 9, 2) 12 | destring year month day, replace 13 | 14 | /* collapse on date of death, district and gender */ 15 | collapse (sum) deaths, by(district year month) 16 | 17 | /* convert months from float to string for consistency */ 18 | str_month, float(month) string(str_month) 19 | 20 | /* generate state var */ 21 | gen state = "Haryana" 22 | 23 | /* re-order variables */ 24 | order state district deaths year month 25 | 26 | save $tmp/mort_haryana.dta, replace 27 | -------------------------------------------------------------------------------- /b/clean_hp_mort.do: -------------------------------------------------------------------------------- 1 | /*****************************************/ 2 | /* Clean Himachal Pradesh mortality data */ 3 | /*****************************************/ 4 | 5 | /* import raw data from statsofindia repo */ 6 | import delimited "https://raw.githubusercontent.com/statsofindia/india-mortality/master/district-level/Himachal%20Pradesh-districts.csv", clear 7 | 8 | /* create variables for month and day of death */ 9 | gen year = substr(date, 1, 4) 10 | gen month = substr(date, 6, 2) 11 | gen day = substr(date, 9, 2) 12 | destring year month day, replace 13 | 14 | /* 
collapse on date of death, district and gender */ 15 | collapse (sum) deaths, by(district year month) 16 | 17 | /* convert months from float to string for consistency */ 18 | str_month, float(month) string(str_month) 19 | 20 | /* generate state var */ 21 | gen state = "Himachal Pradesh" 22 | 23 | /* re-order variables */ 24 | order state district deaths year month 25 | 26 | save $tmp/mort_hp.dta, replace 27 | -------------------------------------------------------------------------------- /b/clean_kmdc_mort.do: -------------------------------------------------------------------------------- 1 | /***************************************************************/ 2 | /* Clean Kolkata Municipal Corporation Death Registration data */ 3 | /***************************************************************/ 4 | 5 | /* read in raw csv data (source: https://github.com/thejeshgn/KMCDeathRecords) */ 6 | import delimited "$covidpub/private/mortality/raw/death_records_kolkata.csv", clear 7 | 8 | /* drop empty variables */ 9 | drop dateofregistration deathdate crematoriumcode regnno recordssourcerawdatafile yearofregistration 10 | 11 | /* rename and label variables */ 12 | ren deceasedname deceased_name 13 | ren deathregnno death_reg_no 14 | ren crematoriumname crematorium 15 | ren deceasedsex sex 16 | ren fathername father_name 17 | ren deathsite death_site 18 | ren recordssource record_source 19 | ren recordscity district 20 | ren recordsdateofdeath death_date 21 | 22 | la var deceased_name "Name of Deceased" 23 | la var death_reg_no "Death Registration Number" 24 | la var crematorium "Crematorium" 25 | la var sex "Sex of Deceased" 26 | la var father_name "Father Name" 27 | la var death_site "Site of Death" 28 | la var record_source "Source" 29 | la var district "District" 30 | la var death_date "Date of Death" 31 | 32 | /* gender for some obs is unidentified for various reasons and is missing - label them as unknown */ 33 | replace sex = "UNKNOWN" if sex == " " 34 | 35 | /* create placeholder variable for collapse */ 36 | gen deaths = 1 37 | 38 | /* create variables for month and day of death */ 39 | gen year = substr(death_date, 1, 4) 40 | gen month = substr(death_date, 6, 2) 41 | gen date = substr(death_date, 9, 2) 42 | destring year month date, replace 43 | 44 | /* collapse on date of death, district and gender */ 45 | collapse (sum) deaths, by(district year month) 46 | 47 | /* convert months from float to string for consistency */ 48 | str_month, float(month) string(str_month) 49 | 50 | /* generate state var */ 51 | gen state = "West Bengal" 52 | 53 | /* re-order variables */ 54 | order state district deaths year month 55 | 56 | save $tmp/mort_kolkata.dta, replace 57 | -------------------------------------------------------------------------------- /b/clean_mh_mort.do: -------------------------------------------------------------------------------- 1 | /* import raw data from statsofindia repo */ 2 | import delimited "https://raw.githubusercontent.com/statsofindia/india-mortality/master/district-level/Maharashtra-districts.csv" , clear 3 | 4 | /* create variables for month and day of death */ 5 | gen year = substr(date, 1, 4) 6 | gen month = substr(date, 6, 2) 7 | gen day = substr(date, 9, 2) 8 | destring year month day, replace 9 | 10 | /* collapse on date of death, district and gender */ 11 | collapse (sum) deaths, by(district year month) 12 | 13 | /* convert months from float to string for consistency */ 14 | str_month, float(month) string(str_month) 15 | 16 | /* generate state var */ 17 | gen 
state = "Maharashtra" 18 | 19 | /* re-order variables */ 20 | order state district deaths year month 21 | 22 | save $tmp/mort_maha.dta, replace 23 | -------------------------------------------------------------------------------- /b/clean_migration.do: -------------------------------------------------------------------------------- 1 | /* Clean district migration in/outflow data */ 2 | 3 | 4 | /* read in raw CSV data (source: Clement Imbert */ 5 | import delimited using $covidpub/migration/raw/district_migration_pc11.csv, clear varn(1) 6 | 7 | /* reformat census identifiers to string */ 8 | ren statecodecensus2011 pc11_state_id 9 | tostring pc11_state_id, format(%02.0f) replace 10 | ren districtcodecensus2011 pc11_district_id 11 | tostring pc11_district_id, format(%03.0f) replace 12 | 13 | /* save to pc11 */ 14 | order _all, alphabetic 15 | order pc11_state_id pc11_district_id, first 16 | compress 17 | save $covidpub/migration/pc11/district_migration_pc11, replace 18 | export delimited using $covidpub/migration/csv/district_migration_pc11.csv, replace 19 | 20 | /* create LGD version */ 21 | convert_ids, from_ids(pc11_state_id pc11_district_id) to_ids(lgd_state_id lgd_district_id) key($keys/lgd_pc11_district_key_weights.dta) weight_var(pc11_lgd_wt_pop) metadata_urls("https://docs.google.com/spreadsheets/d/e/2PACX-1vTu79uiVKSFv8c1oZvx7WARrWXSfbwfLakiukoezDaH0spMM_MQalkm5fr4bnkBQVNRs2aiU7x41oi3/pub?gid=0&single=true&output=csv") labels 22 | save $covidpub/migration/district_migration, replace 23 | export delimited using $covidpub/migration/csv/district_migration.csv, replace 24 | -------------------------------------------------------------------------------- /b/clean_mp_mort.do: -------------------------------------------------------------------------------- 1 | /***************************************/ 2 | /* Clean Madhya Pradesh mortality data */ 3 | /***************************************/ 4 | 5 | /* set globals for month and year */ 6 | global month "january february march april may june july august september october november december" 7 | 8 | global year "2018 2019 2020 2021" 9 | 10 | /* process raw data from January 2018 to May 2021 */ 11 | foreach j in $year { 12 | 13 | /* conditional global for 2021 since data is available upto May */ 14 | if `j' == 2021 { 15 | global month = "january february march april may" 16 | } 17 | 18 | foreach i in $month { 19 | 20 | /* import raw data */ 21 | import excel "$covidpub/private/mortality/raw/madhya_pradesh/`j'/`i'`j'.xlsx", sheet("Monitoring Report") firstrow clear 22 | 23 | /* rename vars and drop redundant obs */ 24 | ren SlNo id 25 | ren District district 26 | ren C deaths 27 | 28 | drop in 1/2 29 | drop if id == "" 30 | 31 | /* generate variables for month and year */ 32 | gen month = "`i'" 33 | gen year = "`j'" 34 | gen state = "Madhya Pradesh" 35 | 36 | destring * , replace 37 | order id state district 38 | 39 | /* save temp file for month-year */ 40 | save $tmp/`i'`j' , replace 41 | 42 | } 43 | } 44 | 45 | clear 46 | 47 | /* reset global */ 48 | global month "january february march april may june july august september october november december" 49 | 50 | /* append all month-year data */ 51 | foreach j in $year { 52 | 53 | if `j' == 2021 { 54 | global month = "january february march april may" 55 | } 56 | 57 | foreach i in $month { 58 | 59 | append using $tmp/`i'`j' 60 | 61 | } 62 | } 63 | 64 | drop id 65 | 66 | /* save clean dataset unique on district-month-year */ 67 | save $tmp/mort_mp.dta, replace 68 | 
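/* Optional sanity check (a sketch added for illustration; not part of the
   original script): the comment above says the saved file should be unique on
   district-month-year, so an isid assertion would catch a month file that was
   accidentally appended twice. */
use $tmp/mort_mp.dta, clear
isid district month year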
-------------------------------------------------------------------------------- /b/clean_odisha_mort.do: -------------------------------------------------------------------------------- 1 | /*******************************/ 2 | /* Clean Odisha Mortality Data */ 3 | /*******************************/ 4 | 5 | /* to be appended with district-year mortality dataset */ 6 | 7 | /* import raw data */ 8 | import excel "$covidpub/private/mortality/raw/Odisha Analysis.xlsx", sheet("CRS") cellrange(B5:N34) clear 9 | 10 | /* drop redundant variables and rename them for reshape */ 11 | drop F G H I K L M 12 | 13 | ren B district 14 | ren C deaths2017 15 | ren D deaths2018 16 | ren E deaths2019 17 | ren J deaths2020 18 | ren N deaths2021 19 | 20 | /* reshape from wide to long on deaths */ 21 | reshape long deaths, i(district) j(year) 22 | 23 | /* generate variable for state */ 24 | gen state = "Odisha" 25 | 26 | /* create lgd_state variable to merge */ 27 | gen lgd_state_name = lower(state) 28 | 29 | /* merge in lgd state id */ 30 | merge m:1 lgd_state_name using $keys/lgd_state_key, keepusing(lgd_state_id) keep(match master) nogen 31 | 32 | /* now create an lgd_district variable to merge */ 33 | gen lgd_district_name = lower(district) 34 | 35 | /* save temp file */ 36 | save $tmp/mort_odisha, replace 37 | 38 | /* run masala merge */ 39 | keep lgd_state_name lgd_district_name 40 | duplicates drop 41 | masala_merge lgd_state_name using $keys/lgd_district_key, s1(lgd_district_name) minbigram(0.2) minscore(0.6) outfile($tmp/mort_lgd_district) 42 | 43 | /* check that all districts were matched to LGD */ 44 | count if match_source == 6 45 | di "`r(N)' districts were unmatched" 46 | 47 | /* keep master matches */ 48 | keep if match_source < 7 49 | 50 | /* drop redundant variables */ 51 | keep lgd_state_name lgd_district_name_using lgd_district_name_master lgd_district_id 52 | 53 | /* merge data back in */ 54 | ren lgd_district_name_master lgd_district_name 55 | merge 1:m lgd_state_name lgd_district_name using $tmp/mort_odisha 56 | drop _merge 57 | 58 | /* now replace the district name with the lgd key name */ 59 | drop lgd_district_name 60 | ren lgd_district_name_using lgd_district_name 61 | 62 | /* merge with PC11 districts */ 63 | merge m:m lgd_state_id lgd_district_id using "$keys/lgd_pc11_district_key.dta" 64 | keep if _merge == 3 65 | drop _merge lgd_district_name_local lgd_district_version 66 | la var deaths "Total reported deaths - CRS" 67 | 68 | /* add some pointers about data */ 69 | notes deaths: Data for Odisha provided by Chinmay Tumbe (IIM Ahmedabad) 70 | notes death: For Odisha, 2020 deaths are projected totals computed based on average growth factor of 2018 and 2019 71 | 72 | order lgd_state_id lgd_district_id lgd_state_name lgd_district_name state district deaths year pc11_* 73 | 74 | /* save clean district-year data to scratch */ 75 | save $tmp/mort_odisha_dist, replace 76 | 77 | /* collapse on state-year */ 78 | collapse (sum) deaths, by(lgd_state_id lgd_state_name state year pc11_state_id) 79 | 80 | /* save clean state-year data to scratch */ 81 | save $tmp/mort_odisha_state, replace 82 | -------------------------------------------------------------------------------- /b/clean_rajasthan_mort.do: -------------------------------------------------------------------------------- 1 | /**********************************/ 2 | /* Clean Rajasthan mortality data */ 3 | /**********************************/ 4 | 5 | /* import raw data from statsofindia repo */ 6 | import delimited 
"https://raw.githubusercontent.com/statsofindia/rajasthan-mortality/master/rajasthan-pehchan-districts-mortality-2018-2021.csv", clear 7 | 8 | /* create variables for month and day of death */ 9 | gen year = substr(date, 1, 4) 10 | gen month = substr(date, 6, 2) 11 | gen day = substr(date, 9, 2) 12 | destring year month day, replace 13 | 14 | /* collapse on date of death, district and gender */ 15 | collapse (sum) deaths, by(district year month) 16 | 17 | /* convert months from float to string for consistency */ 18 | str_month, float(month) string(str_month) 19 | 20 | /* generate state var */ 21 | gen state = "Rajasthan" 22 | 23 | /* re-order variables */ 24 | order state district deaths year month 25 | 26 | save $tmp/mort_rajasthan.dta, replace 27 | -------------------------------------------------------------------------------- /b/clean_up_mort.do: -------------------------------------------------------------------------------- 1 | /***************************/ 2 | /* Clean UP Mortality data */ 3 | /***************************/ 4 | 5 | /* import raw data */ 6 | import excel "$covidpub/private/mortality/raw/UP RTI- Death Certificates Issued.xlsx", sheet("Year-wise original") cellrange(A9:P308) clear 7 | 8 | /* preliminary cleaning */ 9 | replace B = B[_n-1] if mi(B) 10 | drop A 11 | drop if C == . 12 | drop P 13 | 14 | /* rename variabless */ 15 | ren B district 16 | ren C year 17 | ren D death_january 18 | ren E death_february 19 | ren F death_march 20 | ren G death_april 21 | ren H death_may 22 | ren I death_june 23 | ren J death_july 24 | ren K death_august 25 | ren L death_september 26 | ren M death_october 27 | ren N death_november 28 | ren O death_december 29 | 30 | /* reshape from wide to long */ 31 | destring death_november, replace 32 | reshape long death_, i(district year) j(month) string 33 | 34 | /* generate state variable and clean further */ 35 | gen state = "Uttar Pradesh" 36 | drop if death_ == . 
37 | ren death_ deaths 38 | 39 | /* label and destring numeric vars */ 40 | la var state "State" 41 | la var district "District" 42 | la var month "Month" 43 | la var year "Year" 44 | la var deaths "Total Reported Deaths - CRS" 45 | 46 | /* save clean data to scratch */ 47 | save $tmp/mort_up.dta, replace 48 | -------------------------------------------------------------------------------- /b/clean_wb_mort.do: -------------------------------------------------------------------------------- 1 | /************************************/ 2 | /* Clean West Bengal Mortality Data */ 3 | /************************************/ 4 | 5 | /* import raw data from statsofindia repo */ 6 | import delimited "https://raw.githubusercontent.com/statsofindia/india-mortality/master/district-level/West%20Bengal-districts.csv", clear 7 | 8 | /* create variables for month and day of death */ 9 | gen year = substr(date, 1, 4) 10 | gen month = substr(date, 6, 2) 11 | gen day = substr(date, 9, 2) 12 | destring year month day, replace 13 | 14 | /* collapse on date of death, district and gender */ 15 | collapse (sum) deaths, by(district year month) 16 | 17 | /* convert months from float to string for consistency */ 18 | str_month, float(month) string(str_month) 19 | 20 | /* generate state var */ 21 | gen state = "West Bengal" 22 | 23 | /* drop Kolkata from the dataset since we already have data for the district */ 24 | drop if district == "Kolkata" 25 | 26 | /* re-order variables */ 27 | order state district deaths year month 28 | 29 | save $tmp/mort_wb.dta, replace 30 | -------------------------------------------------------------------------------- /b/copy_keys.do: -------------------------------------------------------------------------------- 1 | /* copy keys for release in the covid data repository, with any 2 | necessary processing. 
*/ 3 | 4 | /* copy EC:PC keys */ 5 | shell cp $keys/pc11_district_key.dta $covidpub/keys/ 6 | shell cp $keys/pc11_ec13_district_key.dta $covidpub/keys/ 7 | shell cp $keys/pc11r_ec13r_key.dta $covidpub/keys/ 8 | shell cp $keys/pc11u_ec13u_key.dta $covidpub/keys/ 9 | 10 | /* copy LGD keys */ 11 | shell cp $keys/lgd_district_key.dta $covidpub/keys/ 12 | shell cp $keys/lgd_pc11_town_key.dta $covidpub/keys/ 13 | shell cp $keys/lgd_town_key.dta $covidpub/keys/ 14 | shell cp $keys/lgd_village_key.dta $covidpub/keys/ 15 | 16 | /* excise unnecessary fields from LGD PC11 village key */ 17 | use $keys/lgd_pc11_village_key.dta, clear 18 | keep pc11_state_id pc11_district_id pc11_subdistrict_id pc11_village_id lgd_state_id lgd_district_id lgd_subdistrict_id lgd_village_id lgd_pc11_match 19 | encode lgd_pc11_match, gen(tmp) 20 | drop lgd_pc11_match 21 | ren tmp lgd_pc11_match 22 | compress 23 | save $covidpub/keys/lgd_pc11_village_key.dta, replace 24 | -------------------------------------------------------------------------------- /b/create_google.do: -------------------------------------------------------------------------------- 1 | /* create dataset with google serach data for covid symptoms */ 2 | 3 | 4 | /* import csv file */ 5 | import delimited "$iec/covid/google/google_search_may2.csv", clear 6 | 7 | /* merge state keys */ 8 | merge m:1 pc11_state_name using $pc11/pc11_pca_state_clean.dta 9 | 10 | /* rename day variable in csv */ 11 | rename day date 12 | 13 | /* modify date variable to create stata data */ 14 | gen year = 20 15 | tostring year, generate(year2) 16 | gen date2 = date + year2 17 | drop date year year2 18 | rename date2 date 19 | 20 | /* gen stata date */ 21 | gen date2 = date(date, "DMY") 22 | format date2 %d 23 | 24 | /* rename date2 */ 25 | drop date 26 | rename date2 date 27 | 28 | /* keep relevant vars */ 29 | keep cough fever date pc11_state_id pc11_state_name pc11_pca_state_name 30 | 31 | /* rename symptom variables */ 32 | rename cough cough_score 33 | rename fever fever_score 34 | 35 | /* drop missing values if any */ 36 | drop if mi(fever_score) 37 | 38 | /* sort by state date */ 39 | sort pc11_state_id date 40 | 41 | /* save as stata dataset */ 42 | save $covidpub/google/google_top10_may.dta, replace 43 | -------------------------------------------------------------------------------- /b/gen_nss_district_key.do: -------------------------------------------------------------------------------- 1 | /* generates clean district key out of pdf-to-csv conversion of Appendix-I */ 2 | 3 | /* define lgd matching program */ 4 | qui do $ddl/covid/covid_progs.do 5 | 6 | /* load data */ 7 | insheet using $nss/nss-75-health/Appendix-I.csv, clear 8 | 9 | /* drop unnecessary vars */ 10 | drop v1 v3 v6 v11 v10 11 | 12 | /* gen state and district name vars */ 13 | gen nss_state_name = v9 14 | gen nss_district_name = v7 15 | gen nss_district_id = v8 16 | 17 | /* drop bad obs */ 18 | drop if real(nss_district_id) == . 
| real(nss_district_id) < 0 19 | 20 | /* format nss id variables */ 21 | destring nss_district_id, replace 22 | 23 | /* state and district name cleaning */ 24 | lgd_state_clean nss_state_name 25 | lgd_dist_clean nss_district_name 26 | 27 | /* state match to lgd key */ 28 | lgd_state_match nss_state_name 29 | /* arunachal pradesh and mizoram missing in nss key */ 30 | 31 | /* generate nss state id from the lgd state id variable*/ 32 | gen nss_state_id = real(lgd_state_id) 33 | 34 | 35 | /* district match to lgd key */ 36 | lgd_dist_match nss_district_name 37 | 38 | /* re-order vars */ 39 | order nss_state_id nss_district_id, first 40 | order nss_district_name, before(lgd_district_name) 41 | 42 | /* the final key has 649 obs */ 43 | /* original key had 648 obs */ 44 | /* jaintia hills was expanded into 2 obs bc lgd data has e & w. jaintia hills */ 45 | 46 | /* pull population weights to handle jaintia hills split - these can 47 | be drawn from pc11:LGD key, which has the same split */ 48 | preserve 49 | use $keys/lgd_pc11_district_key_weights.dta, clear 50 | 51 | /* make sure the key hasn't changed */ 52 | count if regexm(lower(pc11_district_name), "jaintia") 53 | assert `r(N)' == 2 54 | 55 | /* pull weights into locals */ 56 | sum pc11_lgd_wt_pop if regexm(lower(lgd_district_name), "east jaintia hills") 57 | local east = `r(mean)' 58 | sum pc11_lgd_wt_pop if regexm(lower(lgd_district_name), "west jaintia hills") 59 | local west = `r(mean)' 60 | 61 | /* back to the NSS key. weight is just 1 for all others */ 62 | restore 63 | gen nss_lgd_wt_pop = 1 64 | 65 | /* replace for split */ 66 | replace nss_lgd_wt_pop = `west' if regexm(lower(lgd_district_name), "west jaintia hills") 67 | replace nss_lgd_wt_pop = `east' if regexm(lower(lgd_district_name), "east jaintia hills") 68 | 69 | /* save */ 70 | save $nss/nss-75-health/nss75_lgd_district_key, replace 71 | save $covidpub/nss/nss75_lgd_district_key, replace 72 | -------------------------------------------------------------------------------- /b/gen_urbanization_subdist.do: -------------------------------------------------------------------------------- 1 | /* generate pc11 subdistrict-level urbanization dataset */ 2 | 3 | /* merge total/urban/rural data together */ 4 | use $pc11/pc11r_pca_subdistrict_clean.dta, clear 5 | ren pc11_pca* pc11r_pca* 6 | merge 1:1 pc11_state_id pc11_district_id pc11_subdistrict_id using $pc11/pc11u_pca_subdistrict_clean, gen(_m_pc11u) 7 | ren pc11_pca* pc11u_pca* 8 | merge 1:1 pc11_state_id pc11_district_id pc11_subdistrict_id using $pc11/pc11_pca_subdistrict_clean, gen(_m_pc11r) 9 | drop _m* 10 | 11 | /* generate urbanization variable */ 12 | gen pc11_urb_share = pc11u_pca_tot_p / pc11_pca_tot_p 13 | label var pc11_urb_share "Urbanization share of subdistrict" 14 | 15 | /* save */ 16 | save $tmp/pc11_pca_subd, replace 17 | 18 | /* zip */ 19 | cd $tmp 20 | !zip pc11_pca_subd.zip pc11_pca_subd.dta 21 | -------------------------------------------------------------------------------- /b/old/aggregate_case_data.do: -------------------------------------------------------------------------------- 1 | /* Agregate covid case data to district level */ 2 | 3 | /**********/ 4 | /* Deaths */ 5 | /**********/ 6 | use $covidpub/covid/covid_deaths_recoveries, clear 7 | 8 | /* keep only the deaths */ 9 | keep if patientstatus == "Deceased" 10 | 11 | /* create counter to get total number of deaths */ 12 | gen new_deaths = 1 13 | 14 | /* collapse to district-day */ 15 | collapse (sum) new_deaths, by(pc11_state_id pc11_district_id 
date) 16 | 17 | /* save as a tempfile */ 18 | save $tmp/deaths, replace 19 | 20 | /*********/ 21 | /* Cases */ 22 | /*********/ 23 | use $covidpub/covid/covid_cases_raw, clear 24 | 25 | /* rename date announced to simply date */ 26 | ren dateannounced date 27 | 28 | /* create counter to get total number of cases */ 29 | gen new_cases = 1 30 | 31 | /* collapse to district-day */ 32 | collapse (sum) new_cases, by(pc11_state_id pc11_district_id date) 33 | 34 | 35 | /*******************/ 36 | /* Merge and Clean */ 37 | /*******************/ 38 | merge 1:1 pc11_state_id pc11_district_id date using $tmp/deaths 39 | 40 | /* fill in missing new_cases and new_deaths with 0 */ 41 | replace new_cases = 0 if mi(new_cases) 42 | replace new_deaths = 0 if mi(new_deaths) 43 | 44 | /* create a numeric datetime */ 45 | gen datenum = clock(date, "DMY") 46 | 47 | /* sort by state, district, and date */ 48 | sort pc11_state_id pc11_district_id datenum 49 | 50 | /* count the running total of cases*/ 51 | bys pc11_state_id pc11_district_id: gen total_cases = sum(new_cases) 52 | 53 | /* count the running total of deaths */ 54 | bys pc11_state_id pc11_district_id: gen total_deaths = sum(new_deaths) 55 | 56 | drop _merge datenum 57 | 58 | /***************************************************************************/ 59 | /* Transform into a square dataset with district positive cases and deaths */ 60 | /***************************************************************************/ 61 | 62 | /* drop if we have no date-- hard to know what to do with these */ 63 | drop if mi(date) 64 | 65 | /* set a missing value for missing districts so they get counted */ 66 | replace pc11_district_id = "-99" if mi(pc11_district_id) 67 | 68 | /* create a single variable for state-district */ 69 | egen sdgroup = group(pc11_state_id pc11_district_id) 70 | 71 | /* create a Stata date field */ 72 | ren date datestr 73 | gen date = date(datestr, "DMY") 74 | format date %d 75 | 76 | /* fill in non-reporting dates */ 77 | assert !mi(pc11_state_id) & !mi(pc11_district_id) 78 | sort sdgroup date 79 | fillin date sdgroup 80 | 81 | /* fill in missing state and district ids created by the fillin */ 82 | xfill pc11_state_id, i(sdgroup) 83 | xfill pc11_district_id, i(sdgroup) 84 | xfill datestr, i(date) 85 | 86 | /* create a sequential row counter so we can use L for the last seen 87 | date even if not yesterday (fillin solves some of this but some dates 88 | had no reporting at all. 
*/ 89 | sort sdgroup date 90 | by sdgroup: egen row = seq() 91 | 92 | /* set as time series on the row */ 93 | sort sdgroup row 94 | xtset sdgroup row 95 | 96 | /* fill in zeroes with the new missing data */ 97 | replace new_cases = 0 if mi(new_cases) 98 | replace new_deaths = 0 if mi(new_deaths) 99 | 100 | /* fill in the cumulative count for days when nothing happened */ 101 | replace total_cases = 0 if datestr == "30/01/2020" & mi(total_cases) 102 | replace total_deaths = 0 if datestr == "30/01/2020" & mi(total_deaths) 103 | replace total_cases = L.total_cases if mi(total_cases) 104 | replace total_deaths = L.total_deaths if mi(total_deaths) 105 | 106 | /* drop unused fields */ 107 | drop _fillin datestr sdgroup row 108 | 109 | /* save total case and death data */ 110 | save $covidpub/covid/covid_cases_deaths_district, replace 111 | cap mkdir $covidpub/covid/csv 112 | export delimited $covidpub/covid/csv/covid_cases_deaths_district.csv, replace 113 | 114 | /* review number of confirmed/deaths in unknown districts */ 115 | sum total_* if date == 22029 116 | sum total_* if date == 22029 & pc11_district_id == "-99" 117 | -------------------------------------------------------------------------------- /b/old/get_lgd_keys.do: -------------------------------------------------------------------------------- 1 | /* use downloaded Local Government Data codes to formalize PC-LGD keys 2 | 3 | data downloaded 14 April 2020 from: https://lgdirectory.gov.in/ 4 | from the "data download" page, selecting the CSV option 5 | */ 6 | 7 | /**********/ 8 | /* States */ 9 | /**********/ 10 | import delimited $iec/lgd/raw/allStateofIndia2020_04_14_23_16_43_253.csv, charset("utf-8") delimit(";") clear 11 | ren census2001code pc01_state_id 12 | ren census2011code pc11_state_id 13 | 14 | /* convert id to 2-digit string */ 15 | tostring pc01_state_id, format("%02.0f") replace 16 | tostring pc11_state_id, format("%02.0f") replace 17 | 18 | /* save */ 19 | save $iec/keys/lgd/lgd_pc_state_key, replace 20 | 21 | /*************/ 22 | /* Districts */ 23 | /*************/ 24 | import delimited $iec/lgd/raw/allDistrictofIndia2020_04_14_23_23_07_748.csv, charset("utf-8") delimit(";") clear 25 | ren census2001code pc01_district_id 26 | ren census2011code pc11_district_id 27 | 28 | /* convert id to 2- or 3-digit string */ 29 | tostring pc01_district_id, format("%02.0f") replace 30 | tostring pc11_district_id, format("%03.0f") replace 31 | 32 | /* merge in the state id's */ 33 | merge m:1 statecode using $iec/keys/lgd/lgd_pc_state_key, keepusing(pc01_state_id pc11_state_id) 34 | drop _merge 35 | 36 | /* save */ 37 | save $iec/keys/lgd/lgd_pc_district_key, replace 38 | 39 | /****************/ 40 | /* Subdistricts */ 41 | /****************/ 42 | import delimited $iec/lgd/raw/allSubDistrictofIndia2020_04_14_23_23_17_755.csv, charset("utf-8") delimit(";") clear 43 | ren census2001code pc01_subdistrict_id 44 | ren census2011code pc11_subdistrict_id 45 | 46 | /* convert id to 4- or 5-digit string */ 47 | tostring pc01_subdistrict_id, format("%04.0f") replace 48 | tostring pc11_subdistrict_id, format("%05.0f") replace 49 | 50 | /* merge in the state and district id's */ 51 | merge m:1 statecode districtcode using $iec/keys/lgd/lgd_pc_district_key, keepusing(pc01_state_id pc11_state_id pc01_district_id pc11_district_id) keep(match master) 52 | drop _merge 53 | 54 | /* save */ 55 | save $iec/keys/lgd/lgd_pc_subdistrict_key, replace 56 | 57 | /************/ 58 | /* Villages */ 59 | /************/ 60 | import delimited 
$iec/lgd/raw/allVillagesofIndia2020_04_14_23_23_29_843.csv, charset("utf-8") delimit(";") clear 61 | ren census2001code pc01_village_id 62 | ren census2011code pc11_village_id 63 | 64 | /* merge in the state, district, and subdistrict id's */ 65 | merge m:1 statecode districtcode subdistrictcode using $iec/keys/lgd/lgd_pc_subdistrict_key, keepusing(pc01_state_id pc11_state_id pc01_district_id pc11_district_id pc01_subdistrict_id pc11_subdistrict_id) keep(match master) 66 | drop _merge 67 | 68 | /* convert id to 8- or 6-digit string */ 69 | tostring pc01_village_id, format("%08.0f") replace 70 | tostring pc11_village_id, format("%06.0f") replace 71 | 72 | /* save */ 73 | save $iec/keys/lgd/lgd_pc_village_key, replace 74 | -------------------------------------------------------------------------------- /b/old/mse_simple.m: -------------------------------------------------------------------------------- 1 | function mse_simple = mse_simple(x); 2 | 3 | %% the odds we're trying to match 4 | or_simple = [.05 .27 1 2.61 7.61 26.27]'; 5 | ln_or_simple = log(or_simple); 6 | 7 | %% create the x axis for age from 18-90 8 | age = [18:.1:90]'; 9 | 10 | %% predict odds at each age using cubic function x 11 | y = x(1) .* age.^3 + x(2) .* age.^2 + x(3) .* age + x(4); 12 | 13 | %% calculate difference between bin means and target odds ratios 14 | m1 = abs(mean(y(age >= 18 & age < 40)) - ln_or_simple(1)); 15 | m2 = abs(mean(y(age >= 40 & age < 50)) - ln_or_simple(2)); 16 | m3 = abs(mean(y(age >= 50 & age < 60)) - ln_or_simple(3)); 17 | m4 = abs(mean(y(age >= 60 & age < 70)) - ln_or_simple(4)); 18 | m5 = abs(mean(y(age >= 70 & age < 80)) - ln_or_simple(5)); 19 | m6 = abs(mean(y(age >= 80 & age < 90)) - ln_or_simple(6)); 20 | 21 | %% calculate MSE between means and target log odds ratios, with uniform weighting 22 | %% first bin gets scaled 2.2 since it is 18-40, while other bins are all width 10 23 | mse_simple = m1 * 2.2 + m2 + m3 + m4 + m5 + m6; 24 | 25 | %% penalize max changes in slope 26 | abs((y(3:721) - y(2:720)) - (y(2:720) - y(1:719))) 27 | f2 = max(abs((y(3:721) - y(2:720)) - (y(2:720) - y(1:719)))) 28 | mse_simple = mse_simple; 29 | 30 | %% fprintf("%5.2f,%5.2f,%5.2f,%5.2f\n", x(1), x(2), x(3), x(4)) 31 | 32 | -------------------------------------------------------------------------------- /b/old/predict_cts_uk_age_or_v2.m: -------------------------------------------------------------------------------- 1 | 2 | %% set odds ratios in bins and switch to logs since that is better for fitting 3 | or_simple = [.05 .27 1 2.61 7.61 26.27]'; 4 | or_full = [.07 .31 1 2.09 4.77 12.64]'; 5 | ln_or_simple = log(or_simple); 6 | ln_or_full = log(or_full); 7 | 8 | %% set standard solver parameters 9 | options = optimoptions(@fmincon,'MaxFunEvals',10000000,'Display','none','TolCon',0.0001,'TolFun',0.0001,'TolX',0.0001); 10 | 11 | %% start with a linear function 12 | x_start = [1 2 3 4]; 13 | [x, f_min, exit_flag, output] = fmincon(@mse_simple, x_start, [], [], [], [], [], [], [], options); 14 | 15 | age = [18:100]'; 16 | y = x(1) .* age.^3 + x(2) .* age.^2 + x(3) .* age + x(4); 17 | 18 | %% graph the fit 19 | clf; 20 | hold on 21 | scatter(med_age, ln_or_simple); 22 | plot(age,y) 23 | xlabel("log odds ratio") 24 | ylabel("age") 25 | write_png('/scratch/pn/fit_simple') 26 | 27 | %% %% generate predicted values 28 | %% predicted_or_simple = fit_simple(age); 29 | %% predicted_or_full = fit_full(age); 30 | %% 31 | %% %% write these to a file 32 | %% writematrix([age predicted_or_simple 
predicted_or_full],'/scratch/pn/uk_age_fits.csv') 33 | %% 34 | %% %% prepend a header to the file 35 | %% system('echo "age,ln_or_simple,ln_or_full" >~/iec/covid/covid/csv/uk_age_predicted_or.csv'); 36 | %% system('cat /scratch/pn/uk_age_fits.csv >>~/iec/covid/covid/csv/uk_age_predicted_or.csv'); 37 | %% fprintf("Writing uk_age_predicted_or.csv\n"); 38 | -------------------------------------------------------------------------------- /b/pc11_lgd_metadata.csv: -------------------------------------------------------------------------------- 1 | variablename,aggregationmethod,label 2 | tot_old,sum 3 | -------------------------------------------------------------------------------- /b/prep_bihar.do: -------------------------------------------------------------------------------- 1 | /**************************************/ 2 | /* prepare bihar hospitalization data */ 3 | /**************************************/ 4 | 5 | /* import and lcase raw data */ 6 | import excel $health/bihar/raw/bihar_ventilators_beds_v2.xlsx, clear sheet("Data - Public Hospitals - Bihar") firstrow 7 | ren *, lower 8 | 9 | /* clean district name */ 10 | replace district = lower(district) 11 | ren district lgd_district_name 12 | gen lgd_state_name = "bihar" 13 | drop if lgd_district_name == "bihar" 14 | 15 | /* run standard district name fixes */ 16 | synonym_fix lgd_district_name, synfile(~/ddl/covid/b/str/lgd_district_fixes.txt) replace group(lgd_state_name) 17 | 18 | /* merge to the district key to get standardized ids */ 19 | merge 1:1 lgd_state_name lgd_district_name using $keys/lgd_district_key, assert(using match) keepusing(lgd_state_id lgd_district_id) 20 | keep if lgd_state_name == "bihar" 21 | assert _merge == 3 22 | drop _merge 23 | 24 | /* save clean bihar hospital data */ 25 | drop srno 26 | order lgd_state_id lgd_district_id lgd_state_name lgd_district_name 27 | save $health/bihar/bihar_moh_hospitals, replace 28 | 29 | 30 | /***************************/ 31 | /* prepare bihar case data */ 32 | /***************************/ 33 | /* open and lowercase raw data */ 34 | import excel $health/bihar/raw/bihar_case_data_may11.xlsx, clear firstrow 35 | ren *, lower 36 | drop sno 37 | 38 | /* rename vars */ 39 | ren causeofsample contacttrace1 40 | ren h contacttrace2 41 | 42 | /* clean district name */ 43 | replace district = lower(district) 44 | ren district lgd_district_name 45 | gen lgd_state_name = "bihar" 46 | 47 | /* run standard district name fixes */ 48 | synonym_fix lgd_district_name, synfile(~/ddl/covid/b/str/lgd_district_fixes.txt) replace group(lgd_state_name) 49 | 50 | /* merge to the district key to get standardized ids */ 51 | /* note we keep using-only districts --- they have no cases yet */ 52 | merge m:1 lgd_state_name lgd_district_name using $keys/lgd_district_key, keepusing(lgd_state_id lgd_district_id) 53 | keep if lgd_state_name == "bihar" 54 | review_merge lgd_district_name 55 | assert _merge != 1 56 | drop _merge 57 | 58 | /* save clean bihar case data */ 59 | order lgd_state_id lgd_district_id lgd_state_name lgd_district_name 60 | save $health/bihar/bihar_moh_cases, replace 61 | 62 | -------------------------------------------------------------------------------- /b/prep_ec_hosp.do: -------------------------------------------------------------------------------- 1 | use $covidpub/hospitals/ec_hosp_microdata, clear 2 | 3 | /* require employment of at least 5 to be counted */ 4 | /* NOTE: National Health Profile used 20, we get better correlation with DLHS/PC 5 | on gov hospitals with smaller 
thresholds. */ 6 | keep if emp_all >= 5 7 | 8 | /* create a firm-level counter to get a firm count */ 9 | gen count_all = 1 10 | 11 | /* collapse count and employment in each type of facility, by ec13 code */ 12 | /* note village id and town id are the same thing */ 13 | destring sector, replace 14 | collapse (firstnm) sector (sum) count_all emp_all, by(gov nic ec13_state_id ec13_district_id ec13_subdistrict_id ec13_village_id ec13_town_id) 15 | 16 | /* convert data into wide format so we can collapse to village/town level */ 17 | gen class = string(nic) + "_" + string(gov) 18 | ren *all *all_ 19 | drop nic gov 20 | reshape wide count_all_ emp_all_, i(ec13_state_id ec13_district_id ec13_subdistrict_id ec13_village_id ec13_town_id sector) j(class) string 21 | 22 | ren *_1 *_gov 23 | ren *_0 *_priv 24 | 25 | /* get PC village codes */ 26 | merge m:1 ec13_state_id ec13_district_id ec13_subdistrict_id ec13_village_id using $covidpub/keys/pc11r_ec13r_key, keepusing(pc11_state_id pc11_district_id pc11_subdistrict_id pc11_village_id) 27 | drop if _merge == 2 28 | foreach v in state district subdistrict village { 29 | ren pc11_`v'_id tmp_pc11_`v'_id 30 | } 31 | ren _merge _merge_v 32 | 33 | /* get PC town codes */ 34 | merge m:1 ec13_state_id ec13_district_id ec13_subdistrict_id ec13_town_id using $covidpub/keys/pc11u_ec13u_key, keepusing(pc11_state_id pc11_district_id pc11_subdistrict_id pc11_town_id) 35 | drop if _merge == 2 36 | ren _merge _merge_t 37 | 38 | /* restore the variables from the first merge */ 39 | replace pc11_state_id = tmp_pc11_state_id if mi(pc11_state_id) 40 | replace pc11_district_id = tmp_pc11_district_id if mi(pc11_district_id) 41 | replace pc11_subdistrict_id = tmp_pc11_subdistrict_id if mi(pc11_subdistrict_id) 42 | ren tmp_pc11_village_id pc11_village_id 43 | drop tmp* 44 | 45 | /* systematically rename all variables */ 46 | ren emp_all_* emp_* 47 | ren count_all_* num_* 48 | 49 | /* drop outpatient practices, psych hospitals, etc. */ 50 | drop *862* *871* *872* *879* *869* *873* 51 | 52 | ren *861* *hosp* 53 | drop _merge* 54 | 55 | /* label urban/rural sector */ 56 | recode sector 1=1 2=0 57 | rename sector rural 58 | 59 | /* label variables */ 60 | // label_from_gdoc, docid("1h6G4vYL3lvy4Bi8DTY3pMT2-5aVWOBoxAm3plx4M7qQ") 61 | save $covidpub/hospitals/ec_hospitals_tv, replace 62 | cap mkdir $covidpub/hospitals/csv 63 | export delimited $covidpub/hospitals/csv/ec_hospitals_tv.csv, replace 64 | 65 | /* COLLAPSE TO DISTRICT LEVEL */ 66 | use $covidpub/hospitals/ec_hospitals_tv, clear 67 | 68 | /* get district ids (can't use village/town match since we had some missing locations) */ 69 | ren pc11_state_id tmp_pc11_state_id 70 | ren pc11_district_id tmp_pc11_district_id 71 | 72 | /* get pc11 district ids */ 73 | merge m:1 ec13_state_id ec13_district_id using $covidpub/keys/pc11_ec13_district_key, keepusing(pc11_state_id pc11_district_id) 74 | drop if _merge == 2 75 | assert _merge == 3 76 | drop _merge 77 | 78 | /* see if they match (they better!)
*/ 79 | count if pc11_state_id != tmp_pc11_state_id & !mi(tmp_pc11_state_id) 80 | count if pc11_district_id != tmp_pc11_district_id & !mi(tmp_pc11_district_id) 81 | drop tmp* 82 | list *hosp* if mi(pc11_district_id) 83 | 84 | /* replace all delhi districts with missing so it all gets collapsed into 1 */ 85 | //replace pc11_district_id = "" if ec13_state_id == "07" 86 | 87 | /* sum the numbers to pc11 districts */ 88 | collapse (sum) *hosp*, by(pc11_state_id pc11_district_id) 89 | 90 | /* prefix all vars with EC prefix */ 91 | ren *hosp* ec_*hosp* 92 | 93 | /* drop instances without a state id */ 94 | drop if mi(pc11_state_id) 95 | 96 | /* label from the google sheet dictionary */ 97 | // label_from_gdoc, docid("1h6G4vYL3lvy4Bi8DTY3pMT2-5aVWOBoxAm3plx4M7qQ") 98 | save $covidpub/hospitals/pc11/ec_hospitals_dist_pc11, replace 99 | export delimited $covidpub/hospitals/csv/ec_hospitals_dist_pc11.csv, replace 100 | 101 | /* create LGD version */ 102 | convert_ids, from_ids(pc11_state_id pc11_district_id) to_ids(lgd_state_id lgd_district_id) labels key($keys/lgd_pc11_district_key_weights.dta) weight_var(pc11_lgd_wt_pop) metadata_urls(https://docs.google.com/spreadsheets/d/e/2PACX-1vSq7qkpXS2QFatP_35deNi0ZeHNVgSMr4JHKaxx3pZgefp4cw4iqRMo0GRPMe0-h3n6BEoHPuzQEgmc/pub?gid=1900447643&single=true&output=csv) 103 | save $covidpub/hospitals/ec_hospitals_dist, replace 104 | export delimited $covidpub/hospitals/csv/ec_hospitals_dist.csv, replace 105 | -------------------------------------------------------------------------------- /b/prep_ec_hosp_microdata.do: -------------------------------------------------------------------------------- 1 | /* open partially cleaned EC13 */ 2 | /* this temp file is generated by core/ecpc/ecmerge/collapse_ecs.do */ 3 | use $tmp/ec13_precollapse_tmp.dta, clear 4 | 5 | /* keep healthcare-related activities */ 6 | keep if inlist(nic, 861, 862, 869, 871, 872, 873, 879) 7 | 8 | drop *shric* 9 | 10 | /* only keep the sector, location, employment, and activity fields */ 11 | keep sector emp_all gov nic ec13_state_id ec13_district_id ec13_subdistrict_id ec13_village_id ec13_town_id 12 | 13 | save $covidpub/hospitals/ec_hosp_microdata, replace 14 | cap mkdir $covidpub/hospitals/csv 15 | export delimited $covidpub/hospitals/csv/ec_hosp_microdata.csv, replace 16 | -------------------------------------------------------------------------------- /b/prep_hosp_pca_vd.do: -------------------------------------------------------------------------------- 1 | /************/ 2 | /* Villages */ 3 | /************/ 4 | use $pc11/pc11_vd_clean.dta, clear 5 | 6 | /* keep the demographic and health infrastructure variables */ 7 | 8 | /* check missing data percent for various health center variables */ 9 | keep *id pc11_vd_nc* pc11_vd_asha pc11_vd_med* pc11_vd_fwc_cntr pc11_vd_mh_cln pc11_vd_disp pc11_vd_altmed_hosp pc11_vd_all_hosp pc11_vd_tb_cln pc11_vd_mcw_cntr pc11_vd_phs_cntr pc11_vd_ph_cntr pc11_vd_ch_cntr *_doc_* *_pmed_* pc11_vd_ch_cntr pc11_vd_ph_cntr pc11_vd_phs_cntr pc11_vd_tb_cln pc11_vd_all_hosp pc11_vd_disp pc11_vd_mh_cln pc11_vd_med_in_out_pat pc11_vd_med_c_hosp_home 10 | 11 | /* merge with pca clean data at village level */ 12 | merge 1:1 pc11_state_id pc11_district_id pc11_subdistrict_id pc11_village_id using $pc11/pc11r_pca_clean.dta, keepusing(pc11_pca_tot_p) 13 | keep if _merge == 3 14 | drop _merge 15 | 16 | /* save rural PCA and VD subset in data repo */ 17 | compress 18 | save $covidpub/hospitals/pc11r_hosp, replace 19 | cap mkdir $covidpub/hospitals/csv 20 | export delimited 
$covidpub/hospitals/csv/pc11r_hosp.csv, replace 21 | 22 | /*********/ 23 | /* Towns */ 24 | /*********/ 25 | use $pc11/pc11_td_clean.dta, clear 26 | 27 | /* keep the town directory hospital and clinic fields */ 28 | keep *id pc11_td_med* pc11_td_disp pc11_td_all_hosp pc11_td_alt_hospital *_doc_* *_pmed_* *_beds *clinic pc11_td_all_hospital pc11_td_disp pc11_td_tb_clinic pc11_td_nur_homes pc11_td_mh_clinic pc11_td_in_out_pat pc11_td_c_hosp_home 29 | 30 | /* rename badly named allh to all for consistency with rural */ 31 | ren *_allh_* *_all_hosp_* 32 | ren pc11_td_all_hospital pc11_td_all_hosp 33 | 34 | /* make a few other fields consistent */ 35 | ren pc11_td_tb_clinic pc11_td_tbc 36 | ren pc11_td_nur_homes pc11_td_nh 37 | ren pc11_td_mh_clinic pc11_td_mh 38 | 39 | /* merge with pca clean data at town level */ 40 | merge 1:1 pc11_state_id pc11_district_id pc11_subdistrict_id pc11_town_id using $pc11/pc11u_pca_clean.dta, keepusing(pc11_pca_tot_p) 41 | keep if _merge == 3 42 | drop _merge 43 | 44 | /* save urban PCA and TD subset in data repo */ 45 | compress 46 | save $covidpub/hospitals/pc11u_hosp, replace 47 | cap mkdir $covidpub/hospitals/csv 48 | export delimited $covidpub/hospitals/csv/pc11u_hosp.csv, replace 49 | -------------------------------------------------------------------------------- /b/prep_secc.do: -------------------------------------------------------------------------------- 1 | use ~/iec2/secc/final/collapse/village_consumption_imputed_pc11.dta, clear 2 | 3 | keep pc11_state_id pc11_village_id secc_cons_per_cap 4 | 5 | save $tmp/secc_cons_pc11, replace 6 | 7 | /* get district identifiers */ 8 | merge 1:1 pc11_state_id pc11_village_id using $pc11/pc11r_pca_clean.dta, keepusing(pc11_district_id) 9 | 10 | /* collapse to district level */ 11 | keep if _merge == 3 12 | drop _merge 13 | 14 | collapse (mean) secc_cons_per_cap, by(pc11_state_id pc11_district_id) 15 | 16 | save $tmp/secc_cons_pc11_district, replace 17 | 18 | 19 | use $hosp/hospitals_dist, clear 20 | 21 | merge 1:1 pc11_state_id pc11_district_id using $tmp/secc_cons_pc11_district 22 | keep if _merge == 3 23 | drop _merge 24 | 25 | 26 | sum pc_perk_beds_tot dlhs4_perk_total_beds 27 | corr pc_perk_beds_tot ec_perk_emp_hosp_gov 28 | 29 | corr pc_perk_beds_tot ec_perk_emp_hosp_tot 30 | 31 | corr pc_perk_beds_tot secc_cons_per_cap 32 | corr dlhs4_perk_total_beds secc_cons_per_cap 33 | -------------------------------------------------------------------------------- /b/push_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | 5 | ############# 6 | # ship data # 7 | ############# 8 | 9 | # set list of folders to be pushed from $covidpub (not all folders will be shared) 10 | dirs="covid demography estimates hospitals keys migration agmark nfhs hmis mortality" 11 | 12 | # send public data from these folders to Dropbox via rclone (rclone must be configured) 13 | for dir in $dirs; do 14 | rclone copy --progress ~/iec/covid/$dir my_remote:SamPaul/covid_data/$dir/ 15 | done 16 | 17 | -------------------------------------------------------------------------------- /b/str/cov19india_district_fixes.txt: -------------------------------------------------------------------------------- 1 | master,district,state 2 | y.s.r.,y.s.r kadapa,andhra pradesh 3 | y.s.r.,ysr kadapa,andhra pradesh 4 | y.s.r.,y.s.r. kadapa,andhra pradesh 5 | y.s.r.,y s r,andhra pradesh 6 | spsr nellore,s.p.s. 
nellore,andhra pradesh 7 | spsr nellore,sri potti sriramulu nellore,andhra pradesh 8 | dibang valley,upper dibang valley,arunachal pradesh 9 | papum pare,capital complex,arunachal pradesh 10 | south salmara mancachar,south salmara mankachar,assam 11 | aurangabad,aurangabad bihar,bihar 12 | purbi champaran,purba champaran,bihar 13 | purbi champaran,west champaran,bihar 14 | pashchim champaran,east champaran,bihar 15 | kaimur (bhabua),kaimur,bihar 16 | kaimur (bhabua),kaimur bhabua,bihar 17 | madhepura,madhopura,bihar 18 | bemetara,bametara,chhattisgarh 19 | bilaspur,bilaspur cg,chhattisgarh 20 | dantewada,dakshin bastar dantewada,chhattisgarh 21 | gariyaband,gariaband,chhattisgarh 22 | janjgir-champa,janjgir champa,chhattisgarh 23 | kabirdham,kabeerdham,chhattisgarh 24 | uttar bastar kanker,uttar bastar kanker,chhattisgarh 25 | bilaspur,gaurela pendra marwahi,chhattisgarh 26 | ahmadabad,ahmedabad,gujarat 27 | sabar kantha,sabarkantha,gujarat 28 | panch mahals,panchmahal,gujarat 29 | mehsana,mahesana,gujarat 30 | kachchh,kutch,gujarat 31 | chota udaipur,chhota udaipur,gujarat 32 | banas kantha,banaskantha,gujarat 33 | charkhi dadri,charki dadri,haryana 34 | shopiyan,shupiyan,jammu and kashmir 35 | baramulla,baramula,jammu and kashmir 36 | bandipora,bandipore,jammu and kashmir 37 | budgam,badgam,jammu and kasmir 38 | koderma,kodarma,jharkhand 39 | saraikela kharsawan,saraikela,jharkhand 40 | bengaluru urban,bengaluru,karnataka 41 | ahmednagar,ahmadnagar,maharashtra 42 | beed,bid,maharashtra 43 | raigarh,raigad,maharashtra 44 | gondiya,gondia,maharashtra 45 | buldhana,buldana,maharashtra 46 | deogarh,debagarh,odisha 47 | baleshwar,balasore,odisha 48 | jajapur,jajpur,odisha 49 | ferozepur,firozpur,punjab 50 | dholpur,dhaulpur,rajasthan 51 | chittaurgarh,chittorgarh,rajasthan 52 | west district,west sikkim,sikkim 53 | east district,east sikkim,sikkim 54 | north district,north sikkim,sikkim 55 | south district,south sikkim,sikkim 56 | the nilgiris,nilgiris,tamil nadu 57 | jayashankar bhupalapally,jayashankar,tamil nadu 58 | kumuram bheem asifabad,komaram bheem,tamil nadu 59 | kanniyakumari,kanyakumari,tamil nadu 60 | jangoan,jangaon,telangana 61 | jagitial,jagtial,telangana 62 | mahrajganj,maharajganj,uttar pradesh 63 | kheri,lakhimpur kheri,uttar pradesh 64 | barabanki,bara banki,uttar pradesh 65 | medinipur east,purba medinipur,west bengal 66 | 67 | -------------------------------------------------------------------------------- /b/str/cov19india_vaccine_district_fixes.txt: -------------------------------------------------------------------------------- 1 | master,lgd_district_name,lgd_state_name 2 | dohad,dahod,gujarat 3 | east nimar,khandwa,madhya pradesh 4 | y.s.r.,y.s.r. kadapa,andhra pradesh 5 | spsr nellore,s.p.s. 
nellore,andhra pradesh 6 | spsr nellore,sri potti sriramulu nellore,andhra pradesh 7 | dibang valley,upper dibang valley,arunachal pradesh 8 | papum pare,capital complex,arunachal pradesh 9 | south salmara mancachar,south salmara mankachar,assam 10 | aurangabad,aurangabad bihar,bihar 11 | purbi champaran,purba champaran,bihar 12 | purbi champaran,west champaran,bihar 13 | pashchim champaran,east champaran,bihar 14 | kaimur (bhabua),kaimur,bihar 15 | kaimur (bhabua),kaimur bhabua,bihar 16 | madhepura,madhopura,bihar 17 | bemetara,bametara,chhattisgarh 18 | bilaspur,bilaspur cg,chhattisgarh 19 | dantewada,dakshin bastar dantewada,chhattisgarh 20 | gariyaband,gariaband,chhattisgarh 21 | janjgir-champa,janjgir champa,chhattisgarh 22 | kabirdham,kabeerdham,chhattisgarh 23 | uttar bastar kanker,uttar bastar kanker,chhattisgarh 24 | bilaspur,gaurela pendra marwahi,chhattisgarh 25 | ahmadabad,ahmedabad,gujarat 26 | sabar kantha,sabarkantha,gujarat 27 | panch mahals,panchmahal,gujarat 28 | mehsana,mahesana,gujarat 29 | kachchh,kutch,gujarat 30 | chota udaipur,chhota udaipur,gujarat 31 | banas kantha,banaskantha,gujarat 32 | charkhi dadri,charki dadri,haryana 33 | shopiyan,shupiyan,jammu and kashmir 34 | baramulla,baramula,jammu and kashmir 35 | bandipora,bandipore,jammu and kashmir 36 | budgam,badgam,jammu and kasmir 37 | koderma,kodarma,jharkhand 38 | saraikela kharsawan,saraikela,jharkhand 39 | bengaluru urban,bengaluru,karnataka 40 | ahmednagar,ahmadnagar,maharashtra 41 | beed,bid,maharashtra 42 | raigarh,raigad,maharashtra 43 | gondiya,gondia,maharashtra 44 | buldhana,buldana,maharashtra 45 | deogarh,debagarh,odisha 46 | baleshwar,balasore,odisha 47 | jajapur,jajpur,odisha 48 | ferozepur,firozpur,punjab 49 | dholpur,dhaulpur,rajasthan 50 | chittaurgarh,chittorgarh,rajasthan 51 | west district,west sikkim,sikkim 52 | east district,east sikkim,sikkim 53 | north district,north sikkim,sikkim 54 | south district,south sikkim,sikkim 55 | the nilgiris,nilgiris,tamil nadu 56 | jayashankar bhupalapally,jayashankar,tamil nadu 57 | kumuram bheem asifabad,komaram bheem,tamil nadu 58 | kanniyakumari,kanyakumari,tamil nadu 59 | jangoan,jangaon,telangana 60 | jagitial,jagtial,telangana 61 | mahrajganj,maharajganj,uttar pradesh 62 | kheri,lakhimpur kheri,uttar pradesh 63 | barabanki,bara banki,uttar pradesh 64 | medinipur east,purba medinipur,west bengal 65 | tuticorin,thoothukkudi,tamil nadu 66 | -------------------------------------------------------------------------------- /b/str/covid_district_fixes.txt: -------------------------------------------------------------------------------- 1 | master,district,state 2 | vizianagaram,vizianagram,andhra pradesh 3 | kaimur bhabua,kaimur,bihar 4 | darbhanga,dharbanga,bihar 5 | kolar,kolara,karnataka 6 | alappuzha,alapuzha,kerala 7 | ernakulam,kochi,kerala 8 | adilabad,asifabad,telangana 9 | pauri garhwal,garhwal,uttarakhand 10 | nainital,nanital,uttarakhand 11 | amroha,phule,uttar pradesh 12 | amroha,jyotiba nagar,uttar pradesh 13 | budaun,budaon,uttar pradesh 14 | deoria,devariya,uttar pradesh 15 | mahamaya nagar,hathras,uttar pradesh 16 | 17 | -------------------------------------------------------------------------------- /b/str/lgd_district_fixes.txt: -------------------------------------------------------------------------------- 1 | master,lgd_district_name,lgd_state_name 2 | kaimur (bhabua),kaimur bhabua,bihar 3 | pashchim champaran,west champaran,bihar 4 | purba champaran,purbi champaran, bihar 5 | purbi champaran,east champaran,bihar 6 | purnia,purnea,bihar 
7 | kaimur (bhabua),kaimur,bihar 8 | muzaffarpur,muzzafarpur,bihar 9 | sheikhpura,sheikpura,bihar 10 | ayodhya,faizabad,uttar pradesh 11 | prayagraj,allahabad,uttar pradesh 12 | kamrup metro,kamrup m,assam 13 | kamrup rural,kamrup r,assam 14 | y s r,cuddapah,andhra pradesh 15 | y s r,y.s.r. (cuddapah),andhra pradesh 16 | nuh,mewat,haryana 17 | kalaburagi,gulbarga,karnataka 18 | amethi,c s m nagar,uttar pradesh 19 | amroha,jyotiba phule nagar,uttar pradesh 20 | amroha,jyotiba nagar,uttar pradesh 21 | leh ladakh,leh,ladakh 22 | leh ladakh,ladakh,ladakh 23 | sant kabeer nagar,skn,uttar pradesh 24 | bhadohi,srnb,uttar pradesh 25 | bhadohi,sant ravidas nagar,uttar pradesh 26 | hathras,mahamaya nagar,uttar pradesh 27 | janjgir-champa,janjgir,chhattisgarh 28 | s.a.s nagar,sahibzada ajit singh,punjab 29 | shahid bhagat singh nagar,sbs nagar,punjab 30 | faridabad,gbn faridabad,haryana 31 | barddhaman,bardhaman,west bengal 32 | gurugram,gurgaon,haryana 33 | khargone,west nimar,madhya pradesh 34 | east nimar,khandwa,madhya pradesh 35 | baleshwar,balasore,odisha 36 | kachchh,kutch,gujarat 37 | tuticorin,thoothukkudi,tamil nadu 38 | purba bardhaman,east bardhaman,west bengal 39 | paschim bardhaman,west bardhaman,west bengal 40 | east medinipur,medinipur east,west bengal 41 | west medinipur,medinipur west,west bengal 42 | -------------------------------------------------------------------------------- /b/update_case_cronjob.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # set this up with the following cron command (executes just after midnight daily): 4 | # $ crontab -l 5 | # $ 5 5 * * * $HOME/ddl/covid/b/update_case_cronjob.sh 6 | 7 | # depends on slack messaging hook in env variable SLACKKEY 8 | if [[ -z "$SLACKKEY" ]]; then 9 | printf "\nENV variable $SLACKKEY must be defined for cronjob to execute. Add to your .bashrc\n" 10 | fi 11 | 12 | # send init message via slack 13 | curl -X POST -H 'Content-type: application/json' --data '{"text":":building_construction: Beginning auto-update of COVID case and vaccination data"}' https://hooks.slack.com/services/$SLACKKEY 14 | 15 | # change dir to scratch for logging 16 | cd /scratch/`whoami` 17 | 18 | # run update script 19 | printf "\nbegin update script: ~/ddl/covid/b/update_case_vaccination_data.do\n" 20 | stata -b do ~/ddl/covid/b/update_case_vaccination_data.do 21 | 22 | # check log for errors 23 | printf "\nchecking Stata log for errors...\n" 24 | if egrep --before-context=1 --max-count=1 "^r\([0-9]+\);$" "update_case_vaccination_data.log" 25 | then 26 | # send error message 27 | printf "\nFAIL - you have a data dumpster fire on your hands!" 28 | curl -X POST -H 'Content-type: application/json' --data '{"text":":rotating_light: FAILURE: auto-update of COVID data had non-zero exit status"}' https://hooks.slack.com/services/$SLACKKEY 29 | exit 1 30 | else 31 | # send success message 32 | curl -X POST -H 'Content-type: application/json' --data '{"text":":not-a-dumpster-fire: Successful update of COVID data!"}' https://hooks.slack.com/services/$SLACKKEY 33 | printf "\nSuccess!" 34 | exit 0 35 | fi 36 | 37 | # move back to starting dir 38 | cd - 39 | -------------------------------------------------------------------------------- /b/update_case_vaccination_data.do: -------------------------------------------------------------------------------- 1 | /* this file updates just the case count and vaccination data, the most frequently updated files on our platform. 
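It pulls fresh case and vaccination extracts, verifies that district-date observations are unique and that the panel is square (the same number of days for every district), and then pushes the refreshed CSV and DTA files to Dropbox via rclone.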
*/ 2 | 3 | /* get new case data */ 4 | setc covid 5 | do $ccode/b/get_case_data.do 6 | import delimited using $covidpub/covid/csv/covid_infected_deaths.csv, clear 7 | 8 | /* check last date */ 9 | quietly { 10 | gen date_fmt = date(date, "DMY") 11 | egen latest_date = max(date_fmt) 12 | lab var latest_date "Last Day in the data:" 13 | format latest_date %td 14 | noi tab latest_date 15 | } 16 | 17 | /* run checks */ 18 | is_unique lgd_state_id lgd_state_name lgd_district_name date 19 | 20 | /* check that data is square */ 21 | gen n = 1 22 | bys lgd_state_id lgd_state_name lgd_district_name: egen num_days = total(n) 23 | qui distinct num_days 24 | local square_check = `r(ndistinct)' 25 | if `square_check' != 1 { 26 | disp_nice "Data is not square, it should have the same number of observations (days) for each district." 27 | exit 9 28 | } 29 | 30 | /* get new vaccination data */ 31 | do $ccode/b/get_vaccination_data.do 32 | 33 | /* import the csv data */ 34 | import delimited using $covidpub/covid/csv/covid_vaccination.csv, clear 35 | 36 | /* check last date */ 37 | quietly { 38 | gen date_fmt = date(date, "DMY") 39 | egen latest_date = max(date_fmt) 40 | lab var latest_date "Last Day in the data:" 41 | format latest_date %td 42 | noi tab latest_date 43 | } 44 | 45 | /* run checks */ 46 | is_unique lgd_state_id lgd_state_name lgd_district_name date 47 | 48 | /* check that data is square */ 49 | gen n = 1 50 | bys lgd_state_id lgd_state_name lgd_district_name: egen num_days = total(n) 51 | qui distinct num_days 52 | local square_check = `r(ndistinct)' 53 | if `square_check' != 1 { 54 | disp_nice "Data is not square, it should have the same number of observations (days) for each district." 55 | exit 9 56 | } 57 | 58 | /* check how many days are in the data - should be more than 97 as of 23 april 2021 */ 59 | qui sum num_days 60 | local num_days = `r(mean)' 61 | if `num_days' < 97 { 62 | disp_nice "Data is missing. There should be more than 97 days of data." 63 | exit 9 64 | } 65 | 66 | /* rclone needed data files to dropbox. 
CSV first */ 67 | shell rclone copyto --progress ~/iec/covid/covid/csv/covid_infected_deaths.csv my_remote:SamPaul/covid_data/covid/csv/covid_infected_deaths.csv 68 | shell rclone copyto --progress ~/iec/covid/covid/csv/covid_infected_deaths_pc11.csv my_remote:SamPaul/covid_data/covid/csv/covid_infected_deaths_pc11.csv 69 | shell rclone copyto --progress ~/iec/covid/covid/csv/covid_vaccination.csv my_remote:SamPaul/covid_data/covid/csv/covid_vaccination.csv 70 | 71 | /* now dta */ 72 | shell rclone copyto --progress ~/iec/covid/covid/covid_infected_deaths.dta my_remote:SamPaul/covid_data/covid/covid_infected_deaths.dta 73 | shell rclone copyto --progress ~/iec/covid/covid/covid_infected_deaths_pc11.dta my_remote:SamPaul/covid_data/covid/covid_infected_deaths_pc11.dta 74 | shell rclone copyto --progress ~/iec/covid/covid/covid_vaccination.dta my_remote:SamPaul/covid_data/covid/covid_vaccination.dta 75 | -------------------------------------------------------------------------------- /b/vaccination_plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import getpass 5 | import datetime 6 | import matplotlib.pyplot as plt 7 | 8 | # vaccination dataframe 9 | vdf = pd.read_stata(os.path.expanduser("~/iec/covid/covid/covid_vaccination.dta")) 10 | 11 | # case data 12 | cdf = pd.read_stata(os.path.expanduser("~/iec/covid/covid/covid_infected_deaths.dta")) 13 | 14 | # population data 15 | pdf = pd.read_stata(f"/scratch/{getpass.getuser()}/lgd_pca_district_pop.dta") 16 | 17 | # get dates as string 18 | cdf['string_date'] = cdf['date'].apply(lambda x: x.strftime("%d%m%Y")) 19 | vdf = vdf.rename(columns={"date": "string_date"}) 20 | 21 | # merge case data and vaccination data 22 | df = cdf.merge(vdf, on=["lgd_state_name", "lgd_state_id", "lgd_district_name", "string_date"], how="outer") 23 | 24 | # merge in population data 25 | df = df.merge(pdf, on=["lgd_state_name", "lgd_state_id", "lgd_district_name", "lgd_district_id"], how="left") 26 | 27 | # get date as datetime object 28 | df['date'] = df['string_date'].apply(lambda x: datetime.datetime.strptime(x, "%d%m%Y")) 29 | df = df.sort_values(["lgd_state_name", "lgd_district_name", "date"]) 30 | 31 | # keep only dates with vaccination data, after Jan 16 2021, before April 13 32 | df = df.loc[(df['date'] >= datetime.datetime.strptime("16012021", "%d%m%Y")) & 33 | (df['date'] < datetime.datetime.strptime("13042021", "%d%m%Y"))].copy() 34 | df['total_vaccinated'] = df['total_covaxin'] + df['total_covishied'] 35 | 36 | # calculate per capita vaccination rates 37 | df['vac_rate'] = df['total_vaccinated'] / df['lgd_pca_tot_p'] 38 | 39 | 40 | # ---- # 41 | # plot # 42 | # ---- # 43 | f, ax = plt.subplots(figsize=[12, 8]) 44 | 45 | # get state data 46 | state_data = df.groupby(['date', 'lgd_state_name']).sum()[['total_vaccinated', 'lgd_pca_tot_p']].reset_index() 47 | 48 | # get state total across all time 49 | state_total = state_data.groupby(['lgd_state_name']).sum()[['total_vaccinated', 'lgd_pca_tot_p']].reset_index() 50 | 51 | # calculate vaccination rate 52 | state_data['vac_rate'] = state_data['total_vaccinated'] / state_data['lgd_pca_tot_p'] 53 | state_total['vac_rate'] = state_total['total_vaccinated'] / state_total['lgd_pca_tot_p'] 54 | 55 | #sns.lineplot(data=state_data, x="date", y="vac_rate", hue="lgd_state_name") 56 | state_total = state_total.sort_values(by='vac_rate', ascending=False) 57 | state_total = state_total.set_index("lgd_state_name") 58 | 
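# note: a couple of states come out of the rate calculation with vac_rate of
# inf or 0, presumably because the population denominator is zero or missing
# after the merge for those states; they are dropped before plotting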
59 | # drop infinite values (sikkim) 60 | state_total = state_total.drop(state_total.loc[state_total["vac_rate"] == np.inf].index) 61 | 62 | # drop 0 values (lakshadweep) 63 | state_total = state_total.drop(state_total.loc[state_total["vac_rate"] == 0].index) 64 | 65 | state_total.plot.bar(y='vac_rate', ax=ax) 66 | ax.set_ylabel("Vaccination Rate", fontsize=12) 67 | ax.set_xlabel("State", fontsize=12) 68 | 69 | plt.savefig(os.path.expanduser("~/public_html/png/state_vac_rate.png"), bbox_inches="tight") 70 | -------------------------------------------------------------------------------- /build.md: -------------------------------------------------------------------------------- 1 | # The Build 2 | 3 | ![COVID-19 Repository Build](assets/covid_build_public.svg?raw=true "COVID Build") 4 | -------------------------------------------------------------------------------- /como/a/app_age_hr_interpolation.do: -------------------------------------------------------------------------------- 1 | /* open hazard ratio interpolations */ 2 | import delimited using $covidpub/covid/csv/uk_age_predicted_hr.csv, clear 3 | ren ln_hr_age_sex ln_hr_simple 4 | 5 | /* merge to actual data used in OpenSAFELY */ 6 | merge 1:1 age using $tmp/hr_full_dis, nogen keepusing(hr_age) 7 | ren hr_age hr_full_age 8 | 9 | merge 1:1 age using $tmp/hr_simp_dis, nogen keepusing(hr_age) 10 | ren hr_age hr_simple_age 11 | 12 | /* expand opensafely data to decimal ages */ 13 | expand 10 14 | bys age: egen s = seq() 15 | replace age = age - (s - 1) / 10 16 | replace ln_hr_simple = . if age != round(age) 17 | replace ln_hr_full = . if age != round(age) 18 | 19 | foreach v in simple full { 20 | gen ln_hr_`v'_age_dis = ln(hr_`v'_age) 21 | } 22 | 23 | keep if inrange(age, 18, 89) 24 | 25 | sort age 26 | twoway /// 27 | (line ln_hr_full age, lwidth(medthick) lpattern(solid) lcolor(black)) /// 28 | (line ln_hr_full_age_dis age, lwidth(medthick) lpattern(-) lcolor(blue)) /// 29 | , xscale(range(15 90)) xlabel(20(10)90) xtitle(Age) ytitle("Log Hazard Ratio") /// 30 | legend(region(lcolor(black)) rows(2) ring(0) pos(5) lab(1 "Interpolated Age Hazard Ratio") lab(2 "Discrete Age Hazard Ratio") size(small) symxsize(5) bm(tiny)) 31 | 32 | graphout age_interpolation_full, pdf 33 | 34 | // twoway /// 35 | // (line ln_hr_simple age, lwidth(medthick) lpattern(-) lcolor(gs8)) /// 36 | // (line ln_hr_simple_age_dis age, lwidth(medthick) lpattern(solid) lcolor(black)) /// 37 | // , xscale(range(15 90)) xlabel(20(10)90) xtitle(Age) ytitle("Log Hazard Ratio") /// 38 | // legend(region(lcolor(black)) rows(2) ring(0) pos(5) lab(1 "Discrete Age Hazard Ratio") lab(2 "Interpolated Hazard Ratio") size(small) symxsize(5) bm(tiny)) 39 | // 40 | // graphout age_interpolation_simple, pdf 41 | 42 | 43 | -------------------------------------------------------------------------------- /como/a/app_joint_condition.do: -------------------------------------------------------------------------------- 1 | /*************************************************************************************/ 2 | /* TEST: how much does interaction of comorbidities affect population relative risk? */ 3 | 4 | /* Note: 5 | 6 | We can only do this for the biomarker conditions that we have in the 7 | Indian data since we don't have microdata on the GBD variables.
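The comparison below computes each person's combined risk as the product of the hazard ratios for the conditions they actually have, averages it within age, and contrasts that with the same product built from age-level average prevalences, which ignores the correlation between conditions.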
*/ 8 | 9 | /*************************************************************************************/ 10 | 11 | global conditionlist male $hr_biomarker_vars 12 | 13 | /* open Indian comorbidity microdata */ 14 | use $health/dlhs/data/dlhs_ahs_covid_comorbidities, clear 15 | keep wt age $conditionlist 16 | 17 | /* merge primary hazard ratios */ 18 | merge m:1 age using $tmp/hr_full_cts 19 | 20 | /* calculate the risk factor for each individual, multiplying the hazard ratio by 21 | an indicator for condition existence. */ 22 | gen prr_health = 1 23 | foreach v in $conditionlist { 24 | 25 | qui gen prr_`v' = `v' * hr_`v' + (1 - `v') 26 | qui replace prr_health = prr_health * prr_`v' 27 | qui sum prr_health [aw=wt] 28 | di %20s "`v': " %5.2f `r(mean)' 29 | } 30 | 31 | /* collapse combined health PRR to age-level using survey weights */ 32 | collapse (mean) $conditionlist prr_health [aw=wt], by(age) 33 | ren prr_health prr_health_micro 34 | 35 | /* now repeat the exercise using the aggregate data (which ignores interactions) */ 36 | merge 1:1 age using $tmp/hr_full_cts, nogen 37 | gen prr_health_agg = 1 38 | foreach v in $conditionlist { 39 | qui gen prr_`v' = `v' * hr_`v' + (1 - `v') 40 | qui replace prr_health_agg = prr_health_agg * prr_`v' 41 | qui sum prr_health_agg 42 | di "`v': " %5.2f `r(mean)' 43 | } 44 | 45 | gen gap = prr_health_micro / prr_health_agg 46 | tsset age 47 | replace gap = (L3.gap + L2.gap + L.gap + gap + F.gap + F2.gap + F3.gap) / 7 if !mi(L3.gap) & !mi(F3.gap) 48 | keep if age <= 95 49 | 50 | /* plot the two age-specific PRR distributions */ 51 | sort age 52 | twoway /// 53 | (line prr_health_micro age, lwidth(medthick) lcolor(black)) /// 54 | (line prr_health_agg age, lwidth(medthick) lcolor(lavender)), /// 55 | ytitle("Aggregate Population Relative Risk from Health Conditions") xtitle("Age") /// 56 | legend(lab(1 "Microdata") lab(2 "Aggregate Data") ring(0) pos(5) cols(1) size(small) symxsize(5) bm(tiny) region(lcolor(black))) /// 57 | ylabel(1(.5)2.5) 58 | graphout prr_health_joint, pdf 59 | 60 | line gap age if age < 98, lwidth(medthick) ylabel(1 1.05 1.1 1.15) /// 61 | xtitle("Age") ytitle("Increased population relative risk" "from comorbidity correlation") 62 | graphout prr_ratio_micro, pdf 63 | -------------------------------------------------------------------------------- /como/a/app_table_age_bin_prev.do: -------------------------------------------------------------------------------- 1 | /* set variable labels */ 2 | cap prog drop label_vars 3 | prog def label_vars 4 | cap label var bmi_obeseI "Obese (class I)" 5 | cap label var bmi_obeseII "Obese (class II)" 6 | cap label var bmi_obeseIII "Obese (class III)" 7 | label var obese_1_2 "Obese (class 1 & 2)" 8 | label var obese_3 "Obese (class 3)" 9 | label var bp_high "Hypertension" 10 | label var diabetes_uncontr "Diabetes" 11 | label var asthma_ocs "Asthma" 12 | label var autoimmune_dz "Psoriasis, Rheumatoid" 13 | label var haem_malig_1 "Haematological Cancer" 14 | label var cancer_non_haem_1 "Non-haematological Cancer" 15 | label var chronic_heart_dz "Chronic Heart Disease" 16 | label var chronic_resp_dz "Chronic Respiratory Disease" 17 | label var immuno_other_dz "Other Immunosuppressive Conditions" 18 | label var kidney_dz "Kidney Disease" 19 | label var liver_dz "Chronic Liver Disease" 20 | label var neuro_other "Other Neurological Condition" 21 | label var stroke_dementia "Stroke / Dementia" 22 | end 23 | 24 | /* write table header */ 25 | cap file close fh 26 | file open fh using 
$out/app_table_age_bin_prev.tex, write replace 27 | 28 | file write fh "\begin{tabular}{lrrrrrr}" _n 29 | file write fh " & \multicolumn{6}{c}{\textbf{Age}} \\ " _n 30 | file write fh " & 18--39 & 40--49 & 50--59 & 60--69 & 70--79 & 80--99 \\ " _n 31 | 32 | /* india header */ 33 | file write fh " \textbf{India} & & & & & \\ " _n 34 | 35 | /* INDIA PREVALENCE TABLE */ 36 | use $tmp/prev_india, clear 37 | ren prev_* * 38 | 39 | /* get india age-specific population to weight GBD year vars */ 40 | merge 1:1 age using $tmp/india_pop, keep(match) nogen 41 | 42 | /* loop over condition list */ 43 | label_vars 44 | foreach condition in $hr_biomarker_vars $hr_gbd_vars { 45 | 46 | /* get variable label for condition */ 47 | local lab: variable label `condition' 48 | 49 | file write fh "\hspace{3mm} " 50 | file write fh "`lab' & " 51 | 52 | qui sum `condition' [aw=india_pop] if inrange(age, 18, 39) 53 | file write fh %5.1f (`r(mean)' * 100) " & " 54 | qui sum `condition' [aw=india_pop] if inrange(age, 40, 49) 55 | file write fh %5.1f (`r(mean)' * 100) " & " 56 | qui sum `condition' [aw=india_pop] if inrange(age, 50, 59) 57 | file write fh %5.1f (`r(mean)' * 100) " & " 58 | qui sum `condition' [aw=india_pop] if inrange(age, 60, 69) 59 | file write fh %5.1f (`r(mean)' * 100) " & " 60 | qui sum `condition' [aw=india_pop] if inrange(age, 70, 79) 61 | file write fh %5.1f (`r(mean)' * 100) " & " 62 | qui sum `condition' [aw=india_pop] if inrange(age, 80, 99) 63 | file write fh %5.1f (`r(mean)' * 100) " \\ " _n 64 | } 65 | 66 | /* UK HEADER */ 67 | file write fh " & & & & & \\ " _n 68 | file write fh " \textbf{England} & & & & & \\ " _n 69 | 70 | /* ENGLAND PREVALENCE TABLE */ 71 | 72 | /* combine UK prevalence data */ 73 | use $tmp/prev_uk_nhs_matched, clear 74 | ren prev_* * 75 | 76 | /* get UK age-specific population to weight GBD year vars */ 77 | merge 1:1 age using $tmp/uk_pop, keep(match) nogen 78 | 79 | /* loop over condition list */ 80 | label_vars 81 | foreach condition in $hr_biomarker_vars $hr_gbd_vars { 82 | 83 | /* get variable label for condition */ 84 | local lab: variable label `condition' 85 | 86 | file write fh "\hspace{3mm} " 87 | file write fh "`lab' & " 88 | 89 | qui sum `condition' [aw=uk_pop] if inrange(age, 18, 39) 90 | file write fh %5.1f (`r(mean)' * 100) " & " 91 | qui sum `condition' [aw=uk_pop] if inrange(age, 40, 49) 92 | file write fh %5.1f (`r(mean)' * 100) " & " 93 | qui sum `condition' [aw=uk_pop] if inrange(age, 50, 59) 94 | file write fh %5.1f (`r(mean)' * 100) " & " 95 | qui sum `condition' [aw=uk_pop] if inrange(age, 60, 69) 96 | file write fh %5.1f (`r(mean)' * 100) " & " 97 | qui sum `condition' [aw=uk_pop] if inrange(age, 70, 79) 98 | file write fh %5.1f (`r(mean)' * 100) " & " 99 | qui sum `condition' [aw=uk_pop] if inrange(age, 80, 99) 100 | file write fh %5.1f (`r(mean)' * 100) " \\ " _n 101 | 102 | } 103 | 104 | file write fh "\end{tabular}" _n 105 | 106 | file close fh 107 | 108 | -------------------------------------------------------------------------------- /como/a/app_table_nhs_vs_os.do: -------------------------------------------------------------------------------- 1 | /*************************************************************************/ 2 | /* create a table comparing opensafely prevalences to UK NHS prevalences */ 3 | /*************************************************************************/ 4 | 5 | /* set variable labels */ 6 | cap prog drop label_vars 7 | prog def label_vars 8 | label var bmi_obeseI "Obese (class I)" 9 | label var bmi_obeseII 
"Obese (class II)" 10 | label var bmi_obeseIII "Obese (class III)" 11 | label var bp_high "Hypertension" 12 | label var diabetes_uncontr "Diabetes" 13 | label var asthma_ocs "Asthma" 14 | label var autoimmune_dz "Psoriasis, Rheumatoid" 15 | label var haem_malig_1 "Haematological Cancer" 16 | label var cancer_non_haem_1 "Non-haematological Cancer" 17 | label var chronic_heart_dz "Chronic Heart Disease" 18 | label var chronic_resp_dz "Chronic Respiratory Disease" 19 | label var immuno_other_dz "Other Immunosuppressive Conditions" 20 | label var kidney_dz "Kidney Disease" 21 | label var liver_dz "Chronic Liver Disease" 22 | label var neuro_other "Other Neurological Condition" 23 | label var stroke_dementia "Stroke / Dementia" 24 | end 25 | 26 | /* ENGLAND PREVALENCE TABLE */ 27 | 28 | /* combine UK prevalence data */ 29 | use $tmp/prev_uk_nhs_matched, clear 30 | 31 | /* get age-specific population for weighted collapse */ 32 | merge 1:1 age using $tmp/uk_pop 33 | 34 | /* collapse to population prevalence */ 35 | drop male 36 | ren prev_* * 37 | 38 | /* merge in OpenSafely prevalences */ 39 | merge 1:1 age using $tmp/prev_uk_os, nogen 40 | foreach v in $hr_biomarker_vars $hr_gbd_vars { 41 | bys age: egen t = mean(prev_`v') 42 | replace prev_`v' = t if mi(prev_`v') 43 | drop t 44 | } 45 | 46 | /* limit to ages 18-99 */ 47 | keep if inrange(age, 18, 99) 48 | 49 | /* BEGIN TABLE OUTPUT */ 50 | cap file close fh 51 | file open fh using $out/app_table_os_vs_nhs.tex, write replace 52 | 53 | file write fh "\begin{tabular}{lcc}" _n 54 | file write fh " & Population & OpenSAFELY \\ " _n 55 | file write fh " & Prevalence & Prevalence \\ " _n 56 | 57 | /* loop over conditions sourced in NHS */ 58 | label_vars 59 | file write fh "\textbf{Source: NHS Health Survey for England} & & \\ " _n 60 | foreach condition in $hr_biomarker_vars { 61 | 62 | /* get variable label for condition */ 63 | local lab: variable label `condition' 64 | 65 | /* put in variable */ 66 | file write fh "\hspace{3mm} " 67 | file write fh "`lab' & " 68 | 69 | /* put in our prevalence */ 70 | qui sum `condition' [aw=uk_pop] 71 | file write fh %5.1f (`r(mean)' * 100) " & " 72 | 73 | /* put in OpenSAFELY prevalence */ 74 | qui sum prev_`condition' 75 | file write fh %5.1f (`r(mean)' * 100) " \\ " _n 76 | } 77 | 78 | /* put in COPD */ 79 | 80 | file write fh "\vspace{5mm} & & \\ " _n 81 | file write fh "\textbf{Source: Clinical Practice Research Datalink} & & \\ " _n 82 | file write fh "\hspace{3mm} " 83 | file write fh %5.1f "Chronic Respiratory Illness & " 84 | qui sum chronic_resp_dz [aw=uk_pop] 85 | file write fh %5.1f (`r(mean)' * 100) " & " 86 | 87 | /* put in OpenSAFELY prevalence */ 88 | qui sum prev_chronic_resp_dz 89 | file write fh %5.1f (`r(mean)' * 100) " \\ " _n 90 | 91 | /* GBD conditions */ 92 | file write fh "\vspace{5mm} & & \\ " _n 93 | file write fh "\textbf{Source: Global Burden of Disease} & & \\ " _n 94 | foreach condition in $hr_gbd_vars { 95 | 96 | /* get variable label for condition */ 97 | local lab: variable label `condition' 98 | 99 | /* put in variable */ 100 | file write fh "\hspace{3mm} " 101 | file write fh "`lab' & " 102 | 103 | /* put in our prevalence */ 104 | qui sum `condition' [aw=uk_pop] 105 | file write fh %5.1f (`r(mean)' * 100) " & " 106 | 107 | /* put in OpenSAFELY prevalence */ 108 | qui sum prev_`condition' 109 | file write fh %5.1f (`r(mean)' * 100) " \\ " _n 110 | } 111 | 112 | file write fh "\end{tabular}" _n 113 | 114 | file close fh 115 | 116 | 
-------------------------------------------------------------------------------- /como/a/covid_como_oscompare_tpl.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{p{6cm}cc} 2 | & \multicolumn{2}{c}{\textbf{England Prevalence (\%) }} \\[0.5ex] & 3 | \emph{OpenSafely Sample} & \emph{This Study} \\[2ex] 4 | Age 18-39 & \num{34.4} & \num{$$uk_age_18_40$$} \\[0.25ex] 5 | Age 40-49 & \num{16.5} & \num{$$uk_age_40_50$$} \\[0.25ex] 6 | Age 50-59 & \num{17.6} & \num{$$uk_age_50_60$$}\\[0.25ex] 7 | Age 60-69 & \num{13.8} & \num{$$uk_age_60_70$$}\\[0.25ex] 8 | Age 70-79 & \num{11.2} & \num{$$uk_age_70_80$$}\\[0.25ex] 9 | Age 80-99 & \num{6.5} & \num{$$uk_age_80$$} \\[0.25ex] 10 | Male & \num{49.9} & \num{$$male$$} \\[0.25ex] 11 | Diabetes (Controlled) & \num{6.0} & \num{$$uk_prev_diabetes_contr$$} \\[0.25ex] 12 | Diabetes (Uncontrolled) & \num{2.8} & \num{$$uk_prev_diabetes_uncontr$$} \\[0.25ex] 13 | Hypertension & \num{34.2} & \num{$$uk_prev_hypertension_both$$} \\[0.25ex] 14 | Obese (class I \& II) & \num{19.1} & \num{$$uk_prev_obese_1_2$$} \\[0.25ex] 15 | Obese (class III) & \num{2.7} & \num{$$uk_prev_obese_3$$} \\[0.25ex] 16 | Chronic Heart Disease & \num{6.7} & \num{$$uk_gbd_chronic_heart_dz_mu$$} \\[0.25ex] 17 | Chronic Respiratory Disease & \num{4.1} & \num{$$uk_prev_chronic_resp_dz$$} 18 | \\[0.25ex] 19 | Asthma & \num{1.7} & \num{$$uk_gbd_asthma_ocs_mu$$} \\[0.25ex] 20 | Kidney Disease & \num{6.3} & \num{$$uk_gbd_kidney_dz_mu$$} \\[0.25ex] 21 | Chronic Liver Disease & \num{0.7} & \num{$$uk_gbd_liver_dz_mu$$} \\[0.25ex] 22 | Haematological Cancer & \num{0.1} & \num{$$uk_gbd_haem_malig_1_mu$$}\\[0.25ex] 23 | Non-haematological Cancer & \num{0.5} & \num{$$uk_gbd_cancer_non_haem_1_mu$$} \\[0.25ex] 24 | Stroke, Dementia & \num{2.1} & \num{$$uk_gbd_stroke_dementia_mu$$} \\[0.25ex] 25 | Other Neurological Condition & \num{1.0} & \num{$$uk_gbd_neuro_other_mu$$} \\[0.25ex] 26 | Psoriasis, Rheumatoid & \num{5.1} & \num{$$uk_gbd_autoimmune_dz_mu$$} \\[0.25ex] 27 | Other Immunosuppressive Conditions & \num{1.6} & \num{$$uk_gbd_immuno_other_dz_mu$$} \\[0.25ex] 28 | \end{tabular} 29 | -------------------------------------------------------------------------------- /como/a/covid_como_sumhr_tpl.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{p{7cm}cp{1.25cm}p{1.5cm}} 2 | & \textbf{{\footnotesize Individual}} & 3 | \multicolumn{2}{c}{{\textbf{\footnotesize{Population}}}} \\ 4 | & \textbf{{\footnotesize Relative Risk}} & 5 | \multicolumn{2}{c}{{\textbf{\footnotesize{Relative Risk}}}} \\[0.75ex] 6 | & & \emph{India} & \emph{England} \\[2ex] 7 | %Male & 1.99 & \num{$$india_male_risk$$} & \num{$$uk_male_risk$$} \\[0.25ex] 8 | Diabetes (Controlled) & \num{1.31} & \num{$$india_diabetes_contr_risk$$} & \num{$$uk_diabetes_contr_risk$$} \\[0.25ex] 9 | Diabetes (Uncontrolled) & \num{1.94} & \num{$$india_diabetes_uncontr_risk$$} & \num{$$uk_diabetes_uncontr_risk$$} \\[0.25ex] 10 | Hypertension & \num{0.89} & \num{$$india_bp_high_risk$$} & \num{$$uk_bp_high_risk$$} \\[0.25ex] 11 | Obese (class I \& II) & \num{1.15} & \num{$$india_obese_1_2_risk$$} & \num{$$uk_obese_1_2_risk$$} \\[0.25ex] 12 | Obese (class III) & \num{1.91} & \num{$$india_obese_3_risk$$} & \num{$$uk_obese_3_risk$$} \\[0.25ex] 13 | \\ 14 | Chronic Heart Disease & \num{1.17} & \num{$$india_chronic_heart_dz_risk$$} & \num{$$uk_chronic_heart_dz_risk$$} \\[0.25ex] 15 | Chronic Respiratory Disease & \num{1.62} & \num{$$india_chronic_resp_dz_risk$$} &
\num{$$uk_chronic_resp_dz_risk$$} \\[0.25ex] 16 | Asthma & \num{1.13} & \num{$$india_asthma_ocs_risk$$} & \num{$$uk_asthma_ocs_risk$$} \\[0.25ex] 17 | Kidney Disease & \num{1.42} & \num{$$india_kidney_dz_risk$$} & \num{$$uk_kidney_dz_risk$$} \\[0.25ex] 18 | Chronic Liver Disease & \num{1.73} & \num{$$india_liver_dz_risk$$} & \num{$$uk_liver_dz_risk$$} \\[0.25ex] 19 | \\ 20 | Haematological Cancer & \num{2.79} & \num{$$india_haem_malig_1_risk$$} & \num{$$uk_haem_malig_1_risk$$} \\[0.25ex] 21 | Non-haematological Cancer & \num{1.71} & \num{$$india_cancer_non_haem_1_risk$$} & \num{$$uk_cancer_non_haem_1_risk$$} \\[0.25ex] 22 | Stroke, Dementia & \num{2.15} & \num{$$india_stroke_dementia_risk$$} & \num{$$uk_stroke_dementia_risk$$} \\[0.25ex] 23 | Other Neurological Condition & \num{2.56} & \num{$$india_neuro_other_risk$$} & \num{$$uk_neuro_other_risk$$} \\[0.25ex] 24 | Psoriasis, Rheumatoid & \num{1.19} & \num{$$india_autoimmune_dz_risk$$} & \num{$$uk_autoimmune_dz_risk$$} \\[0.25ex] 25 | Other Immunosuppressive Conditions & \num{1.69} & \num{$$india_immuno_other_dz_risk$$} & \num{$$uk_immuno_other_dz_risk$$} \\[0.25ex] 26 | \end{tabular} 27 | 28 | -------------------------------------------------------------------------------- /como/a/covid_como_sumstats_tpl.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{p{7cm}p{1.1cm}p{1cm}} 2 | & \multicolumn{2}{c}{\textbf{{\footnotesize Prevalence (\%) }}} \\[0.5ex] & \emph{India} & \emph{England} \\[2ex] 3 | Age 18-39 & \num{$$india_age18_40_mu$$} & \num{$$uk_age_18_40$$} \\[0.25ex] 4 | Age 40-49 & \num{$$india_age40_50_mu$$} & \num{$$uk_age_40_50$$} \\[0.25ex] 5 | Age 50-59 & \num{$$india_age50_60_mu$$} & \num{$$uk_age_50_60$$}\\[0.25ex] 6 | Age 60-69 & \num{$$india_age60_70_mu$$} & \num{$$uk_age_60_70$$}\\[0.25ex] 7 | Age 70-79 & \num{$$india_age70_80_mu$$} & \num{$$uk_age_70_80$$}\\[0.25ex] 8 | Age 80-99 & \num{$$india_age80__mu$$} & \num{$$uk_age_80$$} \\[0.25ex] 9 | Male & \num{$$india_male_mu$$} & \num{$$male$$} \\[0.25ex] 10 | \\ 11 | Diabetes (Controlled) & \num{$$india_diabetes_contr_mu$$} & \num{$$uk_prev_diabetes_contr$$} \\[0.25ex] 12 | Diabetes (Uncontrolled) & \num{$$india_diabetes_uncontr_mu$$} & \num{$$uk_prev_diabetes_uncontr$$} \\[0.25ex] 13 | Hypertension & \num{$$india_hypertension_both_mu$$} & \num{$$uk_prev_hypertension_both$$} \\[0.25ex] 14 | Obese (class I \& II) & \num{$$india_obese_1_2_mu$$} & \num{$$uk_prev_obese_1_2$$} \\[0.25ex] 15 | Obese (class III) & \num{$$india_obese_3_mu$$} & \num{$$uk_prev_obese_3$$} \\[0.25ex] 16 | \\ 17 | Chronic Heart Disease & \num{$$india_gbd_chronic_heart_dz_mu$$} & \num{$$uk_gbd_chronic_heart_dz_mu$$} \\[0.25ex] 18 | Chronic Respiratory Disease & \num{$$india_gbd_chronic_resp_dz_mu$$} & \num{$$uk_prev_chronic_resp_dz$$} \\[0.25ex] 19 | Asthma & \num{$$india_gbd_asthma_ocs_mu$$} & \num{$$uk_gbd_asthma_ocs_mu$$} \\[0.25ex] 20 | Kidney Disease & \num{$$india_gbd_kidney_dz_mu$$} & \num{$$uk_gbd_kidney_dz_mu$$} \\[0.25ex] 21 | Chronic Liver Disease & \num{$$india_gbd_liver_dz_mu$$} & \num{$$uk_gbd_liver_dz_mu$$} \\[0.25ex] 22 | \\ 23 | Haematological Cancer & \num{$$india_gbd_haem_malig_1_mu$$} & \num{$$uk_gbd_haem_malig_1_mu$$}\\[0.25ex] 24 | Non-haematological Cancer & \num{$$india_gbd_cancer_non_haem_1_mu$$} & \num{$$uk_gbd_cancer_non_haem_1_mu$$} \\[0.25ex] 25 | Stroke, Dementia & \num{$$india_gbd_stroke_dementia_mu$$} & \num{$$uk_gbd_stroke_dementia_mu$$} \\[0.25ex] 26 | Other Neurological Condition & \num{$$india_gbd_neuro_other_mu$$} & 
\num{$$uk_gbd_neuro_other_mu$$} \\[0.25ex] 27 | Psoriasis, Rheumatoid & \num{$$india_gbd_autoimmune_dz_mu$$} & \num{$$uk_gbd_autoimmune_dz_mu$$} \\[0.25ex] 28 | Other Immunosuppressive Conditions & \num{$$india_gbd_immuno_other_dz_mu$$} & \num{$$uk_gbd_immuno_other_dz_mu$$} \\[0.25ex] 29 | \end{tabular} 30 | -------------------------------------------------------------------------------- /como/a/examine_risk_factors_poverty.do: -------------------------------------------------------------------------------- 1 | /* get a poverty measure from... */ 2 | use $secc/final/collapse/village_consumption_imputed_pc11, clear 3 | 4 | /* get district identifiers */ 5 | merge 1:1 pc11_state_id pc11_village_id using $keys/pc11_village_key, keepusing(pc11_district_id) 6 | keep if _merge == 3 7 | drop _merge 8 | 9 | /* collapse to district */ 10 | collapse (mean) secc_cons_per_cap [aw=pc11_pop], by(pc11_state_id pc11_district_id) 11 | save $tmp/pc11_cons, replace 12 | 13 | /* open risk factor file from examine_risk_factors.do */ 14 | use $tmp/rfs, clear 15 | 16 | /* collapse risk factors to district level */ 17 | /* [note this is trusting district-level age distributions to be correct -- 18 | which it won't be given only 1000 obs per district. alternately we could 19 | do some kind of imputation here to bring down the noise level-- reweight 20 | the age distribution based on the true age distribution from the SECC, 21 | but keep the conditions as they are.] */ 22 | collapse (mean) hr_full_bp_high bp_high *resp* ln_rf_full_nond_conditions *rf_full_abd_c *rf_full_diab ln_rf_full_c rf_full_c rf_simple_agesex_c ln_rf_simple_agesex_c [aw=wt], by(pc11_state_id pc11_district_id) 23 | 24 | /* merge in our best measure of poverty -- currently access to */ 25 | merge 1:1 pc11_state_id pc11_district_id using $covidpub/demography/pc11/water_district_pc11, keepusing(pc11r_hl_dw_loc_inprem_sh pc11u_hl_dw_loc_inprem_sh) gen(_mw) 26 | drop if _mw == 2 27 | drop _mw 28 | 29 | merge 1:1 pc11_state_id pc11_district_id using $covidpub/demography/pc11/dem_district_pc11, keepusing(pc11r_pca_tot_p pc11u_pca_tot_p) gen(_md) 30 | drop if _md == 2 31 | drop _md 32 | 33 | /* get consumption data */ 34 | merge 1:1 pc11_state_id pc11_district_id using $tmp/pc11_cons 35 | drop if _merge == 2 36 | drop _merge 37 | ren secc_cons_per_cap cons 38 | 39 | /* generate share with water */ 40 | gen water_share = ((pc11r_hl_dw_loc_inprem_sh * pc11r_pca_tot_p) + (pc11u_hl_dw_loc_inprem_sh * pc11u_pca_tot_p)) / (pc11r_pca_tot_p + pc11u_pca_tot_p) 41 | 42 | /* examine risk curves as a function of the water share */ 43 | binscatter rf_full_c rf_simple_agesex_c water_share, 44 | graphout rf_water 45 | 46 | binscatter ln_rf_full_c ln_rf_simple_agesex_c water_share, 47 | graphout ln_rf_water 48 | 49 | /* examine correlation of risk factors with water share */ 50 | reg rf_simple_agesex_c water_share 51 | reg rf_full_c water_share 52 | 53 | reg rf_full_diab water_share 54 | reg ln_rf_full_diab water_share 55 | 56 | reg ln_rf_full_abd_c water_share 57 | reg ln_rf_simple_agesex_c water_share 58 | 59 | /* repeat analysis with secc consumption */ 60 | binscatter rf_full_c rf_simple_agesex_c cons, 61 | graphout rf_cons 62 | 63 | binscatter ln_rf_full_c ln_rf_simple_agesex_c cons, 64 | graphout ln_rf_cons 65 | 66 | reg rf_simple_agesex_c cons 67 | reg rf_full_c cons 68 | 69 | reg rf_full_diab cons 70 | reg ln_rf_full_diab cons 71 | 72 | reg ln_rf_full_abd_c cons 73 | reg ln_rf_simple_agesex_c cons 74 | 75 | /* various conditions vs income */ 76 | binscatter 
rf_full_diab cons 77 | graphout diabetes_cons 78 | 79 | /* compare resp conditions with income */ 80 | foreach v of varlist *resp* { 81 | binscatter `v' cons 82 | graphout `v'_cons 83 | } 84 | 85 | /* hypertension vs. income */ 86 | binscatter bp_high cons 87 | graphout hyp_cons 88 | 89 | -------------------------------------------------------------------------------- /como/a/prep_eng_india_prev_compare.do: -------------------------------------------------------------------------------- 1 | /* merge India and UK age-specific health prevalences with country prefixes */ 2 | use $tmp/prev_india, clear 3 | ren prev_* i_* 4 | 5 | merge 1:1 age using $tmp/prev_uk_nhs_matched, nogen 6 | ren prev_* u_* 7 | 8 | 9 | /* combine all three classes of obesity */ 10 | gen u_obese = u_obese_1_2 + u_obese_3 11 | gen i_obese = i_obese_1_2 + i_obese_3 12 | 13 | /* label the variables we will graph */ 14 | label var u_diabetes_contr "Diabetes (Controlled, UK)" 15 | label var u_diabetes_uncontr "Diabetes (Uncontrolled, UK)" 16 | label var u_hypertension_contr "Hypertension (Controlled, UK)" 17 | label var u_hypertension_uncontr "Hypertension (Uncontrolled, UK)" 18 | label var u_obese "Obese (BMI >= 30, UK)" 19 | label var i_diabetes_contr "Diabetes (Controlled, India)" 20 | label var i_diabetes_uncontr "Diabetes (Uncontrolled, India)" 21 | label var i_hypertension_contr "Hypertension (Controlled, India)" 22 | label var i_hypertension_uncontr "Hypertension (Uncontrolled, India)" 23 | label var i_obese "Obese (BMI >= 30, India)" 24 | 25 | /* apply a smoother to the India microdata conditions */ 26 | sort age 27 | tsset age 28 | foreach v in i_diabetes_contr i_diabetes_uncontr i_hypertension_uncontr i_hypertension_contr i_obese { 29 | replace `v' = (L2.`v' + L1.`v' + `v' + F1.`v' + F2.`v') / 5 if !mi(L2.`v') & !mi(F2.`v') 30 | replace `v' = (L1.`v' + `v' + F1.`v') / 3 if (mi(L2.`v') | mi(F2.`v')) & !mi(L1.`v') & !mi(F1.`v') 31 | } 32 | sort age 33 | keep if age < 90 34 | 35 | drop *diabetes_no_measure *hypertension_both 36 | 37 | /* save file for figure generation */ 38 | save $tmp/prev_compare, replace 39 | -------------------------------------------------------------------------------- /como/a/sumstats.do: -------------------------------------------------------------------------------- 1 | /* generate stats used in paper */ 2 | use $tmp/combined, clear 3 | 4 | /* sample size */ 5 | count 6 | 7 | /* age median and IQR */ 8 | sum age, d 9 | 10 | /* risk factor severity */ 11 | foreach v in $age_vars male $hr_biomarker_vars { 12 | disp_nice "`v'" 13 | tab `v' [aw=wt], mi 14 | } 15 | 16 | /* open GBD data */ 17 | foreach country in india uk { 18 | use $health/gbd/gbd_nhs_conditions_`country'.dta, clear 19 | disp_nice "`country'" 20 | foreach v in $hr_gbd_vars { 21 | qui sum gbd_`v' if age == -90 22 | di %25s "`v': " %6.1f (`r(mean)' * 100) "%" 23 | } 24 | } 25 | 26 | /* self-report measures of liver and kidney disease for reference */ 27 | use $tmp/combined, clear 28 | tab kidney_dz [aw=wt] 29 | tab liver_dz [aw=wt] 30 | -------------------------------------------------------------------------------- /como/b/fit_cts_uk_age_hr.m: -------------------------------------------------------------------------------- 1 | 2 | med_age = [29 45 55 65 75 85]'; 3 | hr_age_sex = [.05 .27 1 2.61 7.61 26.27]'; 4 | hr_full = [.07 .31 1 2.09 4.77 12.64]'; 5 | 6 | ln_hr_age_sex = log(hr_age_sex); 7 | ln_hr_full = log(hr_full); 8 | 9 | age = [18:100]'; 10 | 11 | %% use a polynomial interpolation since spline fails on the endpoints 12 | 
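%% (the six binned log hazard ratios are fit at their bin median ages with a
%% cubic polynomial ('poly3'); predictions above age 90 are top-coded below)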
fit_age_sex = fit(med_age, ln_hr_age_sex, 'poly3') 13 | fit_full = fit(med_age, ln_hr_full, 'poly3') 14 | 15 | %% graph the fits 16 | clf; 17 | hold on 18 | scatter(med_age, ln_hr_full); 19 | plot(fit_full, age, zeros(83, 1)) 20 | xlabel("log odds ratio") 21 | ylabel("age") 22 | b = gca; legend(b,'off'); 23 | write_png('/scratch/pn/fit_full') 24 | 25 | clf; 26 | hold on 27 | scatter(med_age, ln_hr_age_sex); 28 | plot(fit_age_sex, age, zeros(83, 1)) 29 | xlabel("log odds ratio") 30 | ylabel("age") 31 | write_png('/scratch/pn/fit_age_sex') 32 | 33 | %% generate predicted values 34 | predicted_hr_age_sex = fit_age_sex(age); 35 | predicted_hr_full = fit_full(age); 36 | 37 | %% topcode predicted values at age 90 value since we don't have certainty over the 38 | %% age distribution here or whether HRs keep rising 39 | predicted_hr_age_sex(age > 90) = predicted_hr_age_sex(age == 90); 40 | predicted_hr_full(age > 90) = predicted_hr_full(age == 90); 41 | 42 | %% write these to a file 43 | writematrix([age predicted_hr_age_sex predicted_hr_full],'/scratch/pn/uk_age_fits.csv') 44 | 45 | %% prepend a header to the file 46 | system('echo "age,ln_hr_age_sex,ln_hr_full" >~/iec/covid/covid/csv/uk_age_predicted_hr.csv'); 47 | system('cat /scratch/pn/uk_age_fits.csv >>~/iec/covid/covid/csv/uk_age_predicted_hr.csv'); 48 | fprintf("Writing uk_age_predicted_hr.csv\n"); 49 | -------------------------------------------------------------------------------- /como/b/flatten_hr_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | def flatten_hr_data(hr_var, fn_in, fn_out): 5 | """ 6 | flatten the hazard ratio data from the NHS study to 7 | be a 1D array with the names of the variables and the 8 | selected hazard ratio variable. 9 | """ 10 | # read in the HR data 11 | df = pd.read_stata(fn_in) 12 | 13 | # select just the variables we need 14 | df = df[["variable", hr_var]].T 15 | 16 | # set new column names to combine hr and variable names 17 | new_cols = [f"{x}_{hr_var}" for x in df.loc["variable"]] 18 | df.columns = new_cols 19 | 20 | # drop the variable column 21 | df = df.drop(["variable"]) 22 | 23 | # set the index value to 0 24 | df.index = [0] 25 | 26 | # write out the file 27 | df.to_csv(fn_out) 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /como/b/old/prep_india_sim_prevalence.do: -------------------------------------------------------------------------------- 1 | global conditionlist copd asthma 2 | 3 | /* import india prevalence csv */ 4 | import delimited using $comocsv/india_condition_prevalence.csv, varnames(1) clear 5 | drop source 6 | ren *, lower 7 | 8 | /* reshape wide on conditions */ 9 | replace condition = condition[_n-1] if mi(condition) 10 | drop if condition == "Hypertension (2)" 11 | replace condition = "hypertension" if condition == "Hypertension (1)" 12 | replace condition = lower(condition) 13 | 14 | /* create a new age-granular dataset and fill it according to the age bins */ 15 | set obs 100 16 | gen age = _n 17 | 18 | foreach condition in $conditionlist { 19 | gen prev_`condition' = . 
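/* fill each single year of age with the prevalence of the source age bin that contains it */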
20 | forval age = 1/100 { 21 | sum prevalence if condition == "`condition'" & inrange(`age', startage, endage) 22 | if `r(N)' > 0 { 23 | replace prev_`condition' = `r(mean)' if age == `age' 24 | } 25 | } 26 | replace prev_`condition' = prev_`condition' / 100 27 | } 28 | 29 | /* drop the original fields and limit to ages close to our sample */ 30 | keep if inrange(age, 16, 90) 31 | keep age prev* 32 | drop prevalence 33 | 34 | ren prev* india_prev* 35 | 36 | /* save uk prevalences */ 37 | save $tmp/india_prevalences, replace 38 | -------------------------------------------------------------------------------- /como/b/old/prep_populations.do: -------------------------------------------------------------------------------- 1 | // /********************************************/ 2 | // /* convert the UK age distribution to Stata */ 3 | // /********************************************/ 4 | // OBSOLETE-- WE FOUND A BETTER FILE 5 | // 6 | // /* open the raw csv file */ 7 | // import delimited using $comocsv/uk_demography.csv, clear 8 | // 9 | // /* split the age into a start and end */ 10 | // gen agestart = real(substr(age, 1, strpos(age, "-") - 1)) 11 | // gen ageend = real(substr(age, strpos(age, "-") + 1, .)) 12 | // 13 | // /* create 5 rows for each age to have granular age */ 14 | // expand 5 15 | // ren age age_str 16 | // bys age_str: egen age = seq() 17 | // replace age = age + agestart - 1 18 | // 19 | // /* cut population by 5 since we multiplied each bin by 5 */ 20 | // replace uk_pop = uk_pop / 5 21 | // 22 | // /* keep the ages and vars that we want */ 23 | // drop agestart ageend age_str 24 | // keep if inrange(age, 16, 90) 25 | // 26 | // /* smooth the population across bins */ 27 | // lpoly uk_pop age, bw(2) gen(uk_pop_smooth) at(age) 28 | // 29 | // order age 30 | // save $tmp/uk_pop, replace 31 | 32 | // /****************************************/ 33 | // /* create age-granular india population */ 34 | // /****************************************/ 35 | // OBSOLETE-- WE NOW GET THESE FROM THE POPULATION CENSUS 36 | // 37 | // /* open district data with 5-year age bins */ 38 | // use $covidpub/demography/pc11/age_bins_district_t_pc11.dta, clear 39 | // 40 | // /* collapse to national level */ 41 | // gen x = 1 42 | // collapse (sum) age_*_t, by(x) 43 | // 44 | // /* reshape to long on ages */ 45 | // ren *_t * 46 | // ren age_* india_pop* 47 | // reshape long india_pop, j(age) i(x) 48 | // format india_pop %10.0f 49 | // drop x 50 | // 51 | // /* expand to have one row per age */ 52 | // expand 5 53 | // ren age agebin 54 | // bys agebin: egen age = seq() 55 | // replace age = age + agebin - 1 56 | // replace india_pop = india_pop / 5 57 | // 58 | // drop agebin 59 | // keep if inrange(age, 16, 90) 60 | // order age 61 | // 62 | // /* smooth across age bins */ 63 | // lpoly india_pop age, bw(3) gen(india_pop_smooth) at(age) 64 | // 65 | // save $tmp/india_pop, replace 66 | 67 | /*************************/ 68 | /* repeat at state level */ 69 | /*************************/ 70 | 71 | /* open district data with 5-year age bins */ 72 | use $covidpub/demography/pc11/age_bins_district_t_pc11.dta, clear 73 | 74 | /* collapse to state level */ 75 | collapse (sum) age_*_t, by(pc11_state_id) 76 | 77 | /* reshape to long on ages */ 78 | ren *_t * 79 | ren age_* state_pop* 80 | reshape long state_pop, j(age) i(pc11_state_id) 81 | format state_pop %10.0f 82 | 83 | /* expand to have one row per age */ 84 | expand 5 85 | ren age agebin 86 | bys pc11_state_id agebin: egen age = seq() 87 | replace age 
= age + agebin - 1 88 | replace state_pop = state_pop / 5 89 | 90 | drop agebin 91 | keep if inrange(age, 16, 90) 92 | order pc11_state_id age 93 | 94 | /* smooth across age bins */ 95 | gen state_pop_smooth = . 96 | levelsof pc11_state_id, local(states) 97 | foreach state in `states' { 98 | lpoly state_pop age if pc11_state_id == "`state'", bw(3) gen(tmp) at(age) 99 | replace state_pop_smooth = tmp if pc11_state_id == "`state'" 100 | drop tmp 101 | } 102 | 103 | ren state_pop state_pop_binned 104 | ren state_pop_smooth state_pop 105 | save $tmp/state_pop, replace 106 | -------------------------------------------------------------------------------- /como/b/old/prep_uk_bmi.do: -------------------------------------------------------------------------------- 1 | import delimited using IHME_GBD_2015_OBESITY_PREVALENCE_1980_2015_Y2017M06D12.CSV , clear 2 | 3 | keep if location_name == "United Kingdom" & sex == "Both" & year == 2015 & metric == "Percent" 4 | capdrop location* sex* year measure metric 5 | list 6 | 7 | 8 | import delimited using IHME_GBD_2015_OVERWEIGHT_PREVALENCE_1980_2015_Y2017M06D12.CSV , clear 9 | 10 | keep if location_name == "United Kingdom" & sex == "Both" & year == 2015 & metric == "Percent" 11 | capdrop location* sex* year measure metric 12 | list 13 | 14 | -------------------------------------------------------------------------------- /como/b/prep_england_prevalence.do: -------------------------------------------------------------------------------- 1 | /* first import and calculate COPD rate, this is the only variable coming from its own source */ 2 | import delimited using $comocsv/copd_mclean_rates.csv, clear 3 | 4 | /* calculate the total population */ 5 | gen pop_total = pop_female + pop_male 6 | 7 | /* take the weighted average of male and female rates */ 8 | gen prev_copd = rate100k_male_mean*(pop_male / pop_total) + rate100k_female_mean*(pop_female / pop_total) 9 | 10 | /* convert the per 100k rate to a prevalence */ 11 | replace prev_copd = prev_copd / 100000 12 | 13 | /* keep only 18 - 100 year olds */ 14 | keep if inrange(age, 18, 100) 15 | keep age prev_copd 16 | save $tmp/copd_uk_prev, replace 17 | 18 | /* create full condition list */ 19 | global conditionlist diabetes_contr diabetes_uncontr hypertension_contr hypertension_uncontr hypertension_both asthma obese_1_2 obese_3 20 | 21 | /* import england data */ 22 | import delimited using $comocsv/uk_condition_prevalence.csv, varnames(1) clear 23 | drop source v* 24 | 25 | /* reshape wide on conditions */ 26 | replace condition = condition[_n-1] if mi(condition) 27 | 28 | /* replace names */ 29 | // replace condition = "diabetes_diagnosed" if condition == "Diabetes" */ 30 | replace condition = "diabetes_contr" if condition == "Diabetes (2)" 31 | replace condition = "diabetes_uncontr" if condition == "Diabetes (2a)" 32 | replace condition = "hypertension_contr" if condition == "Hypertension (3)" 33 | replace condition = "hypertension_both" if condition == "Hypertension (3a)" 34 | replace condition = "hypertension_uncontr" if condition == "Hypertension (3b)" 35 | // replace condition = "hypertension_diagnosis" if condition == "Hypertension (1)" */ 36 | // replace condition = "hypertension_both2" if condition == "Hypertension (2)" */ 37 | // replace condition = "hypertension_biomarker2" if condition == "Hypertension (2a)" */ 38 | replace condition = "obese_1_2" if condition == "Obesity class 1-2" 39 | replace condition = "obese_3" if condition == "Obesity class 3" 40 | replace condition = lower(condition) 41 | 
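/* The loop below expands the banded prevalences (startage/endage rows in the
   CSV) to single years of age: each age takes the prevalence of the band that
   contains it. A minimal illustration of the band lookup, using the Diabetes (2)
   bands from the CSV (0.007 for ages 16-44, 0.088 for ages 45-64); the two
   display lines are illustration only and do not touch the data in memory: */
display cond(inrange(30, 16, 44), 0.007, 0.088)   // age 30 falls in the 16-44 band -> .007
display cond(inrange(50, 16, 44), 0.007, 0.088)   // age 50 does not -> .088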
42 | 43 | /* create a new dataset and fill it with a manual reshape */ 44 | set obs 100 45 | gen age = _n 46 | 47 | foreach condition in $conditionlist { 48 | gen prev_`condition' = . 49 | forval age = 1/100 { 50 | qui sum prevalence if condition == "`condition'" & inrange(`age', startage, endage) 51 | if `r(N)' > 0 { 52 | replace prev_`condition' = `r(mean)' if age == `age' 53 | } 54 | } 55 | } 56 | drop prevalence 57 | keep age prev_* 58 | 59 | /* Update 06/22: correct data input of COPD with calculations at the top of this file */ 60 | merge 1:1 age using $tmp/copd_uk_prev, nogen 61 | replace prev_copd = 0 if mi(prev_copd) 62 | 63 | /* old code: 64 | merge 1:1 age using $comocsv/uk_copd_prevalence, keepusing(prevalence) nogen 65 | drop prev_copd 66 | ren prevalence prev_copd */ 67 | 68 | /* drop the original fields and limit to ages in study */ 69 | keep if inrange(age, 18, 100) 70 | 71 | /* rename these to the variables that match OpenSAFELY */ 72 | gen prev_bp_high = prev_hypertension_both 73 | ren prev_asthma prev_asthma_no_ocs 74 | ren prev_copd prev_chronic_resp_dz 75 | 76 | ren prev* uk_prev* 77 | 78 | /* save england prevalences */ 79 | save $tmp/uk_prevalences, replace 80 | -------------------------------------------------------------------------------- /como/b/prep_health_data.do: -------------------------------------------------------------------------------- 1 | /* Combine AHS and DLHS data */ 2 | 3 | /***************************/ 4 | /* Merge DLHS and AHS Data */ 5 | /***************************/ 6 | /* open the dlhs data */ 7 | use $health/dlhs/dlhs_cab, clear 8 | 9 | /* rename variables to align with ahs */ 10 | ren hv05 sex 11 | ren hv06 usual_residance 12 | ren age_test age 13 | ren hv82 weight_in_kg 14 | ren hv85 length_height_cm 15 | ren hv93a bp_systolic_1_reading 16 | ren hv93b bp_systolic_2_reading 17 | ren hv94a bp_diastolic_1_reading 18 | ren hv94b bp_diastolic_2_reading 19 | ren hv19 illness_type 20 | ren hv21 symptoms_pertaining_illness 21 | ren hv23 diagnosed_for 22 | ren hv02 sl_no 23 | ren hv91a fasting_blood_glucose_mg_dl 24 | ren hv91 fasting 25 | ren hv25 regular_treatment 26 | 27 | /* match variables to format in AHS */ 28 | tostring sl_no, format("%05.0f") replace 29 | 30 | /* mark as dlhs */ 31 | gen survey = 1 32 | 33 | /* append the ahs data */ 34 | append using $health/ahs/ahs_cab 35 | 36 | /* mark as ahs */ 37 | replace survey = 2 if mi(survey) 38 | 39 | /************/ 40 | /* Cleaning */ 41 | /************/ 42 | 43 | /* AGE */ 44 | gen age_new = . 
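/* the replaces below fill age_new with the first non-missing of age_calc, age,
   and age_comb, in that order of preference. A minimal sketch of the same
   first-non-missing logic on made-up values (illustration only, no data used): */
display cond(!mi(.), ., cond(!mi(42), 42, 40))   // calculated age missing, reported age 42 -> shows 42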
45 | 46 | /* use AHS calculated age if it exists */ 47 | replace age_new = age_calc if !mi(age_calc) 48 | 49 | /* use AHS/DLHS reported age if the calculated age does not exist */ 50 | replace age_new = age if mi(age_new) & !mi(age) 51 | 52 | /* use age from the hosehold survey for AHS observations with no matched CAB observation */ 53 | replace age_new = age_comb if mi(age_new) & !mi(age_comb) 54 | 55 | /* replace age with age_new */ 56 | drop age age_calc age_comb 57 | ren age_new age 58 | 59 | /* SEX */ 60 | /* use the AHS household sex for observations with no matched CAB observation */ 61 | replace sex = sex_comb if mi(sex) & !mi(sex_comb) 62 | 63 | /* drop if missing age or sex */ 64 | drop if mi(age) | mi(sex) | sex == 3 65 | 66 | /* drop those under 18 */ 67 | drop if age < 18 68 | 69 | /* SAMPLE */ 70 | /* define a variable to clarify the sample for each variable */ 71 | gen sample = ahs_merge 72 | replace sample = 4 if dlhs_merge == 2 & !mi(dlhs_merge) 73 | replace sample = 5 if dlhs_merge == 3 & !mi(dlhs_merge) 74 | cap label define sample 1 "1 AHS cab" 2 "2 AHS comb" 3 "3 AHS cab & comb" 4 "4 DLHS comb" 5 "5 DLHS cab & comb" 75 | label values sample sample 76 | label var sample "DLHS or AHS modules for each observation" 77 | 78 | /* create new numeric unique identifer */ 79 | gen long uid = _n 80 | tostring uid, format("%08.0f") replace 81 | 82 | /* drop some large, unneeded variables */ 83 | drop index prim_key qs* qe* qh* hv* 84 | 85 | /* save */ 86 | compress 87 | save $health/dlhs/data/dlhs_ahs_merged, replace 88 | -------------------------------------------------------------------------------- /como/b/prep_hrs.do: -------------------------------------------------------------------------------- 1 | /**********************************/ 2 | /* Create wide hazard ratio files */ 3 | /**********************************/ 4 | 5 | /* Convert HR CSV to Stata */ 6 | import delimited $comocsv/uk_nhs_hazard_ratios.csv, clear 7 | 8 | /* label variables */ 9 | lab var hr_age_sex "hazard ratio age-sex adjusted" 10 | lab var hr_age_sex_low "hazard ratio age-sex adjusted lower CI" 11 | lab var hr_age_sex_up "hazard ratio age-sex adjusted upper CI" 12 | lab var hr_full "hazard ratio fully adjusted" 13 | lab var hr_full_low "hazard ratio fully adjusted lower CI" 14 | lab var hr_full_up "hazard ratio fully adjusted upper CI" 15 | lab var hr_full_ec "hazard ratio fully adjusted early censoring" 16 | lab var hr_full_low_ec "hazard ratio fully adjusted early censoring lower CI" 17 | lab var hr_full_up_ec "hazard ratio fully adjusted early censoring upper CI" 18 | 19 | /* shorten age-sex HRs */ 20 | ren *age_sex* *simp* 21 | 22 | /* save as dta file */ 23 | save $tmp/uk_nhs_hazard_ratios, replace 24 | 25 | /* raw data has risk factors in long format-- reshape them to wide */ 26 | /* loop over two types of hazard ratios */ 27 | foreach hr in hr_full hr_simp { 28 | 29 | /* open the long format HRs */ 30 | use $tmp/uk_nhs_hazard_ratios, clear 31 | 32 | /* keep the risk factors, the desired hazard ratio, and the confidence interval */ 33 | keep variable `hr' `hr'_low `hr'_up 34 | 35 | /* transform hazard ratio into a relative risk, assuming base mortality rate of 1% */ 36 | replace `hr' = (1 - exp(`hr' * ln(1 - 0.01))) / 0.01 37 | 38 | /* replace the confidence interval with a standard error. 39 | These are odds ratios. CIs for log odds are symmetric. 
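The log-scale SE is therefore recovered from either tail as
   (ln(point) - ln(lower)) / 1.96 or (ln(upper) - ln(point)) / 1.96; e.g. for a
   ratio of 2.0 with CI (1.8, 2.2), (ln(2.0) - ln(1.8)) / 1.96 is roughly 0.054.
   The two gen lines below compute one estimate from each tail.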
*/ 40 | gen `hr'_lnse = (ln(`hr') - ln(`hr'_low)) / 1.96 41 | gen `hr'_lnse2 = (ln(`hr'_up) - ln(`hr')) / 1.96 42 | 43 | /* reshape them to wide format */ 44 | gen v1 = 0 45 | keep `hr' `hr'_lnse v1 variable 46 | reshape wide `hr' `hr'_lnse, j(variable) i(v1) string 47 | ren `hr'_lnse* *_hr_lnse 48 | ren `hr'* *_`hr' 49 | 50 | /* save the wide hazard ratios with standard errors */ 51 | save $tmp/uk_nhs_hazard_ratios_flat_`hr', replace 52 | } 53 | 54 | /* convert continuous age HRs to stata */ 55 | import delimited $covidpub/covid/csv/uk_age_predicted_hr.csv, clear 56 | ren ln_hr_age_sex ln_hr_simp 57 | gen hr_simp_age_cts = exp(ln_hr_simp) 58 | gen hr_full_age_cts = exp(ln_hr_full) 59 | drop ln_* 60 | save $tmp/uk_age_predicted_hr, replace 61 | -------------------------------------------------------------------------------- /como/b/prep_ny_mortality.do: -------------------------------------------------------------------------------- 1 | /********************************/ 2 | /* prepare O/R from NY epi data */ 3 | /********************************/ 4 | import delimited using $comocsv/nystate_or.csv, varnames(1) clear 5 | 6 | /* bottom-code everything at 1. it's not plausible that these conditions are protective */ 7 | foreach v of varlist * { 8 | if "`v'" == "age" continue 9 | 10 | winsorize `v' 1 100, replace 11 | } 12 | 13 | /* make the data granular on age */ 14 | gen start_age = real(substr(age, 1, 2)) 15 | drop age 16 | 17 | expand 10 18 | bys start_age: egen increment = seq() 19 | gen age = start_age + increment - 1 20 | 21 | /* clean up unused vars */ 22 | drop increment start_age 23 | 24 | /* expand to cover 18-19 */ 25 | expand 3 if age == 20 26 | bys age: egen s = seq() 27 | replace age = age + 1 - s 28 | drop s 29 | 30 | /* expand to cover 80-99 */ 31 | expand 21 if age == 79 32 | bys age: egen s = seq() 33 | replace age = age - 1 + s 34 | drop s 35 | 36 | /* prefix with ny to avoid name collision */ 37 | rename * hr_* 38 | ren hr_age age 39 | 40 | /* assume this new york measure is mostly controlled, though we don't know */ 41 | ren hr_diabetes_uncontr hr_diabetes_contr 42 | 43 | /* clean and save */ 44 | order age 45 | save $tmp/nystate_hr, replace 46 | 47 | 48 | /**************************************/ 49 | /* prepare O/Rs from NY Cummings data */ 50 | /**************************************/ 51 | import delimited using $comocsv/ny_cummings.csv, varnames(1) clear 52 | ren * hr_* 53 | 54 | expand 82 55 | gen age = _n + 17 56 | 57 | save $tmp/nycu_hr, replace 58 | -------------------------------------------------------------------------------- /como/b/prep_pop_sex.do: -------------------------------------------------------------------------------- 1 | /*************************/ 2 | /* prep india male share */ 3 | /*************************/ 4 | import excel $covidpub/demography/pc11/pc11_agesex.xls, firstrow clear 5 | 6 | keep if place_name == "India" 7 | 8 | keep age tot_p tot_m tot_f 9 | assert tot_m + tot_f == tot_p 10 | 11 | destring age, replace force 12 | drop if mi(age) 13 | 14 | /* take 5-year MA of population data series to pull out bumps */ 15 | gen x = 1 16 | xtset x age 17 | gen p_smooth = (L2.tot_p + L1.tot_p + tot_p + F.tot_p + F2.tot_p) / 5 if !mi(L2.tot_p) & !mi(F2.tot_p) 18 | replace p_smooth = (L1.tot_p + tot_p + F.tot_p + F2.tot_p) / 4 if mi(L2.tot_p) & !mi(F2.tot_p) & mi(p_smooth) 19 | replace p_smooth = (L2.tot_p + L1.tot_p + tot_p + F.tot_p) / 4 if mi(F2.tot_p) & !mi(L2.tot_p) & mi(p_smooth) 20 | replace p_smooth = tot_p if mi(p_smooth) 21 | 22 | /* repeat 
for male population to get smoothed sex ratio */ 23 | gen m_smooth = (L2.tot_m + L1.tot_m + tot_m + F.tot_m + F2.tot_m) / 5 if !mi(L2.tot_m) & !mi(F2.tot_m) 24 | replace m_smooth = (L1.tot_m + tot_m + F.tot_m + F2.tot_m) / 4 if mi(L2.tot_m) & !mi(F2.tot_m) & mi(m_smooth) 25 | replace m_smooth = (L2.tot_m + L1.tot_m + tot_m + F.tot_m) / 4 if mi(F2.tot_m) & !mi(L2.tot_m) & mi(m_smooth) 26 | replace m_smooth = tot_m if mi(m_smooth) 27 | 28 | /* calculate male share */ 29 | gen male = m_smooth / p_smooth 30 | 31 | ren p_smooth india_pop 32 | 33 | keep if inrange(age, 18, 100) 34 | keep age male india_pop 35 | save $tmp/india_pop, replace 36 | 37 | /********************************/ 38 | /* UK population and male share */ 39 | /********************************/ 40 | import delimited using $comocsv/england_gender_age.csv, clear 41 | gen male_share = male / total 42 | drop male female 43 | ren male_share male 44 | ren total uk_pop 45 | 46 | /* distribute age 90 weight across remaining years, since age 90 is actually 90+ */ 47 | /* this is basically inconsequential since it is very few people and COPD is the 48 | only variable that is non-constant from 90-99. */ 49 | expand 10 if age == 90 50 | replace uk_pop = uk_pop/10 if inrange(age, 90, 99) 51 | replace age = _n - 1 if age == 90 52 | 53 | save $tmp/uk_pop, replace 54 | 55 | -------------------------------------------------------------------------------- /como/como_programs.do: -------------------------------------------------------------------------------- 1 | /****************************************/ 2 | /* set globals used throughout analysis */ 3 | /****************************************/ 4 | 5 | /* MAIN COMORBID CONDITION SETS USED IN THE PAPER */ 6 | 7 | /* define age bin indicator variables */ 8 | global age_vars age18_40 age40_50 age50_60 age60_70 age70_80 age80_ 9 | 10 | /* define biomarker variables from DLHS/AHS that match NHS hazard ratio vars */ 11 | global hr_biomarker_vars obese_1_2 obese_3 bp_high diabetes_uncontr diabetes_contr 12 | 13 | /* define non-biomarker GBD variables that match NHS hazard ratio vars */ 14 | global hr_gbd_vars asthma_ocs autoimmune_dz haem_malig_1 cancer_non_haem_1 /// 15 | chronic_heart_dz chronic_resp_dz immuno_other_dz kidney_dz liver_dz neuro_other /// 16 | stroke_dementia 17 | 18 | /* define varlist found only in opensafely */ 19 | global hr_os_only_vars asthma_no_ocs cancer_non_haem_1_5 cancer_non_haem_5 diabetes_no_measure haem_malig_1_5 haem_malig_5 organ_transplant spleen_dz 20 | 21 | 22 | /* SOME ADDITIONAL VARIABLE GROUPS USED IN EXPLORATION AND DEBUGGING */ 23 | 24 | /* define self-report vars found in DLHS/AHS (but not used in risk analysis) */ 25 | global hr_selfreport_vars chronic_heart_dz stroke_dementia liver_dz kidney_dz autoimmune_dz /// 26 | cancer_non_haem_1 haem_malig_1 chronic_resp_dz 27 | 28 | 29 | /*******************************/ 30 | /* define some helper programs */ 31 | /*******************************/ 32 | /*********************************************************/ 33 | /* sc: a function to scatter multiple variables over age */ 34 | /*********************************************************/ 35 | cap prog drop sc 36 | prog def sc 37 | 38 | syntax varlist, [name(string) yscale(passthru) ylabel(passthru) legend(passthru)] 39 | tokenize `varlist' 40 | 41 | /* set a default yscale */ 42 | if mi("`yscale'") local yscale yscale(log) 43 | if mi("`ylabel'") local ylabel ylabel(.125 .25 1 4 16 64) 44 | 45 | /* set a default name */ 46 | if mi("`name'") local name euripides 47 | 
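/* note: the while loop below assembles the full -twoway- call one variable at a
   time, appending a "(line <var> age, ...)" term to the local macro `command'
   and then calling -mac shift- to move to the next variable. For two hypothetical
   variables hr_a and hr_b, "sc hr_a hr_b" therefore ends up running
   twoway (line hr_a age, ...) (line hr_b age, ...), legend(...) */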
48 | /* loop over the outcome vars */ 49 | while (!mi("`1'")) { 50 | 51 | /* store the variable label */ 52 | local label : variable label `1' 53 | 54 | /* add the line plot for this variable to the twoway command string */ 55 | local command `command' (line `1' age, `yscale' `ylabel' xtitle("`label'") ytitle("Mortality Hazard Ratio") lwidth(medthick) ) 56 | 57 | /* get the next variable in the list */ 58 | mac shift 59 | } 60 | 61 | /* draw the graph */ 62 | twoway `command', `legend' 63 | graphout `name' 64 | end 65 | /****************** end sc *********************** */ 66 | 67 | /************************************************************/ 68 | /* scp: a function to compare multiple prevalences over age */ 69 | /************************************************************/ 70 | cap prog drop scp 71 | prog def scp 72 | 73 | syntax varlist, [name(string) yscale(passthru) yline(passthru) ytitle(passthru) legend(passthru)] 74 | tokenize `varlist' 75 | 76 | /* set defaults */ 77 | if mi("`yscale'") local yscale 78 | if mi("`name'") local name euripides 79 | if mi("`ytitle'") local ytitle ytitle("Prevalence") 80 | 81 | /* loop over the outcome vars */ 82 | while (!mi("`1'")) { 83 | 84 | /* store the variable label */ 85 | local label : variable label `1' 86 | 87 | /* add the line plot for this variable to the twoway command string */ 88 | local command `command' (line `1' age, `yscale' xtitle("`label'") `ytitle' lwidth(medthick) ) 89 | 90 | /* get the next variable in the list */ 91 | mac shift 92 | } 93 | 94 | /* draw the graph */ 95 | twoway `command', `yline' name(`name', replace) `legend' 96 | graphout `name' 97 | end 98 | /****************** end scp *********************** */ 99 | 100 | 101 | 102 | // CONDITION LIST 103 | 104 | // AGE/SEX 105 | // age18_40 106 | // age40_50 107 | // age50_60 108 | // age60_70 109 | // age70_80 110 | // age80_ 111 | // male 112 | 113 | // BIOMARKERS (plus diabetes_contr from DLHS/AHS) 114 | // obese_1_2 115 | // obese_3 116 | // bp_high 117 | // diabetes_uncontr 118 | // diabetes_contr 119 | 120 | // GLOBAL BURDEN OF DISEASE 121 | // asthma_ocs 122 | // autoimmune_dz 123 | // haem_malig_1 124 | // cancer_non_haem_1 125 | // chronic_heart_dz 126 | // chronic_resp_dz 127 | // immuno_other_dz 128 | // kidney_dz 129 | // liver_dz 130 | // neuro_other 131 | // stroke_dementia 132 | 133 | // NOT USED 134 | // asthma_no_ocs 135 | // cancer_non_haem_1_5 136 | // cancer_non_haem_5 137 | // diabetes_no_measure 138 | // haem_malig_1_5 139 | // haem_malig_5 140 | // organ_transplant 141 | // spleen_dz 142 | -------------------------------------------------------------------------------- /como/csv/copd_mclean_rates.csv: -------------------------------------------------------------------------------- 1 | age,rate100k_male_mean,rate100k_male_lower,rate100k_male_upper,rate100k_female_mean,rate100k_female_lower,rate100k_female_upper,pop_male,pop_female 2 | 35,92.0,47.6,160.7,152.7,93.3,235.8,334046,334931 3 | 36,124.7,72.7,199.7,66.9,30.6,126.9,340309,341747 4 | 37,139.4,85.2,215.3,163.9,103.9,245.9,345224,348338 5 | 38,246.3,173.4,339.4,183.2,120.8,266.6,360518,363211 6 | 39,244.2,173.7,333.8,160.7,104.0,237.2,360518,363211 7 | 40,190.9,130.6,269.5,236.4,168.1,323.2,385058,393469 8 | 41,239.9,171.4,326.7,361.6,275.3,466.4,376049,383443 9 | 42,312.6,234.2,408.9,345.9,262.0,448.1,384557,393354 10 | 43,457.9,362.5,570.6,298.2,221.3,393.1,384595,392455 11 | 44,396.3,308.4,501.6,488.3,389.5,604.5,391659,396844 12 | 45,412.1,322.4,518.9,428.7,337.2,537.4,389810,399415 13 | 
46,484.1,387.7,597.1,599.8,490.6,726.1,392097,401209 14 | 47,624.2,514.4,750.4,618.2,507.6,745.8,388933,397082 15 | 48,597.9,489.1,723.9,731.5,609.4,870.9,381317,388739 16 | 49,849.7,717.9,998.7,969.1,825.6,1130.3,373943,379531 17 | 50,787.4,658.8,933.8,1061.6,909.3,1232.1,361409,366327 18 | 51,985.8,839.8,1149.8,1075.2,920.9,1247.9,346924,351937 19 | 52,1206.2,1041.2,1389.9,1408.9,1227.7,1609.4,339352,344086 20 | 53,1502.0,1316.1,1706.8,1393.4,1212.6,1593.7,332154,336435 21 | 54,1470.1,1283.9,1675.8,1669.6,1468.2,1890.8,319149,324806 22 | 55,1689.8,1483.2,1917.2,1751.2,1541.7,1981.3,308806,313516 23 | 56,2019.8,1791.3,2269.4,2366.7,2116.4,2638.5,297135,303305 24 | 57,2278.0,2033.7,2543.5,2396.8,2145.9,2669.0,297836,305403 25 | 58,2451.5,2197.8,2726.4,2681.2,2415.3,2968.4,295033,301455 26 | 59,3040.2,2756.0,3345.7,2760.1,2487.5,3054.6,285974,295038 27 | 60,2959.3,2674.3,3266.4,3418.8,3112.2,3747.5,288397,297457 28 | 61,3702.1,3384.0,4041.9,3472.9,3168.2,3799.0,293435,305710 29 | 62,3947.1,3622.0,4293.5,3702.7,3392.1,4034.0,302337,313305 30 | 63,4109.5,3785.3,4454.0,3858.7,3547.8,4189.5,319545,332459 31 | 64,4732.4,4400.8,5082.3,4274.2,3963.2,4603.1,348284,361362 32 | 65,5121.9,4760.9,5503.0,4547.6,4211.9,4902.9,268158,280482 33 | 66,6037.8,5600.6,6500.1,4925.2,4535.4,5339.5,260309,273585 34 | 67,6286.9,5858.5,6738.4,5673.2,5272.3,6096.5,259773,273394 35 | 68,7180.7,6695.8,7691.4,5646.0,5227.4,6089.1,240572,256467 36 | 69,7589.7,7075.3,8131.6,5839.5,5401.4,6303.6,213801,231079 37 | 70,7473.9,6925.4,8054.3,6810.5,6301.9,7349.2,191917,210047 38 | 71,8378.5,7792.2,8997.3,6519.3,6022.8,7045.8,199921,219818 39 | 72,8704.3,8108.5,9332.2,7241.5,6728.8,7783.0,197591,219848 40 | 73,9778.5,9144.1,10445.4,7168.4,6647.5,7719.3,192255,214700 41 | 74,9398.4,8759.5,10071.6,7579.6,7037.4,8152.4,181894,206560 42 | 75,9158.3,8510.7,9842.2,7440.4,6893.2,8019.6,171911,198752 43 | 76,10352.1,9640.5,11102.3,7706.0,7143.2,8301.3,162735,191159 44 | 77,10050.6,9337.0,10804.2,8147.5,7557.1,8771.7,149477,180146 45 | 78,10517.6,9751.2,11328.2,7821.2,7215.7,8464.0,140940,174360 46 | 79,10317.8,9552.4,11128.3,7706.6,7113.7,8335.7,136064,172787 47 | 80,10994.9,10173.1,11865.3,8286.5,7661.2,8949.3,127395,168522 48 | 81,11872.4,10997.3,12798.6,8541.3,7900.6,9220.2,116429,160681 49 | 82,11408.1,10505.3,12367.8,8156.7,7509.0,8845.3,104119,148184 50 | 83,12755.2,11739.0,13835.9,8489.7,7807.0,9216.2,92560,36864 51 | 84,12132.6,11074.6,13264.5,7477.9,6807.2,8196.8,83689,27404 52 | 85,11844.6,10769.5,12997.9,8149.2,7422.2,8928.2,74160,18928 53 | 86,12018.2,10863.5,13262.2,7854.0,7113.0,8651.3,63615,08907 54 | 87,11655.6,10437.2,12977.2,7025.4,6304.5,7806.0,54463,99659 55 | 88,11767.1,10445.0,13210.3,7120.4,6358.1,7949.0,46119,90656 56 | 89,11255.6,9832.9,12826.4,6604.5,5838.7,7442.7,39749,82648 57 | 90,11086.4,9599.5,12738.3,6471.2,5686.2,7334.3,10164,27510 58 | 91,10639.6,9113.2,12348.6,6857.9,6035.7,7760.9,10164,27510 59 | 92,9776.8,7881.5,11990.4,6544.6,5515.9,7709.5,10164,27510 60 | 93,8373.6,6253.8,10980.9,6392.4,5206.7,7767.4,10164,27510 61 | 94,8805.0,6345.9,11901.8,5705.5,4480.0,7162.7,10164,27510 62 | 95,8148.1,5608.8,11443.0,4148.3,3048.0,5516.3,10164,27510 63 | 96,7142.9,4476.4,10814.4,3415.2,2336.0,4821.2,10164,27510 64 | 97,5357.1,2768.1,9357.9,5020.9,3516.6,6951.1,10164,27510 65 | 98,4929.6,1981.9,10156.8,4212.5,2670.3,6320.8,10164,27510 66 | 99,8000.0,3453.8,15763.2,4134.4,2363.1,6714.0,10164,27510 67 | 100,13114.8,5662.0,25841.3,6227.1,3627.5,9970.2,10164,27510 68 | 
-------------------------------------------------------------------------------- /como/csv/england_gender_age.csv: -------------------------------------------------------------------------------- 1 | age,male,female,total 2 | 0,327309,310525,637834 3 | 1,338368,321522,659890 4 | 2,349229,331803,681032 5 | 3,349199,331559,680758 6 | 4,352148,335065,687213 7 | 5,360688,342703,703391 8 | 6,370995,354215,725210 9 | 7,363496,346678,710174 10 | 8,356965,340812,697777 11 | 9,351790,335524,687314 12 | 10,355145,337926,693071 13 | 11,344574,326534,671108 14 | 12,337076,321037,658113 15 | 13,322996,307963,630959 16 | 14,317980,302888,620868 17 | 15,309945,293801,603746 18 | 16,305110,288845,593955 19 | 17,313367,298838,612205 20 | 18,324403,305833,630236 21 | 19,337046,319387,656433 22 | 20,343991,325703,669694 23 | 21,357200,337190,694390 24 | 22,363315,340570,703885 25 | 23,364520,345682,710202 26 | 24,376642,357841,734483 27 | 25,377103,361973,739076 28 | 26,385487,377446,762933 29 | 27,397538,383724,781262 30 | 28,391623,377887,769510 31 | 29,384229,378914,763143 32 | 30,385964,385662,771626 33 | 31,372655,379770,752425 34 | 32,380880,379553,760433 35 | 33,380805,379662,760467 36 | 34,369422,373224,742646 37 | 35,370811,376544,747355 38 | 36,369418,376552,745970 39 | 37,372822,378172,750994 40 | 38,373242,378447,751689 41 | 39,359561,361914,721475 42 | 40,336044,336944,672988 43 | 41,329983,331604,661587 44 | 42,334321,338908,673229 45 | 43,340409,345283,685692 46 | 44,345749,351339,697088 47 | 45,360708,365570,726278 48 | 46,375568,380534,756102 49 | 47,384636,394820,779456 50 | 48,375401,384788,760189 51 | 49,383381,393836,777217 52 | 50,382860,393074,775934 53 | 51,389406,396933,786339 54 | 52,386668,398868,785536 55 | 53,388054,399900,787954 56 | 54,384446,395242,779688 57 | 55,375450,385802,761252 58 | 56,367033,375868,742901 59 | 57,353738,362172,715910 60 | 58,337972,347088,685060 61 | 59,329177,339029,668206 62 | 60,321653,331290,652943 63 | 61,308159,318785,626944 64 | 62,296626,306696,603322 65 | 63,284029,296404,580433 66 | 64,282719,298013,580732 67 | 65,278642,292661,571303 68 | 66,268184,285237,553421 69 | 67,268738,286641,555379 70 | 68,271970,293101,565071 71 | 69,278608,298811,577419 72 | 70,292024,315080,607104 73 | 71,315843,339855,655698 74 | 72,240337,261886,502223 75 | 73,230038,252895,482933 76 | 74,226476,250366,476842 77 | 75,205959,232313,438272 78 | 76,179879,206575,386454 79 | 77,158067,184848,342915 80 | 78,161540,190485,352025 81 | 79,156343,187117,343460 82 | 80,147733,179519,327252 83 | 81,135514,168751,304265 84 | 82,123492,158153,281645 85 | 83,112133,147147,259280 86 | 84,98000,133314,231314 87 | 85,87528,123624,211152 88 | 86,79030,116217,195247 89 | 87,69067,106926,175993 90 | 88,58264,95694,153958 91 | 89,47498,81854,129352 92 | 90,157788,341488,499276 93 | -------------------------------------------------------------------------------- /como/csv/india_condition_prevalence.csv: -------------------------------------------------------------------------------- 1 | Condition,Start Age,End Age,Prevalence,Source 2 | COPD,0,5,0.1,"GBD" 3 | COPD,5,10,0.2 4 | COPD,10,15,0.3 5 | COPD,15,20,0.4 6 | COPD,20,25,0.55 7 | COPD,25,30,0.8 8 | COPD,30,35,1.35 9 | COPD,35,40,2.15 10 | COPD,40,45,3.55 11 | COPD,45,50,6.05 12 | COPD,50,55,9.65 13 | COPD,55,60,14.25 14 | COPD,60,65,19 15 | COPD,65,70,23.1 16 | COPD,70,75,26.35 17 | COPD,75,80,28.1 18 | COPD,80,100,28.35 19 | ASTHMA,0,5,1.35,"GBD" 20 | ASTHMA,5,10,2.4 21 | ASTHMA,10,15,1.75 22 | ASTHMA,15,20,1.35 23 | 
ASTHMA,20,25,1.2 24 | ASTHMA,25,30,1.5 25 | ASTHMA,30,35,1.95 26 | ASTHMA,35,40,2.35 27 | ASTHMA,40,45,2.85 28 | ASTHMA,45,50,3.6 29 | ASTHMA,50,55,4.65 30 | ASTHMA,55,60,6 31 | ASTHMA,60,65,7.5 32 | ASTHMA,65,70,9.4 33 | ASTHMA,70,75,10.9 34 | ASTHMA,75,80,11.25 35 | ASTHMA,80,100,10.9 36 | 37 | -------------------------------------------------------------------------------- /como/csv/ny_cummings.csv: -------------------------------------------------------------------------------- 1 | age,male,bp_high,diabetes_contr,chronic_heart_dz,chronic_resp_dz 2 | 1.31,1.13,1.58,1.31,1.76,2.94 3 | -------------------------------------------------------------------------------- /como/csv/ny_hr.csv: -------------------------------------------------------------------------------- 1 | age,hr_bp_high,hr_diabetes_uncontr,hr_chronic_heart_dz,hr_kidney_dz,hr_chronic_resp_dz 2 | 20,1.64,48.19,0.00,0.60,0.00 3 | 30,1.81,8.48,0.17,1.14,0.12 4 | 40,1.67,3.90,0.68,1.06,0.65 5 | 50,1.38,2.81,1.01,1.13,0.95 6 | 60,1.13,2.15,0.90,0.54,1.02 7 | 70,1.04,1.87,0.82,0.26,1.04 8 | 80,1.04,1.87,0.82,0.26,1.04 9 | 90,1.04,1.87,0.82,0.26,1.04 10 | -------------------------------------------------------------------------------- /como/csv/nystate_age_comorbid_05082020.csv: -------------------------------------------------------------------------------- 1 | Age Range,Hypertension,Diabetes,Hyperlipidemia,Dementia,Cronoary Artery Disease,Renal Disease,COPD,Atrial Fibrillation,Cancer,Stroke,Fatality Count 2 | Total,11517,7572,4445,2643,2476,2248,1880,1599,1527,1402,21045 3 | age 0-9,1,0,0,0,0,0,0,0,0,0,3 4 | age 10-19,0,1,0,0,0,0,0,0,0,0,8 5 | age 20-29,11,20,3,0,0,3,0,0,3,2,74 6 | age 30-39,65,73,15,0,1,20,2,2,5,4,287 7 | age 40-49,250,223,66,2,19,73,12,8,18,22,748 8 | age 50-59,942,800,321,27,133,210,103,43,71,79,2034 9 | age 60-69,2352,1768,896,152,451,504,332,183,300,286,4147 10 | age 70-79,3331,2365,1381,567,746,633,590,415,453,453,5536 11 | age 80-89,3138,1749,1227,1062,815,585,570,611,487,408,5415 12 | age 90+,1426,572,535,831,311,220,271,337,190,148,2784 13 | Unknown,1,1,1,2,0,0,0,0,0,0,9 -------------------------------------------------------------------------------- /como/csv/nystate_or.csv: -------------------------------------------------------------------------------- 1 | age,bp_high,diabetes_uncontr,chronic_heart_dz,kidney_dz,chronic_resp_dz 2 | 20-29,1.642935378,48.19277108,0,0.6024096386,0 3 | 30-39,1.805650924,8.47630719,0.171998624,1.14379085,0.116713352 4 | 40-49,1.6687759,3.90086477,0.6750675068,1.058855886,0.6497524752 5 | 50-59,1.384627488,2.811610607,1.010526316,1.12890923,0.949742778 6 | 60-69,1.129405312,2.153825806,0.9003444145,0.5370415748,1.024088434 7 | 70-79,1.043533767,1.874224294,0.8249186539,0.2574051725,1.039019679 8 | -------------------------------------------------------------------------------- /como/csv/uk_condition_prevalence.csv: -------------------------------------------------------------------------------- 1 | condition,start age, end age, prevalence, source 2 | Diabetes,16,24,0.0085,, 3 | ,25,34,0.012,, 4 | ,35,44,0.0180,, 5 | ,45,54,0.0480,, 6 | ,55,64,0.0725,, 7 | ,65,74,.1265,, 8 | ,75,100,.1205,, 9 | Diabetes (2),16,44,0.007,, 10 | ,45,64,0.088,, 11 | ,65,100,0.139,, 12 | Diabetes (2a),16,44,0.003,, 13 | ,45,64,0.032,, 14 | ,65,100,0.040,, 15 | Asthma,16,24,0.18,, 16 | ,25,34,0.21,, 17 | ,35,44,0.19,, 18 | ,45,54,0.17,, 19 | ,55,64,0.15,, 20 | ,65,74,0.15,, 21 | ,75,100,0.13,, 22 | Hypertension (1),16,24,0,, 23 | ,25,44,0.01,, 24 | ,35,64,0.11,, 25 | ,65,100,0.29,, 26 | Hypertension 
(2),20,39,0.093,, 27 | ,40,59,0.2790,, 28 | ,60,100,0.6370,, 29 | Hypertension (2a),20,39,0.08, 30 | ,40,59,0.1630,, 31 | ,60,100,0.3020,, 32 | Hypertension (3),16,24,0.00,,NHS-hyp.controlled 33 | ,25,34,0.007,, 34 | ,35,44,0.01,, 35 | ,45,54,0.087,, 36 | ,55,64,0.127,, 37 | ,65,74,0.267,, 38 | ,75,100,0.325,, 39 | Hypertension (3a),16,24,0.026,,NHS-hyp.all 40 | ,25,34,0.069,, 41 | ,35,44,0.122,, 42 | ,45,54,0.238,, 43 | ,55,64,0.409,, 44 | ,65,74,0.581,, 45 | ,75,100,0.660,, 46 | Hypertension (3b),16,24,0.026,,NHS-hyp.uncontrolled+untreated 47 | ,25,34,0.062,, 48 | ,35,44,0.112,, 49 | ,45,54,0.151,, 50 | ,55,64,0.283,, 51 | ,65,74,0.254,, 52 | ,75,100,0.335,, 53 | COPD,31,40,0.0009,, 54 | ,41,50,0.0069,, 55 | ,51,60,0.0223,, 56 | ,61,70,0.0539,, 57 | ,71,80,0.0834,, 58 | ,81,100,0.0800,, 59 | Obesity class 1-2,16,24,.116, 60 | ,25,34,.184, 61 | ,35,44,.241, 62 | ,45,54,.308, 63 | ,55,64,.310, 64 | ,65,74,.289, 65 | ,75,100,.264, 66 | Obesity class 3,16,24,.019, 67 | ,25,34,.028, 68 | ,35,44,.032, 69 | ,45,54,.043, 70 | ,55,64,.038, 71 | ,65,74,.033, 72 | ,75,100,.017, 73 | -------------------------------------------------------------------------------- /como/csv/uk_condition_sd.csv: -------------------------------------------------------------------------------- 1 | condition, start age, end age,lower, upper, mean 2 | sd_obese_1_2,15,24,.091,.146,.116 3 | ,25,34,.16,.21,.184 4 | ,35,44,.215,.269,.241 5 | ,45,54,.278,.339,.308 6 | ,55,64,.281,.341,.310 7 | ,65,74,.263,.316,.289 8 | ,75,100,.230,.301,.264 9 | sd_obese_3,16,24,.011,.035,.019 10 | ,25,34,.018,.042,.028 11 | ,35,44,.024,.044,.032 12 | ,45,54,.032,.058,.043 13 | ,55,64,.027,.054,.038 14 | ,65,74,.024,.046,.033 15 | ,75,100,.01,.031,.017 16 | sd_bp_high,16,24,.012,.057,.026 17 | ,25,34,.044,.107,.062 18 | ,35,44,.095,.157,.112 19 | ,45,54,.205,.276,.151 20 | ,55,64,.369,.451,.283 21 | ,65,74,.545,.617,.254 22 | ,75,100,.618,.700,.335 23 | diabetes_uncontr,16,44, 24 | ,45,64, 25 | ,65,100, 26 | diabetes_contr,16,44, 27 | ,45,64, 28 | ,65,100, 29 | 30 | -------------------------------------------------------------------------------- /como/csv/uk_demography.csv: -------------------------------------------------------------------------------- 1 | age,uk_pop 2 | 0-4,3857263 3 | 5-9,4149852 4 | 10-14,3953866 5 | 15-19,3656968 6 | 20-24,4153080 7 | 25-29,4514249 8 | 30-34,4497132 9 | 35-39,4395667 10 | 40-44,4019539 11 | 45-49,4402122 12 | 50-54,4661015 13 | 55-59,4405908 14 | 60-64,3755185 15 | 65-69,3368199 16 | 70-74,3318867 17 | 75-79,2325296 18 | 80-84,1715328 19 | 85-89,1042090 20 | 90+,605181 21 | -------------------------------------------------------------------------------- /como/csv/uk_nhs_hazard_ratios.csv: -------------------------------------------------------------------------------- 1 | variable,HR_age_sex,HR_age_sex_low,HR_age_sex_up,HR_full,HR_full_low,HR_full_up,HR_full_ec,HR_full_low_ec,HR_full_up_ec 2 | age18_40,0.05,0.04,0.08,0.06,0.04,0.08,0.08,0.05,0.13 3 | age40_50,0.27,0.21,0.34,0.30,0.25,0.36,0.31,0.21,0.46 4 | age50_60,1,1,1,1,1,1,1,1,1 5 | age60_70,2.61,2.29,2.96,2.40,2.16,2.66,2.35,1.9,2.91 6 | age70_80,7.61,6.78,8.54,6.08,5.52,6.69,5.55,4.54,6.77 7 | age80_,26.27,23.52,29.33,20.61,18.72,22.7,13.43,10.95,16.45 8 | female,1,1,1,1,1,1,1,1,1 9 | male,2.24,2.12,2.36,1.59,1.53,1.65,2.18,1.99,2.38 10 | bmi_not_obese,1,1,1,1,1,1,1,1,1 11 | bmi_obeseI,1.57,1.47,1.68,1.05,1.00,1.11,1.39,1.25,1.54 12 | bmi_obeseII,2.01,1.82,2.21,1.40,1.30,1.52,1.62,1.39,1.9 13 | bmi_obeseIII,2.97,2.62,3.38,1.92,1.72,2.13,2.45,2,3.01 
14 | obese_1_2,1.69,1.57,1.83,1.15,1.08,1.22,1.45,1.29,1.64 15 | obese_3,2.97,2.62,3.38,1.92,1.72,2.13,2.45,2,3.01 16 | bp_not_high,1,1,1,1,1,1,1,1,1 17 | bp_high,1.22,1.15,1.3,0.89,0.85,0.93,0.94,0.85,1.05 18 | chronic_resp_dz,2.35,2.21,2.5,1.63,1.55,1.71,1.97,1.77,2.18 19 | asthma_no_ocs,1.23,1.14,1.33,0.99,0.93,1.05,1.14,1.01,1.29 20 | asthma_ocs,1.7,1.48,1.96,1.13,1.01,1.26,1.39,1.12,1.73 21 | chronic_heart_dz,2.01,1.9,2.13,1.17,1.12,1.22,1.33,1.22,1.46 22 | diabetes_contr,2.02,1.89,2.16,1.31,1.24,1.37,1.48,1.33,1.65 23 | diabetes_uncontr,3.61,3.34,3.9,1.95,1.83,2.07,2.57,2.27,2.91 24 | diabetes_no_measure,2.35,2.04,2.7,1.90,1.71,2.09,1.68,1.33,2.12 25 | cancer_non_haem_1,1.83,1.51,2.21,1.72,1.50,1.97,1.51,1.1,2.05 26 | cancer_non_haem_1_5,1.39,1.22,1.58,1.15,1.05,1.27,1.36,1.13,1.65 27 | cancer_non_haem_5,1.03,0.94,1.12,0.96,0.91,1.03,0.92,0.79,1.06 28 | haem_malig_1,4.03,2.76,5.88,2.82,2.09,3.81,2.6,1.3,5.22 29 | haem_malig_1_5,3.59,2.88,4.48,2.47,2.05,2.96,3.67,2.66,5.06 30 | haem_malig_5,2.13,1.76,2.59,1.62,1.39,1.88,1.64,1.18,2.28 31 | liver_dz,2.34,1.94,2.83,1.75,1.51,2.03,1.86,1.4,2.47 32 | stroke_dementia,2.34,2.18,2.51,2.16,2.06,2.27,1.61,1.43,1.81 33 | neuro_other,2.94,2.62,3.3,2.58,2.38,2.79,2.28,1.88,2.76 34 | kidney_dz,2.19,2.06,2.32,1.42,1.36,1.50,1.75,1.58,1.92 35 | organ_transplant,7.79,5.88,10.33,3.55,2.79,4.52,2.62,1.51,4.57 36 | spleen_dz,1.82,1.21,2.74,1.34,0.98,1.83,1.87,1.06,3.3 37 | autoimmune_dz,1.35,1.24,1.48,1.19,1.11,1.27,1.31,1.14,1.51 38 | immuno_other_dz,2.02,1.45,2.81,1.70,1.34,2.16,2.01,1.25,3.25 39 | -------------------------------------------------------------------------------- /como/csv/uk_nhs_incidence.csv: -------------------------------------------------------------------------------- 1 | condition,prevalence 2 | age18_40,34.4 3 | age40_50,16.5 4 | age50_60,17.6 5 | age60_70,13.8 6 | age70_80,11.2 7 | age80_,6.5 8 | female,46.0 9 | male,49.9 10 | bmi_not_obese,56.5 11 | bmi_obeseI,13.8 12 | bmi_obeseII,5.3 13 | bmi_obeseIII,2.7 14 | obese_1_2,19.1 15 | obese_3,2.7 16 | bp_not_high,65.8 17 | bp_high,34.2 18 | chronic_resp_dz,4.1 19 | asthma_no_ocs,14.2 20 | asthma_ocs,1.7 21 | chronic_heart_dz,6.7 22 | diabetes_contr,6.0 23 | diabetes_uncontr,2.8 24 | diabetes_no_measure,1.1 25 | cancer_non_haem_1,0.13 26 | cancer_non_haem_1_5,0.10 27 | cancer_non_haem_5,0.10 28 | haem_malig_1,0.31 29 | haem_malig_1_5,0.29 30 | haem_malig_5,0.16 31 | liver_dz,0.70 32 | stroke_dementia,2.1 33 | neuro_other,1.0 34 | kidney_dz,6.3 35 | organ_transplant,0.1 36 | spleen_dz,0.2 37 | autoimmune_dz,5.1 38 | immuno_other_dz,1.6 39 | -------------------------------------------------------------------------------- /como/csv/weighted_hrs.txt: -------------------------------------------------------------------------------- 1 | OpenSAFELY reports HRs for Obesity 1 & 2 separately. We need them 2 | together to match prevalence. We take prevalence-weighted mean of 3 | hazard ratios. 
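For concreteness, each combined HR below is the prevalence-weighted mean of the
two component HRs (and the CI bounds combine the same way). A quick check in
Stata using the values listed below:

  display (0.138*1.05 + 0.053*1.40) / (0.138 + 0.053)   // ~1.15 (obesity 1-2)
  display (0.058*1.33 + 0.005*2.52) / (0.058 + 0.005)   // ~1.42 (kidney)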
4 | 5 | Obesity 1: prevalence: 0.138, hazard ratio: 1.05 (1.00-1.11) 6 | Obesity 2: prevalence: 0.053, hazard ratio: 1.40 (1.30-1.52) 7 | 8 | Combined: 1.15 (1.08 - 1.22) 9 | 10 | Kidney mild: 0.058, HR: 1.33 (1.28-1.40) 11 | Kidney severe: 0.005, HR: 2.52 (2.33-2.72) 12 | 13 | Combined: 1.42 (1.36-1.50) 14 | 15 | -------------------------------------------------------------------------------- /como/e/explore_gbd_vs_dlhs.do: -------------------------------------------------------------------------------- 1 | use $tmp/india_models, clear 2 | 3 | sort age 4 | twoway /// 5 | (line diabetes_uncontr age) (scatter gbd_diabetes age) 6 | graphout x 7 | -------------------------------------------------------------------------------- /como/e/hr_vs_or.do: -------------------------------------------------------------------------------- 1 | /* import hazard ratios from NHS study, fully adjusted model */ 2 | use $tmp/uk_nhs_hazard_ratios_flat_hr_fully_adj, clear 3 | 4 | global varlist $age_vars $hr_biomarker_vars $hr_gbd_vars 5 | 6 | /* define reference group mortality -- 50--60 year olds */ 7 | global r = 355 / 3068883 8 | 9 | /* rename hazard ratio vars for consistency */ 10 | foreach condition in $comorbid_vars { 11 | ren `condition'_hr_fully_adj hr_`condition' 12 | } 13 | 14 | /* calculate relative risk for each condition from hazard ratio */ 15 | foreach condition in $comorbid_vars { 16 | gen rr_`condition' = (1 - exp(hr_`condition' * ln(1 - ${r}))) / ${r} 17 | } 18 | 19 | /* calculate odds ratios from relative risk */ 20 | foreach condition in $comorbid_vars { 21 | gen or_`condition' = (rr_`condition' * (1 + ${r})) / (1 - rr_`condition' * ${r}) 22 | } 23 | 24 | /* check we got it right by recalculating rr from or */ 25 | foreach condition in $comorbid_vars { 26 | gen rr2_`condition' = or_`condition' / (1 - $r + $r * or_`condition') 27 | gen diff = rr_`condition' / rr2_`condition' 28 | assert inrange(diff, .999, 1.001) 29 | drop diff 30 | } 31 | drop rr2* 32 | 33 | /* check out the comparison */ 34 | foreach condition in $comorbid_vars { 35 | list or_`condition' rr_`condition' hr_`condition' 36 | } 37 | 38 | 39 | /* reshape to wide on different stats */ 40 | reshape long hr or rr, string i(v1) j(stat) 41 | drop v1 42 | 43 | /* round results to 3 digits */ 44 | foreach v in hr or rr { 45 | replace `v' = round(`v', .001) 46 | format `v' %6.3f 47 | } 48 | 49 | /* list results */ 50 | list 51 | -------------------------------------------------------------------------------- /como/e/summarize_india_conditions.do: -------------------------------------------------------------------------------- 1 | /**************************************************************/ 2 | /* explore different risk factors across the age distribution */ 3 | /**************************************************************/ 4 | use $tmp/combined, clear 5 | collapse (mean) risk_factor_* [aw=wt], by(age) 6 | 7 | keep if age < 85 8 | sort age 9 | save $tmp/foo, replace 10 | 11 | /* 1. compare continuous age distributions to discrete to confirm they are ok */ 12 | twoway (line risk_factor_simple_cts age) (line risk_factor_simple age), yscale(log) ylabel(0.1 0.5 1 2 5 10 50) 13 | graphout simple_comp 14 | 15 | twoway (line risk_factor_full_cts age) (line risk_factor_full age), yscale(log) ylabel(0.1 0.5 1 2 5 10 50) 16 | graphout full_comp 17 | 18 | /* 2. 
compare fully adjusted, age-sex, comorbid conditions only */ 19 | twoway (line risk_factor_full_cts age, lwidth(medthick)) (line risk_factor_simple_cts age, lwidth(medthick)) , yscale(log) ylabel(0.1 0.5 1 2 5 10 50) 20 | graphout risk_factors 21 | 22 | /* 3. compare the discrete graphs */ 23 | twoway (line risk_factor_full age, lwidth(medthick)) (line risk_factor_simple age, lwidth(medthick)) , yscale(log) ylabel(0.1 0.5 1 2 5 10 50) 24 | graphout risk_factors_discrete 25 | 26 | twoway (line risk_factor_age_weird age, lwidth(medthick)) (line risk_factor_full_cts age, lwidth(medthick)) (line risk_factor_full age, lwidth(medthick)) , yscale(log) ylabel(0.1 0.5 1 2 5 10 50) 27 | graphout risk_factors_full_agesex_part_only 28 | 29 | /* review some results */ 30 | sum risk_factor* if age == 20, d 31 | sum risk_factor* if age == 65, d 32 | sum risk_factor* if age == 20 & male == 1, d 33 | sum risk_factor* if age == 65 & male == 1, d 34 | sum risk_factor* if age == 65 & male == 0, d 35 | 36 | /********************************/ 37 | /* create a fast sample dataset */ 38 | /********************************/ 39 | use $tmp/combined, clear 40 | keep if uniform() < .1 41 | save $tmp/combined_short, replace 42 | 43 | 44 | 45 | /* run some HR comparisons [obsolete i think] */ 46 | use $tmp/combined, clear 47 | 48 | /* compare discrete vs. continuous risk factors */ 49 | keep hr_age_*_age* hr_full*age* age 50 | 51 | /* create combined discrete age factors */ 52 | gen hr_age_discrete_full = hr_full_age18_40 * hr_full_age40_50 * hr_full_age50_60 * hr_full_age60_70 * hr_full_age70_80 * hr_full_age80_ 53 | gen hr_age_discrete_age_sex = hr_age_sex_age18_40 * hr_age_sex_age40_50 * hr_age_sex_age50_60 * hr_age_sex_age60_70 * hr_age_sex_age70_80 * hr_age_sex_age80_ 54 | 55 | gen ln_d_full = ln(hr_age_discrete_full) 56 | gen ln_d_age_sex = ln(hr_age_discrete_age_sex) 57 | gen ln_c_full = ln(hr_full_age_cts) 58 | gen ln_c_age_sex = ln(hr_age_sex_age_cts) 59 | 60 | binscatter ln_d_full ln_c_full age, linetype(none) xq(age) 61 | graphout hr_comp_full 62 | 63 | binscatter ln_d_age_sex ln_c_age_sex age, linetype(none) xq(age) legend(off) 64 | graphout hr_comp_age_sex 65 | -------------------------------------------------------------------------------- /como/e/test_cts_ors.do: -------------------------------------------------------------------------------- 1 | import delimited uk_age_predicted_or.csv, clear 2 | 3 | /* see how well the bin means line up */ 4 | replace or_simple = exp(or_simple) 5 | replace or_full = exp(or_full) 6 | 7 | sum or* if inrange(age, 18, 39) 8 | sum or* if inrange(age, 40, 49) 9 | sum or* if inrange(age, 50, 59) 10 | sum or* if inrange(age, 60, 69) 11 | sum or* if inrange(age, 70, 79) 12 | sum or* if inrange(age, 80, 85) 13 | -------------------------------------------------------------------------------- /como/e/test_map.do: -------------------------------------------------------------------------------- 1 | use ~/iec/output/pn/test, clear 2 | 3 | /* test by making kerala (32) / rajasthan (8) into outliers */ 4 | replace rf_conditions = 5 if pc11_state_name == "kerala" 5 | replace rf_conditions = -1 if pc11_state_name == "rajasthan" 6 | 7 | ren pc11_state_id pc11_s_id 8 | 9 | /* save the temp dataset for merging the values to the geodataset */ 10 | save $tmp/test.dta, replace 11 | 12 | /* convert the shapefile into a geodatabase */ 13 | shp2dta using $iec1/gis/pc11/pc11-state, database($tmp/state_db) coordinates($tmp/state_coord) genid(geo_id) replace 14 | 15 | /* use the created database, it 
is the one that the map can be created from */ 16 | use $tmp/state_db, clear 17 | 18 | /* merge wiith the */ 19 | merge 1:1 pc11_s_id using $tmp/test.dta 20 | 21 | cap destring pc11_s_id, replace 22 | 23 | /* test blank map by state */ 24 | spmap using $tmp/state_coord, id(geo_id) 25 | graphout blank_map 26 | 27 | /* heatmap conditions by state */ 28 | spmap rf_conditions using $tmp/state_coord, id(geo_id) 29 | graphout heatmap 30 | -------------------------------------------------------------------------------- /como/make_como.do: -------------------------------------------------------------------------------- 1 | /*********************/ 2 | /* data construction */ 3 | /*********************/ 4 | 5 | /* get continuous fit to UK age hazard ratios */ 6 | //shell matlab $ccode/como/b/fit_cts_uk_age_hr.m 7 | 8 | /* combine DLHS and AHS */ 9 | do $ccode/como/b/prep_health_data.do 10 | 11 | /* prepare global burden of disease data */ 12 | do $ccode/como/b/prep_gbd.do 13 | 14 | /* calculate risk factors */ 15 | do $ccode/como/b/prep_india_comorbidities.do 16 | 17 | /* create an age-level dataset with England condition prevalence */ 18 | do $ccode/como/b/prep_england_prevalence.do 19 | 20 | /* create a clean set of files with relative risks */ 21 | do $ccode/como/b/prep_hrs.do 22 | 23 | /* prep NY odds ratios of death */ 24 | do $ccode/como/b/prep_ny_mortality.do 25 | 26 | /* prep india and UK sex ratios and populations */ 27 | do $ccode/como/b/prep_pop_sex.do 28 | 29 | /* create age-level datasets for HR, prevalence, population, all with identical structures */ 30 | /* THIS CREATES THE MAIN ANALYSIS FILE */ 31 | do $ccode/como/b/prep_age_level_data.do 32 | 33 | /* create prevalence standard errors for bootstraps */ 34 | do $ccode/como/b/prep_standard_errors.do 35 | 36 | /* calculate population relative risks and death distributions for england / india */ 37 | do $ccode/como/a/calc_prrs.do 38 | 39 | /************/ 40 | /* analysis */ 41 | /************/ 42 | 43 | /* prepare data for England / India prevalence comparison */ 44 | do $ccode/como/a/prep_eng_india_prev_compare.do 45 | 46 | /* calculate summary statistics and prevalences */ 47 | // do $ccode/como/a/sumstats.do 48 | 49 | /**********************/ 50 | /* figures and tables */ 51 | /**********************/ 52 | 53 | /* create tables for main text and appendix*/ 54 | do $ccode/como/a/make_paper_tables.do 55 | 56 | /* create figures */ 57 | do $ccode/como/a/make_paper_figures.do 58 | 59 | 60 | /************/ 61 | /* appendix */ 62 | /************/ 63 | 64 | /* app figure: hr interpolations */ 65 | do $ccode/como/a/app_age_hr_interpolation.do 66 | 67 | /* run sensitivity tests for sampling error in HRs */ 68 | do $ccode/como/a/calc_hr_sensitivity.do 69 | 70 | /* run sensitivity tests for sampling error in prevalences */ 71 | do $ccode/como/a/calc_prev_sensitivity.do 72 | 73 | /* sensitivity to joint conditions */ 74 | do $ccode/como/a/app_joint_condition.do 75 | -------------------------------------------------------------------------------- /como/r/covid_como_sumstats.csv: -------------------------------------------------------------------------------- 1 | uk_male_risk,1.48 2 | india_male_risk,1.50 3 | male_ratio_sign,+ 4 | male_ratio,1.35 5 | uk_obese_1_2_risk,1.09 6 | india_obese_1_2_risk,1.01 7 | obese_1_2_ratio_sign, 8 | obese_1_2_ratio,-6.73 9 | uk_obese_3_risk,1.04 10 | india_obese_3_risk,1.01 11 | obese_3_ratio_sign, 12 | obese_3_ratio,-3.31 13 | uk_bp_high_risk,0.99 14 | india_bp_high_risk,0.99 15 | bp_high_ratio_sign,+ 16 | 
bp_high_ratio,0.09 17 | uk_diabetes_uncontr_risk,1.03 18 | india_diabetes_uncontr_risk,1.11 19 | diabetes_uncontr_ratio_sign,+ 20 | diabetes_uncontr_ratio,8.12 21 | uk_diabetes_contr_risk,1.03 22 | india_diabetes_contr_risk,1.01 23 | diabetes_contr_ratio_sign, 24 | diabetes_contr_ratio,-2.44 25 | uk_asthma_ocs_risk,1.02 26 | india_asthma_ocs_risk,1.01 27 | asthma_ocs_ratio_sign, 28 | asthma_ocs_ratio,-1.50 29 | uk_autoimmune_dz_risk,1.01 30 | india_autoimmune_dz_risk,1.00 31 | autoimmune_dz_ratio_sign, 32 | autoimmune_dz_ratio,-0.51 33 | uk_haem_malig_1_risk,1.01 34 | india_haem_malig_1_risk,1.00 35 | haem_malig_1_ratio_sign, 36 | haem_malig_1_ratio,-0.92 37 | uk_cancer_non_haem_1_risk,1.03 38 | india_cancer_non_haem_1_risk,1.00 39 | cancer_non_haem_1_ratio_sign, 40 | cancer_non_haem_1_ratio,-2.35 41 | uk_chronic_heart_dz_risk,1.03 42 | india_chronic_heart_dz_risk,1.01 43 | chronic_heart_dz_ratio_sign, 44 | chronic_heart_dz_ratio,-1.89 45 | uk_chronic_resp_dz_risk,1.02 46 | india_chronic_resp_dz_risk,1.04 47 | chronic_resp_dz_ratio_sign,+ 48 | chronic_resp_dz_ratio,2.34 49 | uk_immuno_other_dz_risk,1.00 50 | india_immuno_other_dz_risk,1.00 51 | immuno_other_dz_ratio_sign,+ 52 | immuno_other_dz_ratio,0.02 53 | uk_kidney_dz_risk,1.08 54 | india_kidney_dz_risk,1.09 55 | kidney_dz_ratio_sign,+ 56 | kidney_dz_ratio,0.76 57 | uk_liver_dz_risk,1.02 58 | india_liver_dz_risk,1.03 59 | liver_dz_ratio_sign,+ 60 | liver_dz_ratio,1.42 61 | uk_neuro_other_risk,1.00 62 | india_neuro_other_risk,1.00 63 | neuro_other_ratio_sign, 64 | neuro_other_ratio,-0.08 65 | uk_stroke_dementia_risk,1.03 66 | india_stroke_dementia_risk,1.01 67 | stroke_dementia_ratio_sign, 68 | stroke_dementia_ratio,-1.72 69 | uk_health_risk,2.29 70 | india_health_risk,2.10 71 | health_ratio_sign, 72 | health_ratio,-8.25 73 | india_age18_40_mu,50.2 74 | india_age40_50_mu,19.2 75 | india_age50_60_mu,14.3 76 | india_age60_70_mu,10.3 77 | india_age70_80_mu,4.6 78 | india_age80__mu,1.5 79 | india_male_mu,47.1 80 | india_diabetes_uncontr_mu,8.9 81 | india_diabetes_contr_mu,1.7 82 | india_hypertension_both_mu,28.2 83 | india_obese_3_mu,0.4 84 | india_obese_1_2_mu,4.0 85 | uk_age_18_40,36.6 86 | uk_age_40_50,16.3 87 | uk_age_50_60,17.0 88 | uk_age_60_70,13.3 89 | uk_age_70_80,10.4 90 | uk_age_80,6.3 91 | india_gbd_chronic_heart_dz_mu,4.4 92 | india_gbd_chronic_resp_dz_mu,4.8 93 | india_gbd_kidney_dz_mu,9.7 94 | india_gbd_liver_dz_mu,5.3 95 | india_gbd_asthma_ocs_mu,2.5 96 | india_gbd_cancer_non_haem_1_mu,0.3 97 | india_gbd_haem_malig_1_mu,0.0 98 | india_gbd_autoimmune_dz_mu,1.0 99 | india_gbd_immuno_other_dz_mu,0.1 100 | india_gbd_stroke_dementia_mu,1.3 101 | india_gbd_neuro_other_mu,0.0 102 | uk_gbd_chronic_heart_dz_mu,5.9 103 | uk_gbd_chronic_resp_dz_mu,5.4 104 | uk_gbd_kidney_dz_mu,5.6 105 | uk_gbd_liver_dz_mu,2.6 106 | uk_gbd_asthma_ocs_mu,9.2 107 | uk_gbd_cancer_non_haem_1_mu,2.6 108 | uk_gbd_haem_malig_1_mu,0.2 109 | uk_gbd_autoimmune_dz_mu,2.4 110 | uk_gbd_immuno_other_dz_mu,0.1 111 | uk_gbd_stroke_dementia_mu,1.5 112 | uk_gbd_neuro_other_mu,0.1 113 | male,48.9 114 | uk_prev_diabetes_contr,6.4 115 | uk_prev_diabetes_uncontr,2.1 116 | uk_prev_chronic_resp_dz,2.5 117 | uk_prev_hypertension_both,28.0 118 | uk_prev_obese_3,3.1 119 | uk_prev_obese_1_2,24.8 120 | -------------------------------------------------------------------------------- /como/tex/como_exhibits.tex: -------------------------------------------------------------------------------- 1 | \documentclass[12pt,letterpaper]{article} 2 | \setcounter{page}{0} 3 | 4 | 
\usepackage{mathtools} 5 | \usepackage{bbm} 6 | \usepackage[multiple]{footmisc} 7 | \usepackage{floatpag,amsmath,amsthm,amssymb} 8 | \newtheorem{proposition}{Proposition} 9 | \numberwithin{equation}{section} 10 | \newtheorem{nono-prop}{Proposition}[] 11 | 12 | % Figure panel header font 13 | \newcommand{\panel}{\fontfamily{phv}\selectfont\scriptsize\textbf} 14 | \usepackage{amsmath} 15 | \DeclareMathOperator*{\argmin}{arg\,min} 16 | \DeclareMathOperator*{\argmax}{arg\,max} 17 | 18 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 19 | %% LOAD LOCAL COMPILATION PATHS 20 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 21 | 22 | %% DON'T CHANGE ANY OF THESE PATHS. FOR LOCAL COMPILE, EDIT YOUR 23 | %% ~/include.tex ONLY 24 | \newcommand{\HOME}{\string~} 25 | \input{\HOME/include.tex} 26 | 27 | % include standard package 28 | \input{front_matter_como} 29 | 30 | \usepackage{fancyhdr} 31 | \pagestyle{fancy} 32 | \lhead{} 33 | \chead{} 34 | \rhead{\thepage} 35 | \cfoot{} % get rid of the page number 36 | \renewcommand{\headrulewidth}{0pt} 37 | \renewcommand{\footrulewidth}{0pt} 38 | \setlength{\headsep}{24pt} 39 | 40 | % package for color-shared tables 41 | \usepackage[table]{xcolor} 42 | 43 | \usepackage{graphicx} 44 | % disable hyperlinks, which were breaking on appendix references 45 | % \usepackage[options]{nohyperref} 46 | 47 | \title{COVID Comorbidity paper} \author{Nobody} 48 | 49 | %%%%%%%%%%%%%%%%%%%%%% 50 | % NO TITLE PAGE 51 | %%%%%%%%%%%%%%%%%%%%%% 52 | \begin{document} 53 | \date{June 2020} 54 | \maketitle 55 | \clearpage 56 | 57 | \begin{figure} 58 | \centering 59 | \caption{{\footnotesize Prevalence of diabetes, hypertension, and 60 | obesity in India and England.}} 61 | \begin{tabular}{@{}p{0.48\linewidth}@{\quad}p{0.48\linewidth}@{}} 62 | \subfigimg[width=\linewidth]{A) Diabetes}{\covidpath/diabetes.pdf} & 63 | \subfigimg[width=\linewidth]{B) Hypertension}{\covidpath/hypertension.pdf} \\ 64 | \subfigimg[width=\linewidth]{C) Obesity}{\covidpath/obese.pdf} & 65 | \end{tabular} 66 | \end{figure} 67 | 68 | \clearpage 69 | 70 | \begin{figure}[H] 71 | \begin{center} 72 | \caption{Age-specific population relative risk of COVID-19 mortality from all health conditions ($PRR_a$)} 73 | \includegraphics[scale=1.0]{\covidpath/prr_health.pdf} 74 | \end{center} 75 | \end{figure} 76 | 77 | \begin{figure}[H] 78 | \begin{center} 79 | \caption{Comorbidity-specific population relative risk of COVID-19 mortality in India v. 
England} 80 | \includegraphics[scale=0.7]{\covidpath/coefplot.pdf} 81 | \end{center} 82 | \end{figure} 83 | 84 | \begin{figure}[H] 85 | \begin{center} 86 | \caption{Modelled age distribution of COVID-19 mortality} 87 | \includegraphics[scale=1.0]{\covidpath/mort_density_full.pdf} 88 | \end{center} 89 | \end{figure} 90 | 91 | \begin{table}[H] 92 | \begin{center} 93 | \caption{} 94 | \input{\covidpath/covid_como_sumstats} 95 | \end{center} 96 | \end{table} 97 | 98 | \begin{table}[H] 99 | \begin{center} 100 | \caption{} 101 | \input{\covidpath/covid_como_sumhr} 102 | \end{center} 103 | \end{table} 104 | 105 | 106 | \end{document} 107 | 108 | -------------------------------------------------------------------------------- /como/tex/como_tables_figures.tex: -------------------------------------------------------------------------------- 1 | \documentclass[12pt,letterpaper]{article} 2 | \setcounter{page}{0} 3 | 4 | % \usepackage[a4paper,margin=1in,landscape]{geometry} 5 | \usepackage{mathtools} 6 | \usepackage{bbm} 7 | \usepackage[multiple]{footmisc} 8 | \usepackage{floatpag,amsmath,amsthm,amssymb} 9 | \newtheorem{proposition}{Proposition} 10 | \numberwithin{equation}{section} 11 | \newtheorem{nono-prop}{Proposition}[] 12 | 13 | % Figure panel header font 14 | \newcommand{\panel}{\fontfamily{phv}\selectfont\scriptsize\textbf} 15 | \usepackage{amsmath} 16 | \DeclareMathOperator*{\argmin}{arg\,min} 17 | \DeclareMathOperator*{\argmax}{arg\,max} 18 | 19 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 20 | %% LOAD LOCAL COMPILATION PATHS 21 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 22 | 23 | %% DON'T CHANGE ANY OF THESE PATHS. FOR LOCAL COMPILE, EDIT YOUR 24 | %% ~/include.tex ONLY 25 | \newcommand{\HOME}{\string~} 26 | \input{\HOME/include.tex} 27 | 28 | % include standard package 29 | \input{front_matter_como} 30 | 31 | \usepackage{fancyhdr} 32 | \pagestyle{fancy} 33 | \lhead{} 34 | \chead{} 35 | \rhead{\thepage} 36 | \cfoot{} % get rid of the page number 37 | \renewcommand{\headrulewidth}{0pt} 38 | \renewcommand{\footrulewidth}{0pt} 39 | \setlength{\headsep}{24pt} 40 | 41 | % package for color-shared tables 42 | \usepackage[table]{xcolor} 43 | 44 | % disable hyperlinks, which were breaking on appendix references 45 | % \usepackage[options]{nohyperref} 46 | 47 | \title{COVID Comorbidity paper} \author{Nobody} 48 | 49 | %%%%%%%%%%%%%%%%%%%%%% 50 | % NO TITLE PAGE 51 | %%%%%%%%%%%%%%%%%%%%%% 52 | \begin{document} 53 | \date{June 2020} 54 | % \maketitle 55 | 56 | \section{Figures and Tables} 57 | 58 | \begin{table}[H] 59 | \begin{center} 60 | \caption{condition prevalences } 61 | %\input{\covidpath/app_table_age_bin_prev} 62 | \input{\covidpath/covid_como_agerisks.tex} 63 | 64 | \footnotesize{[table note]} 65 | \end{center} 66 | \end{table} 67 | 68 | \begin{table}[H] 69 | \begin{center} 70 | \caption{Prevalence of Conditions in Population and in OpenSAFELY} 71 | %\input{\covidpath/app_table_os_vs_nhs} 72 | \input{\covidpath/covid_como_oscompare.tex} 73 | 74 | \footnotesize{[table note]} 75 | \end{center} 76 | \end{table} 77 | 78 | \clearpage 79 | \begin{figure}[H] 80 | \begin{center} 81 | \caption{Prevalence of Conditions in Population and in OpenSAFELY} 82 | \textbf{Age Interpolation: Fully-Adjusted Model} 83 | 84 | \includegraphics[scale=0.5]{\covidpath/age_interpolation_full} 85 | 86 | \footnotesize{[figure note]} 87 | \end{center} 88 | \end{figure} 89 | 90 | \end{document} 91 | 92 | -------------------------------------------------------------------------------- /como/tex/front_matter_como.tex: 
-------------------------------------------------------------------------------- 1 | \usepackage[latin1]{inputenc} 2 | % \usepackage{lmodern} % keep or kill this?? might affect italics. 3 | \usepackage{setspace} 4 | \usepackage{amsmath} 5 | \usepackage{amsthm} 6 | \usepackage{amsfonts} 7 | \usepackage{longtable} 8 | \addtolength{\textwidth}{5cm} 9 | \addtolength{\textheight}{5cm} 10 | \usepackage{fullpage} 11 | \usepackage{amssymb} 12 | \usepackage[hyperpageref]{backref} 13 | \usepackage[hidelinks]{hyperref} 14 | \usepackage{url} 15 | \usepackage{epstopdf} 16 | \usepackage{multirow} 17 | %\usepackage{array} 18 | %\usepackage{harvard} 19 | \usepackage{tabularx} 20 | %\citationmode{abbr} 21 | 22 | \usepackage{float} 23 | % \usepackage{perpage} 24 | % \MakeSorted{figure} 25 | % \MakeSorted{table} 26 | \usepackage{lscape} 27 | \usepackage{verbatim} 28 | \usepackage{pdflscape} 29 | \usepackage{chngcntr} 30 | \usepackage{appendix} 31 | \usepackage{booktabs,calc} 32 | \usepackage{ulem} 33 | \usepackage{siunitx} 34 | %\sisetup{output-decimal-marker=\cdot} 35 | 36 | % allow yellow highlighting in tables 37 | \usepackage{color,colortbl} 38 | \usepackage{soul} 39 | \definecolor{Yellow}{rgb}{.88,1,.65} 40 | \definecolor{Green}{rgb}{.65,1,.65} 41 | \definecolor{Red}{rgb}{1,.65,.65} 42 | 43 | %\citationstyle{dcu} 44 | 45 | \usepackage[labelfont=bf,center,large,labelsep=newline]{caption} 46 | %\usepackage{subfigure} 47 | % \counterwithout{subtable}{table} 48 | \def\changemargin#1#2{\list{}{\rightmargin#2\leftmargin#1}\item[]} 49 | \let\endchangemargin=\endlist 50 | 51 | % define subscript / superscript commands 52 | \newcommand{\superscript}[1]{\ensuremath{^{\textrm{#1}}}} 53 | \newcommand{\subscript}[1]{\ensuremath{_{\textrm{#1}}}} 54 | 55 | % create a shortcut for newlines in captions: 56 | \newcommand{\cnewline}{\hspace{\linewidth}} 57 | 58 | %format paper to save trees 59 | \usepackage[right=1in,left=1in,top=1in,bottom=1in]{geometry} 60 | \usepackage{savetrees} 61 | 62 | %AER style headers 63 | \def\thesection{\arabic{section}} 64 | \def\thesubsection {\thesection.\arabic{subsection}} 65 | 66 | % set home path 67 | % \newcommand{\HOME}{\string~} 68 | 69 | \newcommand{\subfigimg}[3][,]{% 70 | \setbox1=\hbox{\includegraphics[#1]{#3}}% Store image in box 71 | \leavevmode\rlap{\usebox1}% Print image 72 | \rlap{\hspace*{90pt}\raisebox{\dimexpr\ht1+0.9\baselineskip}{\colorbox{white}{{\footnotesize#2}}}}% Print label 73 | \phantom{\usebox1}% Insert appropriate spcing 74 | } 75 | -------------------------------------------------------------------------------- /e/compare_hosp_counts.do: -------------------------------------------------------------------------------- 1 | /********************************************************************************************/ 2 | /* COPY CODE FROM HOSPITAL ESTIMATES TO GET DLHS LINKED WITH PC, AND SCALE UP PC BED COUNTS */ 3 | /********************************************************************************************/ 4 | 5 | /* combine DLHS, Population Census, Economic Census, to estimate hospital 6 | capacity at the district and subdistrict level. 
*/ 7 | 8 | /* merge DLHS, PC, EC together at district level */ 9 | use $covidpub/hospitals/dlhs4_hospitals_dist.dta, clear 10 | merge 1:1 pc11_state_id pc11_district_id using $covidpub/hospitals/ec_hospitals_dist.dta, gen(_m_ec13) 11 | merge 1:1 pc11_state_id pc11_district_id using $covidpub/hospitals/pc_hospitals_dist.dta, gen(_m_pc11) 12 | 13 | /* drop if missing pc11 ids */ 14 | drop if mi(pc11_state_id) | mi(pc11_district_id) 15 | 16 | /* reconcile variable names (though really should do this in the build files above) */ 17 | ren dlhs4* dlhs* 18 | 19 | /* key variables */ 20 | /* dlhs: dlhs4_total_beds, dlhs4_total_count, dlhs4_total_staff */ 21 | /* ec13: ec_emp_hosp_priv, ec_emp_hosp_gov */ 22 | /* pc11: pc_hosp_beds_u pc_clinic_beds_u */ 23 | 24 | /* generate private share from EC */ 25 | gen ec_priv_hosp_share = ec_emp_hosp_priv / (ec_emp_hosp_priv + ec_emp_hosp_gov) 26 | sum ec_priv_hosp_share,d 27 | /* tons of variation, from 0 to 1, med .52, close to uniform */ 28 | 29 | /* generate total ec emp in hospitals */ 30 | gen ec_emp_hosp_tot = ec_emp_hosp_priv + ec_emp_hosp_gov 31 | 32 | /* gen urban to rural doctor share */ 33 | gen pc_doc_u_share = pc_docs_pos_u / (pc_docs_pos_r + pc_docs_pos_u) 34 | 35 | /* gen urban to rural doctor in hospital share */ 36 | gen pc_hosp_doc_u_share = pc_docs_hosp_u / (pc_docs_hosp_r + pc_docs_hosp_u) 37 | 38 | /* scale up urban beds in pop census using rural share of doctors */ 39 | 40 | /* use overall doc share for clinic beds */ 41 | gen pc_clinic_beds = pc_clinic_beds_u / pc_doc_u_share 42 | 43 | /* use hospital doc share for hospital beds */ 44 | gen pc_hosp_beds = pc_hosp_beds_u / pc_hosp_doc_u_share 45 | 46 | /* scale up DLHS primary health clinics */ 47 | foreach v in beds count staff pop { 48 | replace dlhs_phc_`v' = dlhs_phc_`v' * dlhs_phc_mult 49 | } 50 | 51 | /* combine two DLHS clinic types */ 52 | egen dlhs_clinic_beds = rowtotal(dlhs_chc_beds dlhs_phc_beds) 53 | 54 | /* compare different clinic type counts */ 55 | corr dlhs_dh_beds dlhs_chc_beds dlhs_phc_beds dlhs_clinic_beds pc_clinic_beds pc_hosp_beds 56 | 57 | 58 | /* log correlation */ 59 | foreach v in dlhs_dh_beds dlhs_chc_beds dlhs_phc_beds dlhs_clinic_beds pc_clinic_beds pc_hosp_beds { 60 | gen ln_`v' = ln(`v' + 1) 61 | } 62 | 63 | corr ln_* 64 | 65 | 66 | 67 | dlhs4_dh_beds int %9.0g Total beds in district hospitals 68 | dlhs4_dh_count byte %9.0g Total district hospitals 69 | dlhs4_dh_staff int %9.0g Total staff district hospitals 70 | dlhs4_chc_beds int %9.0g Total beds in community health centers 71 | dlhs4_chc_count byte %9.0g Total community health centers 72 | dlhs4_chc_staff int %9.0g Total staff in community health centers 73 | dlhs4_phc_beds int %9.0g Total beds in primary health centers 74 | dlhs4_phc_count byte %9.0g Total primary health centers 75 | dlhs4_phc_staff int %9.0g Total staff in primary health centers 76 | dlhs4_phc_pop long %9.0g Population covered by sampled primary health centers 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | /***********************/ 91 | /* explore ICU shares */ 92 | /***********************/ 93 | use $health/DLHS4_FacilitySurveyData/AHS_FACILITY/AHS_dh, clear 94 | append using $health/DLHS4_FacilitySurveyData/NON_AHS_FACILITY/DH_NONAHS 95 | 96 | /* merge in pc11 districts */ 97 | merge m:1 state dist using $health/DLHS4_FacilitySurveyData/dlhs4_district_key, keepusing(pc11_state_id pc11_state_name pc11_district_id pc11_district_name) 98 | drop if _merge == 2 99 | drop _merge 100 | 101 | collapse (sum) qd2 
qd68_total, by(pc11_state_name pc11_state_id) 102 | 103 | gen ratio = qd68_total / qd2 104 | 105 | sort ratio 106 | list 107 | 108 | merge 1:1 pc11_state_id using $pc11/pc11_pca_state_clean, keepusing(pc11_pca_tot_p) 109 | 110 | gen icu_per_100k = qd68_total / pc11_pca_tot_p * 100000 111 | gen bed_per_k = qd2 / pc11_pca_tot_p * 1000 112 | 113 | sort icu_per_100k 114 | list pc11_state_name icu_per_100k bed_per_k 115 | -------------------------------------------------------------------------------- /e/create_agmark_plots.do: -------------------------------------------------------------------------------- 1 | use $covidpub/agmark/agmark_clean, clear 2 | drop if mi(lgd_state_id) 3 | 4 | /* adjust formats of identifying variables */ 5 | format date %dM_d,_CY 6 | tostring lgd_state_id, format("%02.0f") replace 7 | tostring lgd_district_id, format("%03.0f") replace 8 | 9 | /* indicate if something is a perishabel */ 10 | gen perishable = 1 if (group == 8 | group == 9 | group == 15) 11 | replace perishable = 0 if mi(perishable) 12 | 13 | /* save overall data file */ 14 | save $tmp/agmark_data, replace 15 | 16 | /****************/ 17 | /* TOTAL VOLUME */ 18 | /****************/ 19 | /* replace quantity with 0 if it's a number, we can't convert this, it's 0.28% of total entries */ 20 | replace qty = . if unit == 1 21 | 22 | /* first collapse to state-district-date level */ 23 | collapse (sum) qty, by(date lgd_state_id lgd_district_id) 24 | 25 | /* merge in covid case data */ 26 | merge m:1 date lgd_state_id lgd_district_id using $covidpub/covid/covid_infected_deaths 27 | drop _merge 28 | 29 | /* now collapse to national-day level */ 30 | collapse (sum) qty cases death, by(date) 31 | 32 | /* get year */ 33 | gen year = year(date) 34 | 35 | /* save */ 36 | save $tmp/agmark_total_ts, replace 37 | 38 | /**************/ 39 | /* LIVESTOCK */ 40 | /**************/ 41 | use $tmp/agmark_data, clear 42 | 43 | /* keep large livestock */ 44 | keep if item == 47 | item == 52 | item == 89 | item == 119 | item == 140 | item == 211 | item == 254 | item == 255 | item == 256 | item == 226 | item == 237 45 | 46 | /* collapse to state-district-date level */ 47 | collapse (sum) qty, by(date lgd_state_id lgd_district_id) 48 | 49 | /* merge in covid data by date */ 50 | merge m:1 date lgd_state_id lgd_district_id using $covidpub/covid/covid_infected_deaths 51 | drop _merge 52 | 53 | /* collapse to national-date level */ 54 | collapse (sum) qty cases death, by(date) 55 | 56 | /* create year */ 57 | gen year = year(date) 58 | 59 | /* save */ 60 | save $tmp/agmark_livestock_ts, replace 61 | -------------------------------------------------------------------------------- /e/describe_migration.do: -------------------------------------------------------------------------------- 1 | /* Matching migration data to covid data */ 2 | 3 | /* use covid dataset */ 4 | use $covidpub/covid/covid_infected_deaths.dta, clear 5 | 6 | /* merge migration data */ 7 | merge m:1 lgd_district_id using $covidpub/migration/district_migration.dta 8 | 9 | /* drop _merge */ 10 | drop _merge 11 | 12 | /* merge population data */ 13 | merge m:1 lgd_district_id using $covidpub/demography/dem_district.dta 14 | 15 | /* drop missing values */ 16 | drop if mi(lgd_state_id) 17 | drop if mi(lgd_district_id) 18 | 19 | /* generate migration district data (share in national total*total national migrants) */ 20 | gen outltmigration = outltmigrationshare * outltmigrantstotal 21 | 22 | /* gen per capita variables */ 23 | gen total_cases_pc = total_cases / 
pc11_pca_tot_p 24 | gen outltmigration_pc = outltmigration / pc11_pca_tot_p 25 | 26 | /* gen log variables */ 27 | foreach var in total_cases outltmigration pc11_pca_tot_p outltmigration_pc total_cases_pc { 28 | gen log_`var' = ln(`var') 29 | } 30 | 31 | /* keep latest covid data */ 32 | keep if date == 22082 33 | 34 | /* save dataset */ 35 | save $tmp/covid_migration.dta, replace 36 | 37 | /* binscatter log cases vs. log outmigratns */ 38 | binscatter log_total_cases log_outltmigration 39 | graphout cases_outmigrants 40 | 41 | /* repeat, controlling for population */ 42 | binscatter log_total_cases log_outltmigration, control(log_pc11_pca_tot_p) xlabel(7.5(.5)10.5) ylabel(2.5(.5)4.5) 43 | graphout cases_outmigrants_popcontrol 44 | 45 | /* per capita variables */ 46 | binscatter log_total_cases_pc log_outltmigration_pc, control(log_pc11_pca_tot_p) 47 | graphout cases_outmigrants_pc 48 | 49 | /* repeat, restricting to bihar and UP */ 50 | gen sample = inlist(lgd_state_name, "bihar", "uttar pradesh") 51 | binscatter log_total_cases log_outltmigration if sample == 1 52 | graphout cases_outmigrants_subsample 53 | binscatter log_total_cases log_outltmigration if sample == 1, control(log_pc11_pca_tot_p) xlabel(7.5(.5)10.5) ylabel(2.5(.5)4.5) xtitle("Log number typical outmigrants") ytitle("Log cases (5/22)") 54 | graphout cases_outmigrants_popcontrol_subsample 55 | binscatter log_total_cases_pc log_outltmigration_pc if sample == 1, control(log_pc11_pca_tot_p) 56 | graphout cases_outmigrants_pc_subsample 57 | 58 | reg log_total_cases log_pc11_pca_tot_p if sample == 1 59 | predict case_hat, resid 60 | reg log_outltmigration log_pc11_pca_tot_p if sample == 1 61 | predict outmigrants_hat, resid 62 | 63 | twoway (scatter case_hat outmigrants_hat if sample == 1, xtitle("Log residual number typical outmigrants") ytitle("Log residual cases (5/22)")) /// 64 | (lfit case_hat outmigrants_hat if sample & inrange(outmigrants_hat, -2, 2)) 65 | graphout scatter 66 | 67 | /* regression versions */ 68 | reg log_total_cases log_outltmigration 69 | reg log_total_cases log_outltmigration log_pc11_pca_tot_p 70 | reg log_total_cases log_outltmigration if sample == 1 71 | reg log_total_cases log_outltmigration log_pc11_pca_tot_p if sample == 1 72 | -------------------------------------------------------------------------------- /e/dlhs.do: -------------------------------------------------------------------------------- 1 | global out ~/iec/SAworking/hosp 2 | mkdir $out 3 | 4 | ls ~/iec/health/DLHS4_FacilitySurveyData/AHS_FACILITY 5 | ls ~/iec/health/DLHS4_FacilitySurveyData/NON_AHS_FACILITY 6 | 7 | /* explore data */ 8 | 9 | 10 | /* hospitals (dh), community health centers (chc), primary health centers (phc), sub-health centers (shc) */ 11 | 12 | /* AHS districts */ 13 | 14 | /* district hospitals */ 15 | use ~/iec/health/DLHS4_FacilitySurveyData/AHS_FACILITY/AHS_dh.dta , clear 16 | /* variables of interest: */ 17 | /* qd2 double %3.0f TOTAL NUMBER OF BEDS */ 18 | /* note: has beds broken out by type */ 19 | 20 | /* community health cetners */ 21 | use ~/iec/health/DLHS4_FacilitySurveyData/AHS_FACILITY/AHS_chc.dta , clear 22 | /* qc571 double %3.0f Total Number of beds in CHC */ 23 | 24 | /* primary health center */ 25 | use ~/iec/health/DLHS4_FacilitySurveyData/AHS_FACILITY/AHS_phc.dta , clear 26 | /* qp429a double %2.0f Total number of bed sanction for PHC */ 27 | /* qp429b double %2.0f Total number of bed available in PHC */ 28 | 29 | 30 | /* sub health centers */ 31 | use 
~/iec/health/DLHS4_FacilitySurveyData/AHS_FACILITY/AHS_shc.dta , clear 32 | /* NO INPATIENT CARE */ 33 | 34 | 35 | /* NON AHS districts */ 36 | 37 | /* district hospitals */ 38 | use ~/iec/health/DLHS4_FacilitySurveyData/NON_AHS_FACILITY/DH_NONAHS.dta , clear 39 | /* qd2 double %3.0f TOTAL NUMBER OF BEDS */ 40 | 41 | /* community health centers */ 42 | use ~/iec/health/DLHS4_FacilitySurveyData/NON_AHS_FACILITY/CHC_NONAHS.dta , clear 43 | /* qc571 double %3.0f Total Number of beds in CHC */ 44 | 45 | /* primary health center */ 46 | use ~/iec/health/DLHS4_FacilitySurveyData/NON_AHS_FACILITY/PHC_NONAHS.dta , clear 47 | /* qp429a double %2.0f Total number of bed sanction for PHC */ 48 | /* qp429b double %2.0f Total number of bed available in PHC */ 49 | 50 | 51 | /* scatter beds vs staff */ 52 | reg dlhs4_total_beds dlhs4_total_staff 53 | scatter dlhs4_total_beds dlhs4_total_staff 54 | graphout x 55 | 56 | /* explore */ 57 | exit 58 | 59 | forval i = 2/36 { 60 | tab state state_name if state == `i' 61 | } 62 | 63 | 64 | 65 | use $iec/health/hosp/hospitals_dist, clear 66 | 67 | /* sum vars */ 68 | sum dlhs4_perk_total_beds dlhs4_perk_total_facilities dlhs4_perk_total_staff , d 69 | sum pc_perk_beds_tot pc_perk_beds_allo pc_perk_beds_urb_tot pc_perk_beds_urb_allo , d 70 | sum ec_perk_emp_hosp_priv ec_perk_emp_hosp_gov ec_perk_emp_hosp_tot , d 71 | 72 | /* compare beds vars */ 73 | corr dlhs4_perk_total_beds pc_perk_beds_tot 74 | corr dlhs4_perk_total_beds pc_perk_beds_urb_tot 75 | reg dlhs4_perk_total_beds pc_perk_beds_tot 76 | reg dlhs4_perk_total_beds pc_perk_beds_urb_tot 77 | 78 | /* compare rank vars */ 79 | corr rank_dlhs4_perk_total_beds rank_pc_perk_beds_tot 80 | reg rank_dlhs4_perk_total_beds rank_pc_perk_beds_tot 81 | 82 | scatter rank_dlhs4_perk_total_beds rank_pc_perk_beds_tot 83 | graphout ranks 84 | 85 | /* compare bottom vars */ 86 | corr bot_dlhs4_perk_total_beds bot_pc_perk_beds_tot 87 | reg bot_dlhs4_perk_total_beds bot_pc_perk_beds_tot 88 | tab bot_dlhs4_perk_total_beds bot_pc_perk_beds_tot 89 | 90 | /* pc vs dlhs bed count */ 91 | gen pc_dlhs_beds_ratio = pc_beds_tot / dlhs4_total_beds 92 | gen pc_dlhs_priv_share = (pc_beds_tot - dlhs4_total_beds) / pc_beds_tot 93 | sum pc_dlhs_beds_ratio ec_priv_hosp_share, d 94 | corr pc_dlhs_beds_ratio ec_priv_hosp_share 95 | 96 | /* is pc capturing private hospitals? 
*/ 97 | tabstat ec_priv_hosp_share pc_dlhs_priv_share [aw=pc11_pca_tot_p], by(pc11_state_name) 98 | corr ec_priv_hosp_share pc_dlhs_priv_share [aw=pc11_pca_tot_p] 99 | /* doesn't look like it, since ec_priv_share seems more correlated */ 100 | 101 | 102 | -------------------------------------------------------------------------------- /e/explore_ec_microdata.do: -------------------------------------------------------------------------------- 1 | use $tmp/ec13_hosp, clear 2 | 3 | keep if nic == 861 4 | 5 | sum emp_all, d 6 | -------------------------------------------------------------------------------- /e/explore_idi_survey_r2.do: -------------------------------------------------------------------------------- 1 | global idi ~/iec/covid/idi_survey/round2 2 | 3 | /* import data */ 4 | use $idi/wb2_cleaned_2020_08_07, clear 5 | 6 | /* relabel demo_ag_hh var for easy interpretation on graphs */ 7 | label define a 0 "Non-ag household" 1 "Ag household" 8 | label values demo_ag_hh_r2 a 9 | 10 | /* create earnings variables */ 11 | foreach t in lckdwn curr { 12 | gen lab_`t'_earn_r2 = lab_`t'_wage_r2 * lab_`t'_freq_mean_r2 13 | } 14 | 15 | /* earnings change between lckdwn and r2 */ 16 | gen earn_change_r2 = (lab_curr_earn_r2 - lab_lckdwn_earn_r2) / lab_lckdwn_earn_r2 17 | 18 | /* top code earnings change */ 19 | sum earn_change_r2, d 20 | replace earn_change_r2 = . if earn_change_r2 > r(p95) 21 | 22 | /* label earnings change */ 23 | la var earn_change_r2 "% change in earnings since lockdown" 24 | 25 | /* gen indicator variable for whether an individual faced no difficulty in fertilizer purchase */ 26 | gen fert_diff = agr_fert_diffs_none_prop_r2 27 | 28 | label define df 1 "Faced no difficulty" 0 "Faced difficulty" 29 | label values fert_diff df 30 | 31 | /* 2 obs in r2 have negative weights - drop them */ 32 | drop if weight_hh_r2 < 0 33 | 34 | /* set scheme */ 35 | set scheme pn 36 | 37 | /**********/ 38 | /* Labour */ 39 | /**********/ 40 | 41 | /* 1. What are those who were unemployed in the previous round doing now? */ 42 | tab lab_curr_occu_r2 if lab_curr_occu_r1 == 0 43 | tab demo_ag_hh_r2 if lab_curr_occu_r1 == 0 44 | 45 | /* clone current occupation variable */ 46 | gen r2_occ = lab_curr_occu_r2 47 | replace r2_occ = 6 if lab_curr_occu_r1 == 0 & demo_ag_hh_r2 == 1 48 | replace r2_occ = . if r2_occ < 0 49 | 50 | /* label values */ 51 | label define r2 0 "Unemployed" 1 "Self-employed non-ag" 2 "Salaried pvt" 3 "Salaried govt" 4 "Daily wage ag" 5 "Daily wage non-ag" 6 "Working on own farm" 99 "Other" 52 | label values r2_occ r2 53 | 54 | /* plot */ 55 | la var r2_occ " " 56 | catplot r2_occ if lab_curr_occu_r1 == 0, title("Current occupation of sample unemployed during lockdown", margin(medium)) 57 | graphout lab_then_now 58 | 59 | /* 2. Labour market status since lockdown remains bleak */ 60 | graph bar lab_freq_change_r2 lab_wagechange_mean_r2 earn_change_r2 if inlist(lab_curr_occu_r2, 1, 2, 3, 4, 5) [aw = weight_hh_r2], ytitle("% change since lockdown", margin(small)) bargap(20) legend(label(1 "Weekly workdays change") label(2 "Daily wage change") label(3 "Weekly earnings change")) 61 | graphout lab_status 62 | 63 | /* 3. Who is still getting work */ 64 | cibar lab_freq_change_r2 if inlist(lab_curr_occu_r2, 1, 2, 3, 4, 5) [aw = weight_hh_r2], over(lab_curr_occu_r2) graphopts(ytitle("% change in weekly workdays since lockdown")) 65 | graphout recovery 66 | 67 | /***************/ 68 | /* Agriculture */ 69 | /***************/ 70 | 71 | /* 1.
general state of agriculture */ 72 | graph bar agr_land_change_mean_r2 agr_fert_cost_mean_r2 agr_borrow_mean_r2 agr_borrow_kcc_mean_r2 [aw = weight_hh_r2], ytitle("% change since last season") legend(label(1 "Planned land for kharif cultivation") label(2 "Fertilizer spending") label(3 "Borrowing") label(4 "Borrowing - KCC")) ylabel(-0.25 (0.05) 0.1) 73 | graphout ag_stat 74 | 75 | /* 2. fertilizers */ 76 | graph bar fert_diff agr_fert_price_all_inc_prop_r2 [aw = weight_hh_r2], bar(1, color(green)) bar(2, color(red)) bargap(30) legend(label( 1 "Faced no difficulty in fertilizer purchase") label(2 "Reported a price increase of fertilizers")) ytitle("Percentage", margin(small)) ylabel(0 (0.1) 0.6, grid) 77 | graphout fert 78 | 79 | /* 3. planned land for cultivation, by state */ 80 | ciplot agr_land_change_mean_r2 [aw = weight_hh_r2], by(state) xtitle("State") ytitle("% change in land planned for kharif cultivation") 81 | graphout state_land 82 | 83 | /* 4. graphs to show ag households are doing well */ 84 | cibar con_limit_wk_reduce_prop_r2 [aw = weight_hh_r2], over(demo_ag_hh_r2) graphopts(ytitle("Reduced proportion size of meals in the last week") ylabel(0 (0.02) 0.2) name(food_1, replace)) 85 | cibar con_limit_wk_out_prop_r2 [aw = weight_hh_r2], over(demo_ag_hh_r2) graphopts(ytitle("Ran out of food in the last week") ylabel(0 (0.02) 0.2) name(food_2, replace)) 86 | graph combine food_1 food_2, ycommon 87 | graphout ag_better 88 | 89 | /* 5. relief diff between ag/non-ag households */ 90 | ciplot rel_amt_received_mean_r2 [aw = weight_hh_r2], by(demo_ag_hh_r2) xtitle(" ") name(relief_amt, replace) 91 | graphout relief_amt 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /e/explore_pc_dlhs_doctors.do: -------------------------------------------------------------------------------- 1 | /* Investigates Hospital Definitions in DLHS4 and PC (Issue #14) */ 2 | 3 | /* open DLHS4 dataset */ 4 | use $covidpub/hospitals/dlhs4_hospitals_dist.dta, clear 5 | 6 | /* merge with PC data */ 7 | merge 1:1 pc11_district_id using $covidpub/hospitals/pc_hospitals_dist.dta 8 | drop if _merge != 3 9 | 10 | /* collapse to state level */ 11 | collapse (sum) dlhs4* pc11_pca_tot_p pc_*, by(pc11_state_id) 12 | 13 | /* add state names */ 14 | get_state_names, y(11) 15 | 16 | /* drop states with populations less than 5m */ 17 | drop if pc11_pca_tot_p < 5000000 18 | 19 | /* generate absolute values table */ 20 | sort pc_docs 21 | list pc11_state_name pc_docs pc_docs_hosp dlhs4_total_staff 22 | 23 | /* generate ratio variables */ 24 | gen docs_ratio = pc_docs / dlhs4_total_staff 25 | gen docs_hosp_ratio = pc_docs_hosp / dlhs4_total_staff 26 | gen ratio_diff = docs_hosp_ratio - docs_ratio 27 | 28 | /* generate ratios table */ 29 | sort docs_ratio 30 | list pc11_state_name docs_ratio docs_hosp_ratio ratio_diff 31 | -------------------------------------------------------------------------------- /e/gen_survey_map.py: -------------------------------------------------------------------------------- 1 | import geopandas as gpd 2 | import contextily as ctx 3 | import pandas as pd 4 | import matplotlib as mpl 5 | import matplotlib.pyplot as plt 6 | import getpass 7 | import os 8 | ​ 9 | ​ 10 | # select the population you want to work with and store full variable name in var 11 | var = "dummy" 12 | ​ 13 | df = pd.read_excel("nrega.xlsx") 14 | 15 | # convert the dataframe to a geodataframe 16 | df = gpd.GeoDataFrame(pd.read_excel("nrega.xlsx"), 
geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),crs={'init' :'epsg:4326'}) 17 | 18 | # convert the crs of the dataframe 19 | df = df.to_crs(epsg=3857) 20 | 21 | # sort values by longitude 22 | df = df.sort_values(by="longitude") 23 | 24 | # identify minimum and maximum values of variable of interest 25 | vmin = df[var].min() 26 | vmax = df[var].max() 27 | 28 | # set up a figure 29 | f, ax = plt.subplots(1, figsize=[10,15]) 30 | 31 | # choose colormap 32 | cmap = "viridis_r" 33 | 34 | # plot figure 35 | df.plot(column=var, ax=ax, vmin=vmin, vmax=vmax, cmap=cmap, alpha=0.85) 36 | 37 | # add basemap 38 | ctx.add_basemap(ax, source=ctx.providers.Stamen.TonerLite, zoom=6) 39 | 40 | # set axis parameters - these are manually set to be the window over all of India 41 | ax.set_xlim([7510000, 10000000]) 42 | ax.set_ylim([1250000, 3750000]) 43 | ax.axes.xaxis.set_visible(False) 44 | ax.axes.yaxis.set_visible(False) 45 | 46 | # set plot title 47 | ax.set_title(f"% reporting NREGA unavailability", fontsize=18, pad=8) 48 | 49 | # add colorbar 50 | cax = f.add_axes([0.93, .25, 0.025, 0.5]) 51 | sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax)) 52 | 53 | # fake up the array of the scalar mappable. 54 | sm._A = [] 55 | cb = f.colorbar(sm, cax=cax) 56 | 57 | # label the colorbar 58 | cb.set_label(label='Share', fontsize=16, rotation=270, labelpad=30) 59 | cb.ax.tick_params(labelsize=14) 60 | 61 | plt.savefig("nrega.png", bbox_inches="tight", dpi=300) 62 | 63 | 64 | # save the figure again to alternative output locations 65 | username = getpass.getuser() 66 | plt.savefig(os.path.join("/scratch", username, f"{var}.png"), bbox_inches="tight", dpi=300) 67 | 68 | 69 | plt.savefig("/scratch/adibmk/labor_lost_work.png", bbox_inches="tight", dpi=300) 70 | 71 | 72 | plt.savefig(f"{var}.png", bbox_inches="tight", dpi=300) 73 | 74 | plt.close("all") 75 | 76 | -------------------------------------------------------------------------------- /e/get_vac_data.do: -------------------------------------------------------------------------------- 1 | /* define lgd matching programs */ 2 | qui do $ddl/covid/covid_progs.do 3 | qui do $ddl/tools/do/tools.do 4 | 5 | /* Pull district-level vaccination data from covid19india API */ 6 | import delimited "https://api.covid19india.org/csv/latest/cowin_vaccine_data_districtwise.csv", clear 7 | 8 | /* rename all the variables */ 9 | local k = 7 10 | local j = 1 11 | 12 | foreach var of var v* { 13 | 14 | local label : variable label `var' 15 | local label: subinstr local label "/" "" 16 | local label: subinstr local label "/" "" 17 | local label: subinstr local label "."
"_" 18 | 19 | ren v`k' v_`label'_`j' 20 | local k = `k'+1 21 | local j = `j'+1 22 | 23 | cap ren v_`label'_* v_`label'_(#), renumber 24 | 25 | } 26 | 27 | /* drop first row containing variable names in the raw API */ 28 | drop in 1 29 | 30 | /* tag duplicates */ 31 | duplicates tag state_code district_key, gen(tag) 32 | keep if tag == 0 33 | drop tag 34 | cap drop v__* 35 | 36 | /* more renaming */ 37 | forval i = 1/10 { 38 | ren v*`i' v`i'* 39 | } 40 | 41 | ren v*_ v* 42 | 43 | /* reshape data from wide to long */ 44 | reshape long v1_ v2_ v3_ v4_ v5_ v6_ v7_ v8_ v9_ v10_, i(state district state_code district_key cowinkey) j(date) string 45 | 46 | destring v*, replace 47 | 48 | /* label variables */ 49 | la var v1_ "Total Individuals Registered" 50 | la var v2_ "Total Sessions Conducted" 51 | la var v3_ "Total Sites" 52 | la var v4_ "First Dose Administered" 53 | la var v5_ "Second Dose Administered" 54 | la var v6_ "Male(Individuals Vaccinated)" 55 | la var v7_ "Female(Individuals Vaccinated)" 56 | la var v8_ "Transgender(Individuals Vaccinated)" 57 | la var v9_ "Total Covaxin Administered" 58 | la var v10_ "Total CoviShield Administered" 59 | 60 | /* rename final vars */ 61 | ren v1_ total_reg 62 | ren v2_ total_sessions 63 | ren v3_ total_sites 64 | ren v4_ total_first_dose 65 | ren v5_ total_second_dose 66 | ren v6_ total_vac_male 67 | ren v7_ total_vac_female 68 | ren v8_ total_vac_trans 69 | ren v9_ total_covaxin 70 | ren v10_ total_covishield 71 | 72 | /* create time variable */ 73 | gen day = substr(date, 1, 2) 74 | gen month = substr(date, 3, 2) 75 | gen year = substr(date, 5, 4) 76 | 77 | destring day month year, replace 78 | gen edate = mdy(month, day, year) 79 | format edate %dM_d,_CY 80 | 81 | /* generate unique id on district key and date */ 82 | egen id = group(district_key edate) 83 | isid id 84 | 85 | /* set as panel */ 86 | xtset id edate, daily 87 | 88 | save $tmp/vaccines_clean , replace 89 | 90 | /****************/ 91 | /* match to LGD */ 92 | /****************/ 93 | use $tmp/vaccines_clean, clear 94 | 95 | /* drop extra variables */ 96 | drop district_key state_code 97 | 98 | /* create lgd_state variable to merge */ 99 | gen lgd_state_name = lower(state) 100 | 101 | /* fix dadra and nager haveli and daman and diu */ 102 | replace lgd_state_name = "dadra and nagar haveli" if district == "Dadra and Nagar Haveli" 103 | replace lgd_state_name = "daman and diu" if (district == "Daman") | (district == "Diu") 104 | 105 | /* merge in lgd state id */ 106 | merge m:1 lgd_state_name using $keys/lgd_state_key, keepusing(lgd_state_id) keep(match master) nogen 107 | 108 | /* now create an lgd_district variable to merge */ 109 | gen lgd_district_name = lower(district) 110 | 111 | /* fix misspellings and name changes */ 112 | synonym_fix lgd_district_name, synfile($ddl/covid/b/str/cov19india_vaccine_district_fixes.txt) replace 113 | 114 | /* save */ 115 | save $tmp/temp, replace 116 | 117 | /* run masala merge */ 118 | keep lgd_state_name lgd_district_name 119 | duplicates drop 120 | masala_merge lgd_state_name using $keys/lgd_district_key, s1(lgd_district_name) minbigram(0.2) minscore(0.6) outfile($tmp/vaccine_lgd_district) 121 | 122 | /* keep master matches */ 123 | keep if match_source < 7 124 | 125 | /* drop unneeded variables */ 126 | keep lgd_state_name lgd_district_name_using lgd_district_name_master 127 | 128 | /* merge data back in */ 129 | ren lgd_district_name_master lgd_district_name 130 | merge 1:m lgd_state_name lgd_district_name using $tmp/temp 131 | drop _merge 132 | 
133 | /* now replace the district name with the lgd key name */ 134 | drop lgd_district_name 135 | ren lgd_district_name_using lgd_district_name 136 | 137 | /* ensure that it is it square */ 138 | egen dgroup = group(lgd_state_name lgd_district_name) 139 | fillin date dgroup 140 | drop dgroup _fillin 141 | 142 | /* save data */ 143 | export delimited using "$tmp/covid_vaccination.csv", replace 144 | -------------------------------------------------------------------------------- /e/graphs_idi_r3.do: -------------------------------------------------------------------------------- 1 | use $iec/covid/idi_survey/wb3_clean, clear 2 | 3 | drop if weight_hh_r3 < 0 4 | 5 | global ag agr_loc_shift_prop_r3 agr_harvest_outlook_prop_r3 6 | 7 | /* collapse dataset to shrid level */ 8 | collapse (mean) $ag *change* hea_symp*prop_r3 rel_pds_any_prop_r3 rel*mean_r3 con_stillinsecure_prop* lab_occ*none_r3 (firstnm) state [pw = weight_hh_r3] , by(shrid) 9 | 10 | /* merge to shrids */ 11 | merge 1:1 shrid using $iec/covid/idi_survey/survey_shrid_data.dta, keep(master match) nogen 12 | 13 | /* add nightlights data */ 14 | merge 1:1 shrid using $shrug/data/shrug_nl_wide, keep(master match) nogen keepusing(*2013) 15 | 16 | /* sc-st share */ 17 | merge 1:1 shrid using $shrug/data/shrug_pc11_pca, keepusing(*pca_tot_p *p_sc) keep(master match) nogen 18 | 19 | /* poverty rate */ 20 | merge 1:1 shrid using $shrug/data/shrug_secc, keepusing(secc_pov_rate_rural) keep(master match) nogen 21 | 22 | /* village directory chars */ 23 | merge 1:1 shrid using $shrug/data/shrug_pc11_vd, keepusing(pc11_vd_asha pc11_vd_ams *wkl_haat *vd_mrkt) keep(master match) nogen 24 | 25 | /* keep only variables we need */ 26 | keep $ag *change* land* pc11_pca* ec13* tdist* rural* *light* secc* pc11_vd* hea_symp*prop_r3 rel_pds_any_prop_r3 rel*mean_r3 shrid state *insecure* *none* 27 | 28 | /* generate sc population share */ 29 | gen sc_share = pc11_pca_p_sc/pc11_pca_tot_p 30 | 31 | /************************/ 32 | /* Analysis begins here */ 33 | /************************/ 34 | 35 | set scheme pn 36 | 37 | /* consumption recovery */ 38 | ren con_stillinsecure_prop* insecure* 39 | 40 | twoway lfitci insecure_r3 secc_pov_rate_rural, ytitle("% HH still food insecure - Sept 2020", margin(medium)) xtitle("Poverty % in village - SECC") name(insecure2, replace) /// 41 | note("Note: The Y-axis shows % of HH in the village that became food insecure due to the pandemic and haven't recovered", size(vsmall)) 42 | graphout cons_pov 43 | 44 | /* relief poverty rate */ 45 | /* MNREGA targeting */ 46 | /* take logs */ 47 | gen temp = rel_mnrega_wages_mean_r3 + 1 48 | gen ln_mnrega = ln(temp) 49 | replace temp = secc_pov_rate_rural + 1 50 | gen ln_pov = ln(temp) 51 | drop temp 52 | 53 | reg ln_mnrega ln_pov 54 | 55 | /* Save coefficients for graph */ 56 | local beta_pov = round(_b[ln_pov],0.001) 57 | 58 | test _b[ln_pov] = 0 59 | local p_val = round(`r(p)', 0.001) 60 | 61 | twoway lfitci ln_mnrega ln_pov, ytitle("Log (mean MNREGA wages received)") xtitle("Log (SECC village poverty rate)") clcolor(navy) acolor(ltblue%80) /// 62 | text( 5.4 5.5 "ln(MNREGA wage) on ln(poverty)" " " "Regression coefficient: 0`beta_pov'***", orient(horizontal) size(vsmall) justification(center) fcolor(white) box margin(small)) 63 | graphout targeting 64 | 65 | /* unemployment as of september 2020 */ 66 | twoway lfitci lab_occ_none_r3 tdist_100, ytitle("% HH unemployed - September 2020") xtitle("Distance to nearest town (Km)") name(unemp_1, replace) ylabel(0.25 (.05) .6) 
clcolor(navy) acolor(green) 67 | twoway lfitci lab_occ_lckdwn_none_r3 tdist_100, ytitle("% HH unemployed - Lockdown") xtitle("Distance to nearest town (Km)") name(unemp_2, replace) ylabel(0.25 (.05) .6) clcolor(pink) acolor(red) 68 | 69 | graph combine unemp_2 unemp_1, rows(1) 70 | graphout unemp_urb 71 | 72 | /* agriculture infrastructure - roads, access to AMS, mandis */ 73 | ren agr_loc_shift_prop_r3 shift_loc 74 | 75 | reg shift_loc rural_road 76 | estimates store Road 77 | reg shift_loc pc11_vd_mrkt 78 | estimates store Mandi 79 | 80 | la var pc11_vd_mrkt "Village has regular mandis" 81 | 82 | coefplot Road Mandi, drop(_cons) yline(0) scheme(plottig) levels(90) legend(label(1 "Village has a road", 2 "Village has weekly mandis", 3 "Distance to nearest city")) recast(bar) vertical ytitle("Outcome: Whether cultivators shifted selling location in 2020", size(small)) 83 | graphout infra 84 | 85 | /* Harvest outlook */ 86 | reg harvest_outlook landless_share 87 | 88 | /* Save coefficients for graph */ 89 | local beta_land = round(_b[landless_share],0.001) 90 | 91 | twoway lfitci harvest_outlook landless_share, ytitle("% of farmers with a +ve harvest outlook this year vs. last year") xtitle("Census: Share of landless working age population (18-65)") clcolor(sienna) acolor(sand%80) /// 92 | text( 0.6 0.7 "Harvest outlook on landless share" " " "Regression coefficient: `beta_land'***", orient(horizontal) size(vsmall) justification(center) fcolor(white) box margin(small)) 93 | graphout ag_outlook 94 | 95 | /* wage change versus poverty rate */ 96 | ren lab_wagechange_mean* wagechange* 97 | 98 | -------------------------------------------------------------------------------- /e/pop_estimates_21.csv: -------------------------------------------------------------------------------- 1 | state_name pop_2021_est 2 | uttar pradesh 240000000 3 | maharashtra 123100000 4 | bihar 124700000 5 | west bengal 99600000 6 | madhya pradesh 85400000 7 | tamil nadu 77800000 8 | rajasthan 81000000 9 | karnataka 67600000 10 | gujarat 63900000 11 | andhra pradesh 53900000 12 | odisha 46400000 13 | telangana 38500000 14 | kerala 35700000 15 | jharkhand 38600000 16 | assam 35600000 17 | punjab 30100000 18 | chhattisgarh 29400000 19 | haryana 28200000 20 | delhi 18700000 21 | jammu & kashmir 13600000 22 | uttarakhand 11200000 23 | himachal pradesh 7450000 24 | tripura 4170000 25 | meghalaya 3340000 26 | manipur 3091000 27 | nagaland 2300000 28 | goa 1586000 29 | arunachal pradesh 1570000 30 | puducherry 1413000 31 | mizoram 1239000 32 | chandigarh 1158000 33 | sikkim 690000 34 | dadra & nagar haveli and daman & diu 615000 35 | andaman & nicobar 417000 36 | ladakh 289000 37 | lakshadweep 73183 38 | -------------------------------------------------------------------------------- /e/prep_dlhs_data.do: -------------------------------------------------------------------------------- 1 | /* This file preps DLHS Data 2 | 1. structure DLHS data, combine by state 3 | 2. merge with PC11 state and district codes 4 | */ 5 | 6 | /**************************/ 7 | /* 1. 
Structure DLHS data */ 8 | /**************************/ 9 | 10 | /* initiate empty files for each */ 11 | cap mkdir $tmp/dlhs 12 | clear 13 | save $tmp/dlhs/dlhs_BIRTH, emptyok replace 14 | save $tmp/dlhs/dlhs_cab, emptyok replace 15 | save $tmp/dlhs/dlhs_HOUSEHOLD, emptyok replace 16 | save $tmp/dlhs/dlhs_IMMU, emptyok replace 17 | save $tmp/dlhs/dlhs_marriage, emptyok replace 18 | save $tmp/dlhs/dlhs_person, emptyok replace 19 | save $tmp/dlhs/dlhs_village, emptyok replace 20 | save $tmp/dlhs/dlhs_WOMAN, emptyok replace 21 | 22 | /* combine state data for each file type */ 23 | local statelist Andaman_Nicobar AndhraPradesh ArunachalPradesh Chandigarh GOA Haryana HimachalPradesh Karnataka Kerala Maharashtra Manipur Meghalaya Mizoram Nagaland Puducherry Punjab Sikkim TamilNadu Telangana Tripura WestBengal 24 | 25 | /* cycle through all states with dlhs data */ 26 | foreach state in `statelist' { 27 | 28 | /* get the list of files in the state folder */ 29 | local filelist: dir "$health/dlhs/raw/`state'" files "*cab.dta" 30 | 31 | /* cycle through the data files for this state */ 32 | foreach file in `filelist' { 33 | 34 | /* extract the name of this file */ 35 | tokenize "`file'" , parse("_") 36 | local var = "`3'" 37 | 38 | /* open the file */ 39 | use $health/dlhs/raw/`state'/`file', clear 40 | qui count 41 | local counter = `counter' + `r(N)' 42 | 43 | /* save the state name */ 44 | gen state_name = "`state'" 45 | replace state_name = lower(state_name) 46 | 47 | /* append to the full file */ 48 | append using $tmp/dlhs/dlhs_`var' 49 | 50 | /* resave full file */ 51 | save $tmp/dlhs/dlhs_`var', replace 52 | } 53 | } 54 | 55 | 56 | /****************************/ 57 | /* 2. Match with PC11 codes */ 58 | /****************************/ 59 | /* 05/19/20 - for now this only deals with the cab data, merging in some hh variables from ahs_comb */ 60 | 61 | /* open the DLHS data file, clean and save */ 62 | use $tmp/dlhs/dlhs_cab, clear 63 | 64 | /* clean state names to match pc11_state_name */ 65 | gen pc11_state_name = state_name 66 | replace pc11_state_name = subinstr(pc11_state_name, "pradesh", " pradesh", .) 
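/* (added illustration -- not part of the original cleaning logic) the subinstr above splits
   run-together folder names before "pradesh", e.g. "andhrapradesh" -> "andhra pradesh" and
   "himachalpradesh" -> "himachal pradesh". a quick non-destructive check that no concatenated
   "...pradesh" names remain: */
qui count if regexm(pc11_state_name, "[a-z]pradesh")
di "state names still missing a space before pradesh: " r(N)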
67 | replace pc11_state_name = "andaman nicobar islands" if pc11_state_name == "andaman_nicobar" 68 | replace pc11_state_name = "tamil nadu" if pc11_state_name == "tamilnadu" 69 | replace pc11_state_name = "andhra pradesh" if pc11_state_name == "telangana" 70 | replace pc11_state_name = "west bengal" if pc11_state_name == "westbengal" 71 | 72 | /* merge in pc11 id from key */ 73 | merge m:1 pc11_state_name dist using $health/dlhs/dlhs4_district_key, keepusing(pc11_state_id pc11_district_id) keep(match master) nogen 74 | 75 | /* Basic Cleaning */ 76 | /* drop 14,076 records from Karnataka that have all data fields missing */ 77 | drop if mi(psu) 78 | 79 | /* drop duplicates - force these to drop as these are all duplicated records but won't 80 | get dropped with a simple duplicates drop because of missing values */ 81 | duplicates drop primekeynew, force 82 | 83 | /* rename the primekeynew to be an index for DLHS */ 84 | ren primekeynew index 85 | 86 | /* create pregnancy indicator */ 87 | gen pregnant = 1 if !mi(hv81) & (hv81 == 1 | hv81 == 2) 88 | replace pregnant = 0 if mi(pregnant) 89 | 90 | /* define dlhs_merge variable to describe which observations are from cab, comb, or both */ 91 | gen dlhs_merge = 3 92 | replace dlhs_merge = 2 if mi(q77_intro) | q77_intro == 2 93 | cap label define dlhs_merge 1 "cab only" 2 "comb only" 3 "cab & comb" 94 | label values dlhs_merge dlhs_merge 95 | 96 | /* save in permanent dlhs folder */ 97 | save $health/dlhs/dlhs_cab, replace 98 | -------------------------------------------------------------------------------- /forecasting/README.md: -------------------------------------------------------------------------------- 1 | # COVID forecasting map 2 | 3 | This subfolder contains the backend code to construct the DDL COVID forecasting interactive map, [hosted here](http://www.devdatalab.org/covid-forecast). Forecast variables such as prospective Rt featured in the map are provided by [COVID_metrics](https://twitter.com/COVID_metrics), and supplement DDL COVID data from this repository. 4 | 5 | 6 | Note: this codebase is not intended to be entirely executable or reproducible, rather by open-sourcing we hope to share our methodologies and increase transparency of the approaches taken in the data processing steps for the DDL COVID forecasting map. 7 | 8 | 9 | ![DAG](covid_dag.png) 10 | -------------------------------------------------------------------------------- /forecasting/b/create_vector_tileset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # this script takes geojson district data and creates a vector tileset for pushing to mapbox. 4 | # this requires tippecanoe and tile-join, which are installed in ~/iec/local/share/tippecanoe/ 5 | 6 | # note: --generate-ids option is required for referencing feature ids in 7 | # e.g. hover effects. from Mapbox: "mapbox/tippecanoe#615 adds the most 8 | # basic --generate-ids option (using the input feature sequence for the 9 | # ID), with the disclaimer that the IDs are not stable and that their 10 | # format may change in the future." 
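# usage sketch (added note; the geojson file names below are illustrative, not the pipeline's
# actual paths): $1 is the geojson carrying the full time series used for plotting, and $2 is
# the geojson holding only the most recent observation per district used for the choropleth map:
#   bash create_vector_tileset.sh $TMP/district_plot_data.geojson $TMP/district_map_data.geojson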
11 | 12 | # create full-data district tileset with zoom range defined (cost saver) 13 | ~/iec/local/share/tippecanoe/tippecanoe --force -z8 -Z5 -o $TMP/covid_data_plot.mbtiles --read-parallel --coalesce-smallest-as-needed --detect-shared-borders --generate-ids $1 14 | 15 | # create district tileset with most recent observations (for map) 16 | ~/iec/local/share/tippecanoe/tippecanoe --force -z8 -Z5 -o $TMP/covid_data_map.mbtiles --read-parallel --coalesce-smallest-as-needed --detect-shared-borders --generate-ids $2 17 | 18 | # merge tilesets 19 | ~/iec/local/share/tippecanoe/tile-join --force -o $TMP/covid_data.mbtiles $TMP/covid_data_map.mbtiles $TMP/covid_data_plot.mbtiles 20 | -------------------------------------------------------------------------------- /forecasting/b/data_to_geojson.py: -------------------------------------------------------------------------------- 1 | # take DTA data and joins with shapefiles for both shrid and dist 2 | # outputs geojson, which will then be merged into a tileset using tippecanoe 3 | # depends on py_spatial env (run from snakemake) 4 | 5 | 6 | ############ 7 | # Preamble # 8 | ############ 9 | 10 | import sys, os, importlib 11 | import geopandas as gpd 12 | import pandas as pd 13 | import argparse 14 | 15 | # import ddlpy utils 16 | from ddlpy.geospatialtools.utils import import_vector_data 17 | 18 | # initialize args 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--intable", type=str) 21 | parser.add_argument("--inshp", type=str) 22 | parser.add_argument("--outfile", type=str) 23 | args = parser.parse_args() 24 | 25 | # define tabular import fn 26 | def import_tabular_data(fp): 27 | """ 28 | Reads in tabular data with file extension checks 29 | fp: filepath for datafile to be imported, must bs shp/csv/dta/excel 30 | """ 31 | # expand data filepath 32 | fp = os.path.expanduser(fp) 33 | 34 | # assert that the data file exists 35 | if not os.path.isfile(fp): 36 | raise OSError("Input file not found") 37 | 38 | # ensure that the data file is a readable format 39 | fp_ext = os.path.splitext(fp)[1] 40 | if fp_ext not in [".csv", ".dta", ".xls", ".xlsx"]: 41 | raise ValueError("Data must be .dta, .csv, .xlsx/.xls format") 42 | 43 | # read in csv 44 | if fp_ext == ".csv": 45 | target_df = pd.read_csv(fp) 46 | 47 | # read in excel 48 | if fp_ext in [".xls", "xlsx"]: 49 | target_df = pd.read_excel(fp) 50 | 51 | # read in dta 52 | if fp_ext == ".dta": 53 | target_df = pd.read_stata(fp) 54 | 55 | return target_df 56 | 57 | # function to merge tabular data with a shapefile / gdf object 58 | def table_geodataframe_join(poly_in, join_id, fp_table, fp_out=""): 59 | 60 | # expand filepaths 61 | fp_table = os.path.expanduser(fp_table) 62 | fp_out = os.path.expanduser(fp_out) 63 | 64 | # assert that the filepaths exist 65 | if not os.path.isfile(fp_table): 66 | raise OSError("Tabular data file not found") 67 | 68 | # read in the tabular data 69 | tab_data = import_tabular_data(fp_table) 70 | 71 | # execute the merge 72 | # joined = poly_in.merge(tab_data, on=join_id, how='left') 73 | # inner join removes district polygons wihtout data rather than keeping empty geometries 74 | joined = poly_in.merge(tab_data, on=join_id, how='inner') 75 | 76 | # convert any categorical columns to string (breaks to_file gpd method) 77 | for column in joined.select_dtypes(include='category').columns: joined[column] = joined[column].astype('string') 78 | 79 | # write to geojson in desired location 80 | joined.to_file(fp_out, driver="GeoJSON") 81 | 82 | 83 | 
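# minimal toy illustration (added; never called by the pipeline) of why table_geodataframe_join()
# uses how='inner': a district polygon with no matching tabular row is dropped rather than written
# out with empty attributes. the ids and values below are invented purely for this example.
def _toy_inner_join_example():
    toy_poly = gpd.GeoDataFrame(
        {"lgd_d_id": ["001", "002"]},
        geometry=gpd.points_from_xy([77.2, 72.8], [28.6, 19.0]),
        crs="EPSG:4326",
    )
    toy_tab = pd.DataFrame({"lgd_d_id": ["001"], "rt_pred": [1.1]})
    # the inner join keeps only district "001"; how='left' would keep "002" with missing values
    return toy_poly.merge(toy_tab, on="lgd_d_id", how="inner")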
################# 84 | # District data # 85 | ################# 86 | 87 | # read in district shapefile simplified on mapshaper.org 88 | dist_poly = import_vector_data(f'{args.inshp}') 89 | 90 | # run the join 91 | print("initiating district-level join") 92 | table_geodataframe_join(poly_in=dist_poly, join_id='lgd_d_id', fp_table=f'{args.intable}', fp_out=os.path.expanduser(f'{args.outfile}')) 93 | 94 | -------------------------------------------------------------------------------- /forecasting/b/merge_ddl_pred_data.do: -------------------------------------------------------------------------------- 1 | /****************************/ 2 | /* District level data join */ 3 | /****************************/ 4 | 5 | /* pull globals */ 6 | process_yaml_config ~/ddl/covid/forecasting/config/config.yaml 7 | 8 | /* combine DDL covid data and UChicago predictions */ 9 | use $cdata/pred_data_district, clear 10 | 11 | /* merge in DDL data */ 12 | merge m:1 lgd_state_id lgd_district_id using $cdata/ddl_data 13 | keep if _merge == 3 14 | drop _merge 15 | 16 | /* some var cleanup. start with formatting ids and geonames */ 17 | ren lgd_state_id lgd_s_id 18 | ren lgd_district_id lgd_d_id 19 | ren lgd_state_name lgd_s_name 20 | ren lgd_district_name lgd_d_name 21 | 22 | /* capitalize geonames */ 23 | replace lgd_s_name = upper(substr(lgd_s_name,1,1)) + substr(lgd_s_name,2,.) 24 | replace lgd_d_name = upper(substr(lgd_d_name,1,1)) + substr(lgd_d_name,2,.) 25 | 26 | /* other var tweaks */ 27 | ren dates date 28 | foreach var of varlist rt_* *cases* { 29 | replace `var' = round(`var', .01) 30 | } 31 | 32 | /* confirm drop of extraneous modeling vars */ 33 | cap drop t_* 34 | 35 | /* save to permadir */ 36 | save $cdata/merged_data_district, replace 37 | 38 | /* CSV version */ 39 | outsheet using $cdata/merged_data_district.csv, comma replace 40 | -------------------------------------------------------------------------------- /forecasting/b/old/push_predicted_metadata.py: -------------------------------------------------------------------------------- 1 | # push predicted covid variable metadata to DDL AWS bucket (web server) 2 | # in practice this is just the most recent rt_pred date from the latest run in a js object 3 | # this will then be used as the basis for the choropleth in the web app 4 | 5 | # note: you need the aws cli and an operational config for this to work (currently only TL has this) 6 | # but can easily set up for others 7 | 8 | import json 9 | import requests 10 | import argparse 11 | import boto3 12 | import os 13 | 14 | # initialize args 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--file", type=str) 17 | args = parser.parse_args() 18 | 19 | # pull file input into python obj 20 | pushfile = f'{args.file}' 21 | fname = os.path.basename(pushfile) 22 | 23 | ########################## 24 | # upload new zips to AWS # 25 | ########################## 26 | 27 | # status report 28 | print(f'pushing data from {pushfile} to AWS') 29 | 30 | # initialize the boto s3 resource 31 | s3 = boto3.resource('s3') 32 | 33 | # execute AWS command to push the new zip file to S3. 
34 | # This requires your aws cli be configured properly, and depends on the current bucket subdirectory configuration 35 | data = open(pushfile, 'rb') 36 | s3.Bucket('shrug-assets-ddl').put_object(Key='static/main/assets/other/' + fname, Body=data, ACL='public-read', ContentType='text/html') 37 | -------------------------------------------------------------------------------- /forecasting/b/process_ddl_data.do: -------------------------------------------------------------------------------- 1 | /* process DDL covid data for merging with dist-level predictions */ 2 | 3 | /* FIXME TODO: paths - use globals */ 4 | /* pull globals */ 5 | process_yaml_config ~/ddl/covid/forecasting/config/config.yaml 6 | 7 | /* read from covidi repo output */ 8 | use ~/iec/covid/hospitals/pc_hospitals_dist.dta , clear 9 | 10 | /* keep vars to include in the tileset */ 11 | keep lgd_*id pc_clinics pc_num_hospitals 12 | 13 | /* write out for merging */ 14 | save $cdata/ddl_data, replace 15 | -------------------------------------------------------------------------------- /forecasting/b/process_predicted_data.do: -------------------------------------------------------------------------------- 1 | /* assemble statewise district-level Rt estimates manually from CSVs */ 2 | /* TODO FIXME: get rid of absolute paths and use globals */ 3 | 4 | /* pull globals */ 5 | process_yaml_config ~/ddl/covid/forecasting/config/config.yaml 6 | 7 | /* function to append state and district files */ 8 | cap prog drop append_covid_estimates 9 | prog def append_covid_estimates 10 | syntax anything 11 | local geo "`anything'" 12 | 13 | /* get all state or dist-level files */ 14 | global imports $cdata/all_rt_estimates 15 | local files : dir "$imports" files "*`geo'*.csv" 16 | 17 | /* loop over files to save as .dta and append. 
slow logic but concise */ 18 | clear 19 | save $tmp/covid_appender, emptyok replace 20 | foreach file in `files' { 21 | insheet using $imports/`file', names clear 22 | local state_abbrev = substr("`file'", 1, 2) 23 | drop v1 24 | append using $tmp/covid_appender 25 | save $tmp/covid_appender, replace 26 | } 27 | end 28 | 29 | /*************/ 30 | /* Districts */ 31 | /*************/ 32 | 33 | /* append raw data */ 34 | append_covid_estimates district 35 | 36 | /* stringify ids */ 37 | gen tmp = string(lgd_state_id,"%02.0f") 38 | drop lgd_state_id 39 | ren tmp lgd_state_id 40 | gen tmp = string(lgd_district_id,"%03.0f") 41 | drop lgd_district_id 42 | ren tmp lgd_district_id 43 | 44 | /* HACK - get rid of duplicates on date */ 45 | count 46 | local pre_drop `r(N)' 47 | ddrop lgd_state_id lgd_district_id dates 48 | count 49 | local post_drop `r(N)' 50 | di "`post_drop' / `pre_drop'" 51 | assert `post_drop' / `pre_drop' > 0.997 52 | 53 | /* assert there are no duplicate entries for any district at any date */ 54 | distinct lgd_state_id lgd_district_id dates, joint 55 | assert `r(ndistinct)' == `r(N)' 56 | 57 | /* final output for dist data - save to tmp as we'll be merging back districts we want to keep */ 58 | drop state district 59 | order lgd* dates, first 60 | drop t_* 61 | save $tmp/pred_data_all_dists, replace 62 | 63 | /* new data file with single entry of latest date for each district - 64 | used for choropleth */ 65 | 66 | /* get most recent date for imputation */ 67 | gen sdate = date(dates, "YMD") 68 | gsort -sdate 69 | 70 | /* keep most recent observed rt_pred for each district */ 71 | keep if !mi(rt_pred) 72 | gsort lgd_district_id lgd_state_id dates 73 | bysort lgd_district_id lgd_state_id : gen order = _n 74 | by lgd_district_id lgd_state_id: gen latest = _n == _N 75 | keep if latest 76 | 77 | /* DROP IF OVER A MONTH OUT OF DATE */ 78 | 79 | /* create a var for the lag between today and the most recent date */ 80 | gen lag = sdate - daily("`c(current_date)'", "DMY") 81 | 82 | /* drop if over 30 days out of date */ 83 | drop if lag < -30 84 | 85 | /* create 100xed Rt for scaling (MB only allows interpolated fills with integer stops...) 
*/ 86 | gen rt_pred_100x = 100 * rt_pred 87 | 88 | /* merge back to district data */ 89 | preserve 90 | keep lgd_district_id lgd_state_id 91 | merge 1:m lgd_district_id lgd_state_id using $tmp/pred_data_all_dists, keep(match) nogen 92 | 93 | /* DTA and CSV versions */ 94 | save $cdata/pred_data_district, replace 95 | outsheet using $cdata/pred_data_district.csv, comma replace 96 | restore 97 | 98 | /* keep only bare minimum of variables */ 99 | ren lgd_district_id lgd_d_id 100 | keep lgd_d_id rt_pred_100x 101 | 102 | /* save for adding to tileset */ 103 | save $cdata/pred_data_rt_choropleth, replace 104 | 105 | 106 | /**********/ 107 | /* States */ 108 | /**********/ 109 | 110 | /* append raw data */ 111 | append_covid_estimates state 112 | 113 | /* stringify ids */ 114 | gen tmp = string(lgd_state_id,"%02.0f") 115 | drop lgd_state_id 116 | ren tmp lgd_state_id 117 | 118 | /* same basic assertion */ 119 | distinct lgd_state_id dates, joint 120 | assert `r(ndistinct)' == `r(N)' 121 | 122 | /* minimal cleanup here */ 123 | drop state 124 | order lgd* dates, first 125 | 126 | /* final output for state data */ 127 | drop t_* 128 | save $cdata/pred_data_state, replace 129 | 130 | /* CSV version */ 131 | outsheet using $cdata/pred_data_state.csv, comma replace 132 | -------------------------------------------------------------------------------- /forecasting/b/pull_predicted_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # $1 is credential file location; $2 is the helper script location; $3 is target directory for downloads 4 | GOOGLE_APPLICATION_CREDENTIALS=$1 python3 $2 --dir $3 5 | -------------------------------------------------------------------------------- /forecasting/b/pull_predicted_data_helper.py: -------------------------------------------------------------------------------- 1 | # pulls complete predictions data from Satej's google cloud bucket 2 | from google.cloud import storage 3 | from pathlib import Path 4 | import argparse 5 | 6 | # initialize args 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--dir", type=str) 9 | args = parser.parse_args() 10 | 11 | # set target location 12 | target_path = Path(f'{args.dir}') 13 | 14 | # define name of satej's GC bucket 15 | bucket_name = "daily_pipeline" 16 | 17 | # loop over estimates and download each file 18 | for blob in storage.Client().list_blobs(bucket_name, prefix = "pipeline/est"): 19 | filename = Path(blob.name).name 20 | print(f"{blob.name} -> {filename}") 21 | blob.download_to_filename(target_path / filename) 22 | -------------------------------------------------------------------------------- /forecasting/b/push_public_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # push public covid forecasting partnership data to public dropbox folder. 4 | # note: this only makes sense (1) on Polaris and (2) if you have Rclone configured properly. 
5 | # file link: https://www.dropbox.com/s/cuyn0wj6bsuilwq/merged_data.dta?dl=0 6 | 7 | # zip up state and dist DTAs and CSVs 8 | cd ~/iec/covid/forecasting/ 9 | tar -vczf covid_forecast.tar.gz README.md merged_data_district.dta merged_data_district.csv pred_data_district.dta pred_data_district.csv pred_data_state.dta pred_data_state.csv 10 | cd - 11 | 12 | # push to the public data folder 13 | # this will change to AWS eventually 14 | rclone copy ~/iec/covid/forecasting/covid_forecast.tar.gz my_remote:SamPaul/covid_data/forecasts 15 | printf "finished pushing data to dropbox" 16 | -------------------------------------------------------------------------------- /forecasting/b/push_vector_tileset.py: -------------------------------------------------------------------------------- 1 | # take tippecanoe vector tileset and push to mapbox 2 | # see: https://docs.mapbox.com/api/maps/uploads/ 3 | # this requires mapbox credentials, which are defined in the YAML config for this project 4 | # resulting tileset will have the tileset ID of devdatalab.rural-data-portal in mapbox studio 5 | 6 | import json 7 | import requests 8 | import argparse 9 | import boto3 10 | 11 | # initialize args 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--token", type=str) 14 | parser.add_argument("--file", type=str) 15 | parser.add_argument("--tilesetname", type=str) 16 | args = parser.parse_args() 17 | 18 | 19 | ########################## 20 | # Request S3 Credentials # 21 | ########################## 22 | 23 | # retrieve S3 credentials. mapbox access token must be passed as an argument 24 | params = ( 25 | ('access_token', f'{args.token}'), 26 | ) 27 | response = requests.post('https://api.mapbox.com/uploads/v1/devdatalab/credentials', params=params) 28 | json_data = response.json() if response and response.status_code == 200 else None 29 | 30 | # process the JSON response to pull necessary fields 31 | bucket = json_data['bucket'] 32 | key = json_data['key'] 33 | url = json_data['url'] 34 | accessKeyId = json_data['accessKeyId'] 35 | secretAccessKey = json_data['secretAccessKey'] 36 | sessionToken = json_data['sessionToken'] 37 | key = json_data['key'] 38 | 39 | # define username and vector tileset name 40 | username = 'devdatalab' 41 | tileset_name = f'{args.tilesetname}' 42 | 43 | 44 | ########################## 45 | # Upload to staging area # 46 | ########################## 47 | 48 | # iniatialize AWS session with temp credentials 49 | session = boto3.Session( 50 | aws_access_key_id = accessKeyId, 51 | aws_secret_access_key = secretAccessKey, 52 | aws_session_token = sessionToken, 53 | ) 54 | 55 | # initialize the boto s3 resource 56 | s3 = session.resource('s3') 57 | 58 | # upload file to Mapbox's S3 staging bucket 59 | #aws s3 cp f'{args.token}' s3://{bucket}/{key} --region us-east-1 60 | data = open(f'{args.file}', 'rb') 61 | s3.Bucket(bucket).put_object(Key=key, Body=data) 62 | 63 | 64 | ########################### 65 | # Create upload to Mapbox # 66 | ########################### 67 | 68 | # define the API call and initiate upload 69 | headers = { 70 | 'Content-Type': 'application/json', 71 | 'Cache-Control': 'no-cache', 72 | } 73 | data = '{ "url": ' + f'"{url}"' + ', "tileset": ' + f'"{username}.{tileset_name}"' + ' }' # awkward bc fstrings can't handle literal colons 74 | response = requests.post('https://api.mapbox.com/uploads/v1/devdatalab', headers=headers, params=params, data=data) 75 | 76 | # get upload ID from the response 77 | json_data = response.json() if response and 
response.status_code == 201 else None 78 | upload_id = json_data['id'] 79 | 80 | 81 | ######################### 82 | # Assert against errors # 83 | ######################### 84 | 85 | # check upload status 86 | response = requests.get(f'https://api.mapbox.com/uploads/v1/devdatalab/{upload_id}', params=params) 87 | 88 | # assert there are no errors in the response 89 | json_data = response.json() if response and response.status_code == 200 else None 90 | error = json_data['error'] 91 | assert not error 92 | 93 | -------------------------------------------------------------------------------- /forecasting/b/test_merged_data.py: -------------------------------------------------------------------------------- 1 | # general imports - use spatial env in configs/ 2 | import geopandas as gpd 3 | import pandas as pd 4 | from pathlib import Path 5 | import shutil 6 | 7 | # import our configs 8 | import sys, os 9 | from ddlpy.utils.tools import process_yaml_config 10 | config = process_yaml_config('~/ddl/covid/forecasting/config/config.yaml') 11 | 12 | # shorten path globals 13 | CCODE = Path(os.path.expanduser(config['globals']['ccode'])) 14 | CDATA = Path(os.path.expanduser(config['globals']['cdata'])) 15 | 16 | # read temp directory from env variable 17 | TMP = Path(os.environ['TMP']) 18 | 19 | 20 | ############### 21 | # Merge tests # 22 | ############### 23 | 24 | # combine DDL covid data and UChicago predictions 25 | pred_data = pd.read_stata(CDATA / 'pred_data_district.dta') 26 | ddl_data = pd.read_stata(CDATA / 'ddl_data.dta') 27 | merged_data = pred_data.merge(ddl_data, how='inner', on=['lgd_district_id', 'lgd_state_id']) 28 | 29 | # check merge rate 30 | if (len(merged_data) / len(pred_data)) < 0.98: 31 | raise ValueError('merge rate from DDL data to covid predictions on LGD state / dist must be greater than 98%') 32 | 33 | 34 | ##################### 35 | # Identifiers tests # 36 | ##################### 37 | 38 | # read in the merged data saved by Stata script 39 | merged_data = pd.read_stata(CDATA / 'merged_data_district.dta') 40 | 41 | # assert we're unique on LGD state/dist and time 42 | if not merged_data.set_index(['lgd_d_id','lgd_s_id', 'date']).index.is_unique: 43 | raise ValueError('LGD state and district do not uniquely identify observations across dates') 44 | 45 | # assert no missings in identifiers 46 | idnames = ['lgd_d_id', 'lgd_s_id'] 47 | for idname in idnames: 48 | if not merged_data[idname].isna().sum() == 0: 49 | raise ValueError(f'Identifier {idname} has missings') 50 | 51 | ################### 52 | # Variables tests # 53 | ################### 54 | 55 | # look for missings 56 | varnames = ['rt_pred', 'total_cases', 'new_cases_ts'] 57 | for varname in varnames: 58 | if not merged_data[varname].isna().sum() == 0: 59 | raise ValueError(f'Variable {varname} has missings') 60 | 61 | 62 | ############## 63 | # Dates test # 64 | ############## 65 | 66 | # THIS HAS BEEN OVERRULED - we allow for differential dates now 67 | ## convert to pd datetime format for sorting 68 | #merged_data['date'] = pd.to_datetime(merged_data['date']) 69 | # 70 | ## get latest date observed for RT within each district into an array 71 | #latest_df = merged_data.loc[merged_data.groupby(['lgd_d_id','lgd_s_id']).date.idxmax()] 72 | # 73 | ## assert we only have a single latest date across all dists 74 | #if not len(latest_df['date'].unique()) == 1: 75 | # raise ValueError(f'Different districts have different latest Rt observation dates in merged DTA file') 76 | # 77 | ## pull latest date into a string 78 
| #latest_date = latest_df.iloc[0]['date'].strftime('%Y-%m-%d') 79 | # 80 | ## read in the JSON object in a JS file that contains this "most recent date" metadata to compare to the date in the tabular data 81 | #with open(CDATA / 'pred_metadata.js') as f: 82 | # lines = f.readlines() 83 | #json_date = lines[0].split('most_recent":"',1)[1][:10] 84 | # 85 | ## check that the latest tabular date matches 86 | #if not latest_date == json_date: 87 | # raise ValueError(f'Different latest dates in tabular file and JSON metadata') 88 | 89 | 90 | ################# 91 | # GeoJSON tests # 92 | ################# 93 | 94 | ## read in geojson output that gets transformed to vector tileset 95 | ## hack around gpd.read_file having STRANGE conda-related error when reading from ~/iec/ filesystem?! 96 | #geojson = gpd.read_file(CDATA / 'district.geojson') 97 | # 98 | ## check merged state ids are the same 99 | #geojson['lgd_s_id_x'].equals(geojson['lgd_s_id_y']) 100 | # 101 | ## check that the geojson file also has the same latest date 102 | #geojson['date'] = pd.to_datetime(geojson['date']) 103 | #json_latest = geojson.loc[geojson.groupby(['lgd_d_id','lgd_s_id_x']).date.idxmax()] 104 | #if not len(json_latest['date'].unique()) == 1: 105 | # raise ValueError(f'Different districts have different latest Rt observation dates in geojson file') 106 | # 107 | ## check that the latest date agrees with JS metadata 108 | #latest_date = json_latest.iloc[0]['date'].strftime('%Y-%m-%d') 109 | #if not latest_date == json_date: 110 | # raise ValueError(f'Different latest dates in geojson file and JSON metadata') 111 | 112 | # EXIT 113 | print('TESTS PASSED') 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /forecasting/config/config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # config file for COVID forecasting site 3 | 4 | # define globals shared across Python and Stata 5 | globals: 6 | ccode: ~/ddl/covid/forecasting 7 | cdata: ~/iec/covid/forecasting 8 | tileset_name: covid-forecasting 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /forecasting/config/forecasting.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - defaults 3 | - conda-forge 4 | dependencies: 5 | - python=3 6 | - requests 7 | - boto3 8 | - git=2 9 | - google-cloud-storage 10 | - pyyaml -------------------------------------------------------------------------------- /forecasting/config/forecasting_spatial.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - defaults 4 | dependencies: 5 | - python>=3.7 6 | - geos 7 | - geopandas=0.9 8 | - geotiff=1 9 | - git=2 10 | - haversine=0.4 11 | - matplotlib=3 12 | - numpy=1 13 | - pandas=1 14 | - proj=6 15 | - pygeos=0.8 16 | - pysal=2 17 | - rasterio=1 18 | - rasterstats=0.14 19 | - shapely=1 20 | - pyyaml 21 | - pip 22 | - pip: 23 | - topojson 24 | -------------------------------------------------------------------------------- /forecasting/update_forecasts_cronjob.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # set this up with the following cron command (executes at 1030AM daily): 4 | # $ crontab -e 5 | # $ 30 10 * * * source $HOME/.bashrc; touch $TMP/rerun_indicator.txt; $HOME/ddl/covid/forecasting/update_forecasts_cronjob.sh 6 | 7 | # depends on slack 
messaging hook in env variable SLACKKEY 8 | if [[ -z "$SLACKKEY" ]]; then 9 | printf "\nENV variable SLACKKEY must be defined for cronjob to execute. Add it to your .bashrc\n" 10 | fi 11 | 12 | # send init message via slack 13 | curl -X POST -H 'Content-type: application/json' --data '{"text":":building_construction: Beginning auto-update of COVID forecasting platform"}' https://hooks.slack.com/services/$SLACKKEY 14 | 15 | # change dir to scratch for logging 16 | cd /scratch/`whoami` 17 | 18 | # run update script with basic error handling 19 | printf "\nbegin update build: ~/ddl/covid/forecasting/Snakefile\n" 20 | if snakemake --conda-not-block-search-path-envvars --directory $HOME/ddl/covid/forecasting/ --snakefile $HOME/ddl/covid/forecasting/Snakefile --cores 4 --use-conda; then 21 | 22 | # if we don't have an error, send a slack 23 | curl -X POST -H 'Content-type: application/json' --data '{"text":":not-a-dumpster-fire: Successful update of forecasting data!"}' https://hooks.slack.com/services/$SLACKKEY 24 | else 25 | 26 | # if we do have an error, send a slack 27 | curl -X POST -H 'Content-type: application/json' --data '{"text":":rotating_light: FAILURE: auto-update of COVID data had non-zero exit status"}' https://hooks.slack.com/services/$SLACKKEY 28 | fi 29 | 30 | # move back to starting dir 31 | cd - 32 | -------------------------------------------------------------------------------- /make_covid.do: -------------------------------------------------------------------------------- 1 | /* This makefile runs all the data construction steps in the repo */ 2 | 3 | /* globals that need to be set: 4 | $tmp -- a temporary folder 5 | $ccode -- the root folder for this repo 6 | $covidpub -- processed data used as inputs for COVID variable construction 7 | */ 8 | 9 | global fast 1 10 | 11 | /*****************************/ 12 | /* PART 1 -- DDL SERVER ONLY */ 13 | /*****************************/ 14 | 15 | /* match DLHS4 to PC11 districts */ 16 | /* in: $health/DLHS4, $keys/pc11_district_key. out: $health/DLHS4 */ 17 | do $ccode/b/create_dlhs4_pc11_district_key 18 | 19 | /* collapse raw DLHS4 data to district level */ 20 | /* in: $health/DLHS4, pc11_pca_district. out: $health/hosp/dlhs4_hospitals_dist, $covidpub/dhls4_hospitals_dist */ 21 | do $ccode/b/prep_dlhs4_district 22 | 23 | /* prepare short village/town directory and PCA to save in public repo */ 24 | /* in: TD/VD. out: $covidpub/pc11r_hosp, pc11r_hosp */ 25 | do $ccode/b/prep_hosp_pca_vd 26 | 27 | /* generate demographic data and save in public repo */ 28 | do $ccode/b/gen_lgd_pc11_demographics 29 | 30 | /* prepare EC microdata on hospitals */ 31 | /* in: raw economic census 2013.
out: $covidpub/ec_hosp_microdata */ 32 | do $ccode/b/prep_ec_hosp_microdata 33 | 34 | /* build age distribution by district/subdistrict, using SECC + PC */ 35 | if "$fast" != "1" { 36 | do $ccode/b/gen_age_distribution 37 | } 38 | 39 | /* Process and generate HMIS district data */ 40 | do $core/hmis/b/create_hmis_district_yearly.do 41 | do $core/hmis/b/create_hmis_district_clean.do 42 | do $core/hmis/b/create_hmis_district_keys.do 43 | 44 | /* Process and generate HMIS subdistrict data */ 45 | do $core/hmis/b/create_hmis_subdistrict_yearly.do 46 | do $core/hmis/b/create_hmis_subdistrict_clean.do 47 | do $core/hmis/b/create_hmis_subdistrict_keys.do 48 | 49 | /* download latest district-level case data (runs in py3 conda env) */ 50 | do $ccode/b/get_case_data 51 | 52 | /* build NSS deaths data */ 53 | do $ccode/b/gen_nss_district_key.do 54 | do $ccode/b/prep_nss75.do 55 | 56 | /* copy and process keys */ 57 | do $ccode/b/copy_keys.do 58 | 59 | /* process NFHS data */ 60 | // note: this is not executable (sourced from collaborators) but included for reference 61 | // do $ccode/b/ddl_nfhs_poll_hmis.do 62 | 63 | /***********************************************/ 64 | /* PART 2 -- RUNS FROM DATA LINKED IN GIT REPO */ 65 | /***********************************************/ 66 | 67 | /* aggregate case data into a district file with confirmed + deaths */ 68 | do $ccode/b/aggregate_case_data 69 | 70 | /* prepare PC11 hospital/clinic data */ 71 | do $ccode/b/prep_pc_hosp.do 72 | 73 | /* prepare economic census (2013) hospital data */ 74 | do $ccode/b/prep_ec_hosp.do 75 | 76 | /* clean migration data and transform to LGD */ 77 | do $ccode/b/clean_migration.do 78 | 79 | /* clean agmark mandi price data */ 80 | do $ccode/b/clean_agmark.do 81 | 82 | /* prepare SECC district-level poverty data [unfinished] */ 83 | // do $ccode/b/prep_secc.do 84 | 85 | /* subdistrict-level urbanization */ 86 | // gen_urbanization_subdist -- subdistrict PCA urbanization 87 | 88 | 89 | /***************************************/ 90 | /* PART 3 ANALYTICAL RESULTS/ESTIMATES */ 91 | /***************************************/ 92 | 93 | /* predict district and subdistrict mortality distribution based on age distribution */ 94 | /* out: estimates/(sub)district_age_dist_cfr */ 95 | do $ccode/a/predict_age_cfr 96 | 97 | /* combine PC and DLHS hospital capacity */ 98 | do $ccode/a/estimate_hosp_capacity 99 | 100 | /* export some additional stats that were asked for into a combined file */ 101 | do $ccode/a/impute_additional_fields 102 | 103 | 104 | /*****************************/ 105 | /* PART 4 -- DDL SERVER ONLY */ 106 | /*****************************/ 107 | 108 | /* push data and metadata to production. metadata will be included in 109 | data download links as well. */ 110 | // shell source $ccode/b/push_data.sh 111 | -------------------------------------------------------------------------------- /str/manual_covid_case_district_match.csv: -------------------------------------------------------------------------------- 1 | idm_master,idu_using,_pc11_district_name_master,_pc11_district_name_using 2 | 06-nuh,06-mewat,nuh,mewat 3 | 09-prayagraj,09-allahabad,prayagraj,allahabad 4 | 29-bengaluru,29-bangalore,bengaluru,bangalore 5 | 29-belagavi,29-belgaum,belagavi,belgaum 6 | 16-gomati,16-south tripura,gomati,south tripura 7 | 03-s.a.s. nagar,03-sahibzada ajit singh nagar,s.a.s. nagar,sahibzada ajit singh nagar 8 | 06-gurugram,06-gurgaon,gurugram,gurgaon 9 | 09-shamli,09-muzaffarnagar,shamli,muzaffarnagar 10 | 28-s.p.s.
nellore,28-sri potti sriramulu nellore,s.p.s. nellore,sri potti sriramulu nellore 11 | --------------------------------------------------------------------------------