├── .gitignore
├── LICENSE
├── README.md
├── figures
    ├── clean
    │   ├── figure00_graphical_abstract.eps
    │   ├── figure01_schematic.pdf
    │   ├── figure02_performance.pdf
    │   ├── figure03_CA_fires.pdf
    │   ├── figure04_annual_smokePM.png
    │   ├── figure05_state_trends.pdf
    │   ├── figure06_smokePM_by_covariates.pdf
    │   ├── figure07_extreme_days.pdf
    │   ├── figureS01_hysplit.png
    │   ├── figureS02_smoke_days.png
    │   ├── figureS03_plume_size_by_date.png
    │   ├── figureS04_hms_missing_dates.pdf
    │   ├── figureS05_station_timeseries.png
    │   ├── figureS06_improve_csn_speciated.png
    │   ├── figureS07_aod_training_locations.png
    │   ├── figureS08_AOD_performance.png
    │   ├── figureS09_grid_to_station.pdf
    │   ├── figureS10_hysplit_aod_comparison.png
    │   ├── figureS11_station_modis_landcover.png
    │   ├── figureS12_poor_performance_stations.png
    │   ├── figureS13_fire_smoke_dynamics.png
    │   ├── figureS14_burned_area_trend.png
    │   ├── figureS15_annual_fires.png
    │   ├── figureS16_state_fires.png
    │   ├── figureS17_maps_avgs5yr_labeled.png
    │   └── figureS18_PurpleAir_comparison.png
    └── raw
    │   ├── figure00a.png
    │   ├── figure00b.png
    │   ├── figure01a.png
    │   ├── figure01b.png
    │   ├── figure01c.png
    │   ├── figure01d.png
    │   ├── figure02a.png
    │   ├── figure02b.png
    │   ├── figure02c_histogram.png
    │   ├── figure02c_map.png
    │   ├── figure02d.png
    │   ├── figure03a.png
    │   ├── figure03b-d_time_series.png
    │   ├── figure03b_map.png
    │   ├── figure03c_map.png
    │   ├── figure03d_map.png
    │   ├── figure04.png
    │   ├── figure05a.png
    │   ├── figure05b_map.png
    │   ├── figure05b_time_series.png
    │   ├── figure06a-d_maps.png
    │   ├── figure06e.png
    │   ├── figure07a-b.png
    │   ├── figure07c.png
    │   ├── figureS01a.png
    │   ├── figureS01b.png
    │   ├── figureS01c.png
    │   ├── figureS02a.png
    │   ├── figureS02b.png
    │   ├── figureS02c.jpg
    │   ├── figureS02d.png
    │   ├── figureS02e.png
    │   ├── figureS02f.png
    │   ├── figureS03.png
    │   ├── figureS04.pdf
    │   ├── figureS05.png
    │   ├── figureS06a.png
    │   ├── figureS06b.png
    │   ├── figureS07.png
    │   ├── figureS08a.png
    │   ├── figureS08b.png
    │   ├── figureS09.pdf
    │   ├── figureS10.png
    │   ├── figureS11.png
    │   ├── figureS12a.png
    │   ├── figureS12b.png
    │   ├── figureS12c.png
    │   ├── figureS13.png
    │   ├── figureS14.png
    │   ├── figureS15.png
    │   ├── figureS16.png
    │   ├── figureS17.png
    │   └── figureS18.png
├── scripts
    ├── main
    │   ├── 01_01_train_AOD_Sherlock.R
    │   ├── 01_02_predict_AOD_Sherlock.R
    │   ├── 01_02_predict_AOD_Sherlock.sh
    │   ├── 02_01_define_smokePM_training_jobs.R
    │   ├── 02_02_smokePM_training.sh
    │   ├── 02_02_train_smokePM_Sherlock.R
    │   ├── 02_03_predict_smokePM_Sherlock.R
    │   ├── 02_03_predict_smokePM_Sherlock.sh
    │   ├── 02_04_combine_preds.R
    │   ├── 02_05_finalize_10km_grid.R
    │   ├── 02_06_gridded_predictions_to_county.R
    │   ├── 02_07_gridded_predictions_to_zip.R
    │   ├── 02_08_gridded_predictions_to_tract.R
    │   ├── 02_09_finalize_preds.R
    │   ├── 03_01_figure00.R
    │   ├── 03_02_figure01.R
    │   ├── 03_03_figure02.R
    │   ├── 03_04_figure03.R
    │   ├── 03_05_figure04.R
    │   ├── 03_06_figure05.R
    │   ├── 03_07_figure06.R
    │   ├── 03_08_figure07.R
    │   ├── 04_01_manuscript_calculations.R
    │   ├── 05_01_figureS01.R
    │   ├── 05_02_figureS02.R
    │   ├── 05_03_figureS03.R
    │   ├── 05_04_figureS04.R
    │   ├── 05_05_figureS05.R
    │   ├── 05_06_figureS06.R
    │   ├── 05_07_figureS07.R
    │   ├── 05_08_figureS08.R
    │   ├── 05_09_figureS09.R
    │   ├── 05_10_figureS10.R
    │   ├── 05_11_figureS11.R
    │   ├── 05_12_figureS12.R
    │   ├── 05_13_figureS13.R
    │   ├── 05_14_figureS14.R
    │   ├── 05_15_figureS15.R
    │   ├── 05_16_figureS16.R
    │   ├── 05_17_figureS17.R
    │   ├── 05_18_figureS18.R
    │   ├── 06_01_tableS01.R
    │   └── 06_02_tableS04.R
    └── setup
    │   ├── 00_01_load_packages.R
    │   ├── 00_02_load_functions.R
    │   └── 00_03_load_settings.R
└── tables
    ├── clean
        ├── tableS01_distance_to_plume_effects.tex
        ├── tableS02_anomAOD_model_inputs.tex
        ├── tableS03_smokePM_model_inputs.tex
        └── tableS04_model_performance.tex
    └── raw
        └── tableS01.tex


/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | 
 8 | # User-specific files
 9 | .Ruserdata
10 | 
11 | # Example code in package build process
12 | *-Ex.R
13 | 
14 | # Output files from R CMD build
15 | /*.tar.gz
16 | 
17 | # Output files from R CMD check
18 | /*.Rcheck/
19 | 
20 | # RStudio files
21 | .Rproj.user/
22 | 
23 | # produced vignettes
24 | vignettes/*.html
25 | vignettes/*.pdf
26 | 
27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
28 | .httr-oauth
29 | 
30 | # knitr and R markdown default cache directories
31 | *_cache/
32 | /cache/
33 | 
34 | # Temporary files created by R markdown
35 | *.utf8.md
36 | *.knit.md
37 | 
38 | # R Environment Variables
39 | .Renviron
40 | 
41 | # Other
42 | .DS_Store


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 ECHOLab: Environmental Change and Human Outcomes
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # daily-10km-smokePM
  2 | Repo supporting [Childs et al 2022 "Daily local-level estimates of ambient wildfire smoke PM<sub>2.5</sub> for the contiguous US"](https://doi.org/10.1021/acs.est.2c02934).
  3 | 
  4 | Results from the paper are in the `figures/clean` and `tables/clean` folders. Code to replicate results are in the `scripts` folder. Data, models, and predictions are in [Dropbox](https://www.dropbox.com/sh/e7m3313fb7sqxui/AABGu-jUO3Ps2isGHbB2EGfAa?dl=0).
  5 | 
  6 | ## Final predictions
  7 | Daily smoke PM<sub>2.5</sub> predictions from Jan 1, 2006 to Dec 31, 2020 for the contiguous US can be [downloaded](https://www.dropbox.com/sh/16bwdnfbakvuf3x/AABlnrek080Qu9YnbkLkUk8ha?st=a2oli9on&dl=0) at the following spatial scales:
  8 | * [10 km grid](https://www.dropbox.com/sh/9mbcxy65crd5cex/AAAk2_1QE6A5rAS7M5We-iapa?st=f8gglo0m&dl=0)
  9 | * [County](https://www.dropbox.com/sh/tze93uz29lzyr0h/AABulMjC_l-b5YssncaK5u7ha?st=tujqcqjp&dl=0)
 10 | * [ZCTA5](https://www.dropbox.com/sh/6ynye9u1vl8idbj/AAAAjn61JpxEoyfbESsbQY9Ua?st=9kfa7mxr&dl=0)
 11 | * [Census tract](https://www.dropbox.com/sh/uw81pu5eh6o4q5v/AACZJplCr-EiLruAIFn0nm5Na?st=mz7gp2su&dl=0)
 12 | 
 13 | Data download is also available through [Harvard Dataverse](https://doi.org/10.7910/DVN/DJVMTV).
 14 | 
 15 | ### Descriptions of final prediction files
 16 | 10 km grid
 17 | * `10km_grid/10km_grid_wgs84/`: this is a folder that contains the shapefile for the 10 km grid.
 18 | * `10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.rds`: this is a file that contains a data frame with the final set of daily smoke PM2.5 predictions on smoke days at 10 km resolution from January 1, 2006 to December 31, 2020 for the contiguous US. The `grid_id_10km` column in this file corresponds to the `ID` column in the 10 km grid shapefile. All rows in this file are predictions on smoke days. Predictions on non-smoke days are by construction 0 $\mu g/m^3$ and not included in this file. A smoke PM2.5 prediction of 0 in this file means that the grid cell-day did have a smoke day but did not have elevated PM2.5. The full set of smoke PM2.5 predictions on both smoke days and non-smoke days can be obtained by setting the smoke PM2.5 prediction to 0 on grid cell-days in the 10 km grid and in the January 1, 2006-December 31, 2020 date range that are not in this file. For example, the R code below returns the full set of smoke PM2.5 predictions:
 19 | 
 20 | ```
 21 | library(lubridate)
 22 | library(sf)
 23 | library(dplyr)
 24 | library(tidyr)
 25 | 
 26 | # Load smokePM predictions on smoke days
 27 | preds = readRDS("./final/10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.rds")
 28 | 
 29 | # Load 10 km grid
 30 | grid_10km = read_sf("./final/10km_grid/10km_grid_wgs84/10km_grid_wgs84.shp")
 31 | 
 32 | # Load full set of dates
 33 | dates = seq.Date(ymd("20060101"), ymd("20201231"), by = "day")
 34 | 
 35 | # Get full combination of grid cell-days
 36 | # Warning: this may require a large amount of memory
 37 | out = expand.grid(grid_id_10km = grid_10km$ID, date = dates)
 38 | 
 39 | # Match smokePM predictions on smoke days to grid cell-days
 40 | out = left_join(out, preds, by = c("grid_id_10km", "date"))
 41 | 
 42 | # Predict 0 for remaining grid cell-days, which are non-smoke days
 43 | out = mutate(out, smokePM_pred = replace_na(smokePM_pred, 0))
 44 | ```
 45 | 
 46 | * `10km_grid/smokePM2pt5_predictions_daily_10km_20060101-20201231.csv`: this is the same as `smokePM2pt5_predictions_daily_10km_20060101-20201231.rds`, except it is saved as a CSV file.
 47 | 
 48 | County
 49 | * `county/tl_2019_us_county/`: this is a folder that contains the shapefile for CONUS counties in 2019. Files were downloaded from the US Census Bureau TIGER/Line Shapefiles [website](https://www.census.gov/cgi-bin/geo/shapefiles/index.php). R users may also use the `tigris` package. This shapefile includes only counties within the spatial domain over which smoke PM2.5 predictions are made.
 50 | 
 51 | * `county/smokePM2pt5_predictions_daily_county_20060101-20201231.rds`: this is a file that contains a data frame with the final set of daily smoke PM2.5 predictions on smoke days at the county level from January 1, 2006 to December 31, 2020 for the contiguous US. County-level smoke PM2.5 predictions are aggregated from smoke PM2.5 predictions at the 10 km resolution using population and area of intersection-weighted averaging (see `scripts/main/02_06_gridded_predictions_to_county.R`). The `GEOID` column in this file corresponds to the `GEOID` column in the county shapefile. All rows in this file are predictions on smoke days. Predictions on non-smoke days are by construction 0 $\mu g/m^3$ and not included in this file. A smoke PM2.5 prediction of 0 in this file means that the county-day did have a smoke day but did not have elevated PM2.5. The full set of smoke PM2.5 predictions on both smoke days and non-smoke days can be obtained by setting the smoke PM2.5 prediction to 0 on county-days in the counties and in the January 1, 2006-December 31, 2020 date range that are not in this file. For example, the R code below returns the full set of smoke PM2.5 predictions:
 52 | 
 53 | ```
 54 | library(lubridate)
 55 | library(sf)
 56 | library(dplyr)
 57 | library(tidyr)
 58 | 
 59 | # Load smokePM predictions on smoke days
 60 | preds = readRDS("./final/county/smokePM2pt5_predictions_daily_county_20060101-20201231.rds")
 61 | 
 62 | # Load counties
 63 | counties = read_sf("./final/county/tl_2019_us_county")
 64 | 
 65 | # Load full set of dates
 66 | dates = seq.Date(ymd("20060101"), ymd("20201231"), by = "day")
 67 | 
 68 | # Get full combination of county-days
 69 | # Warning: this may require a large amount of memory
 70 | out = expand.grid(GEOID = counties$GEOID, date = dates)
 71 | 
 72 | # Match smokePM predictions on smoke days to county-days
 73 | out = left_join(out, preds, by = c("GEOID", "date"))
 74 | 
 75 | # Predict 0 for remaining county-days, which are non-smoke days
 76 | out = mutate(out, smokePM_pred = replace_na(smokePM_pred, 0))
 77 | ```
 78 | * `county/smokePM2pt5_predictions_daily_county_20060101-20201231.csv`: this is the same as `smokePM2pt5_predictions_daily_county_20060101-20201231.rds`, except it is saved as a CSV file.
 79 | 
 80 | ZIP Code Tabulation Area (ZCTA5)
 81 | * `zcta/tl_2019_us_zcta510/`: ths is a folder that contains the shapefile for CONUS zip code tabulation areas in 2019. Files were downloaded from the US Census Bureau TIGER/Line Shapefiles [website](https://www.census.gov/cgi-bin/geo/shapefiles/index.php). R users may also use the `tigris` package. This shapefile includes only ZCTAs within the spatial domain over which smoke PM2.5 predictions are made.
 82 | 
 83 | * `zcta/smokePM2pt5_predictions_daily_zcta_20060101-20201231.rds`: this is a file that contains a data frame with the final set of daily smoke PM2.5 predictions on smoke days at the ZCTA5 level from January 1, 2006 to December 31, 2020 for the contiguous US. ZCTA-level smoke PM2.5 predictions are aggregated from smoke PM2.5 predictions at the 10 km resolution using population and area of intersection-weighted averaging (see `scripts/main/02_07_gridded_predictions_to_zip.R`).The `GEOID10` column in this file corresponds to the `GEOID10` column in the ZCTA shapefile. All rows in this file are predictions on smoke days. Predictions on non-smoke days are by construction 0 $\mu g/m^3$ and not included in this file. A smoke PM2.5 prediction of 0 in this file means that the ZCTA-day did have a smoke day but did not have elevated PM2.5. The full set of smoke PM2.5 predictions on both smoke days and non-smoke days can be obtained by setting the smoke PM2.5 prediction to 0 on ZCTA-days in the ZCTAs and in the January 1, 2006-December 31, 2020 date range that are not in this file. For example, the R code below returns the full set of smoke PM2.5 predictions:
 84 | 
 85 | ```
 86 | library(lubridate)
 87 | library(sf)
 88 | library(dplyr)
 89 | library(tidyr)
 90 | 
 91 | # Load smokePM predictions on smoke days
 92 | preds = readRDS("./final/zcta/smokePM2pt5_predictions_daily_zcta_20060101-20201231.rds")
 93 | 
 94 | # Load ZCTAs
 95 | zctas = read_sf("./final/zcta/tl_2019_us_zcta510")
 96 | 
 97 | # Load full set of dates
 98 | dates = seq.Date(ymd("20060101"), ymd("20201231"), by = "day")
 99 | 
100 | # Get full combination of ZCTA-days
101 | # Warning: this may require a large amount of memory
102 | out = expand.grid(GEOID10 = zctas$GEOID10, date = dates)
103 | 
104 | # Match smokePM predictions on smoke days to ZCTA-days
105 | out = left_join(out, preds, by = c("GEOID10", "date"))
106 | 
107 | # Predict 0 for remaining ZCTA-days, which are non-smoke days
108 | out = mutate(out, smokePM_pred = replace_na(smokePM_pred, 0))
109 | ```
110 | 
111 | * `zcta/smokePM2pt5_predictions_daily_zcta_20060101-20201231.csv`: this is the same as `smokePM2pt5_predictions_daily_zcta_20060101-20201231.rds`, except it is saved as a CSV file.
112 | 
113 | Census tract
114 | * `tract/tracts/`: this is a folder that contains the shapefiles for CONUS census tracts by state/territory in 2019. Files were downloaded from the US Census Bureau TIGER/Line Shapefiles [website](https://www.census.gov/cgi-bin/geo/shapefiles/index.php). R users may also use the `tigris` package. This shapefile includes only tracts within the spatial domain over which smoke PM2.5 predictions are made.
115 | 
116 | * `tract/smokePM2pt5_predictions_daily_tract_20060101-20201231.rds`: this is a file that contains a data frame with the final set of daily smoke PM2.5 predictions on smoke days at the tract level from January 1, 2006 to December 31, 2020 for the contiguous US. Tract-level smoke PM2.5 predictions are aggregated from smoke PM2.5 predictions at the 10 km resolution using population and area of intersection-weighted averaging (see `scripts/main/02_08_gridded_predictions_to_tract.R`). The `GEOID` column in this file corresponds to the `GEOID` column in the tract shapefiles. All rows in this file are predictions on smoke days. Predictions on non-smoke days are by construction 0 $\mu g/m^3$ and not included in this file. A smoke PM2.5 prediction of 0 in this file means that the tract-day did have a smoke day but did not have elevated PM2.5. The full set of smoke PM2.5 predictions on both smoke days and non-smoke days can be obtained by setting the smoke PM2.5 prediction to 0 on tract-days in the tracts and in the January 1, 2006-December 31, 2020 date range that are not in this file. For example, the R code below returns the full set of smoke PM2.5 predictions:
117 | 
118 | ```
119 | library(lubridate)
120 | library(sf)
121 | library(dplyr)
122 | library(tidyr)
123 | 
124 | # Load smokePM predictions on smoke days
125 | preds = readRDS("./final/tract/smokePM2pt5_predictions_daily_tract_20060101-20201231.rds")
126 | 
127 | # Load tracts
128 | tracts = list.files("./final/tract/tracts", full.names = T, pattern = "\\.shp$")
129 | tracts = lapply(tracts, read_sf)
130 | tracts = bind_rows(tracts)
131 | 
132 | # Load full set of dates
133 | dates = seq.Date(ymd("20060101"), ymd("20201231"), by = "day")
134 | 
135 | # Get full combination of tract-days
136 | # Warning: this may require a large amount of memory
137 | out = expand.grid(GEOID = tracts$GEOID, date = dates)
138 | 
139 | # Match smokePM predictions on smoke days to tract-days
140 | out = left_join(out, preds, by = c("GEOID", "date"))
141 | 
142 | # Predict 0 for remaining tract-days, which are non-smoke days
143 | out = mutate(out, smokePM_pred = replace_na(smokePM_pred, 0))
144 | ```
145 | 
146 | * `tract/smokePM2pt5_predictions_daily_tract_20060101-20201231.csv`: this is the same as `smokePM2pt5_predictions_daily_tract_20060101-20201231.rds`, except it is saved as a CSV file.
147 | 
148 | ## How to replicate results
149 | 1. Download this repository.
150 | 2. Download the [Dropbox](https://www.dropbox.com/sh/e7m3313fb7sqxui/AABGu-jUO3Ps2isGHbB2EGfAa?dl=0) folder. Place files downloaded from Dropbox in the same folder as the downloaded GitHub repository.
151 | 3. Change settings in `scripts/setup/00_03_load_settings.R`:
152 |     1. Set `gee_email` to your Google Earth Engine email.
153 |     2. Set `key` to the value of your US Census Bureau API Key (which can be requested [here](https://api.census.gov/data/key_signup.html)).
154 |     3. Set `num_cores` to the number of cores to use in parallel computing.
155 |     4. Set `path_dropbox` to the location of the data downloaded from Dropbox.
156 |     5. Set `path_github` to the location of this downloaded repository's root.
157 | 4. Install packages listed in `scripts/setup/00_01_load_packages.R`.
158 | 5. Set working directory to this downloaded repository's root.
159 | 6. Run scripts in `scripts/main`. Some scripts may require relatively large computer memory.
160 | 


--------------------------------------------------------------------------------
/figures/clean/figure01_schematic.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figure01_schematic.pdf


--------------------------------------------------------------------------------
/figures/clean/figure02_performance.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figure02_performance.pdf


--------------------------------------------------------------------------------
/figures/clean/figure03_CA_fires.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figure03_CA_fires.pdf


--------------------------------------------------------------------------------
/figures/clean/figure04_annual_smokePM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figure04_annual_smokePM.png


--------------------------------------------------------------------------------
/figures/clean/figure05_state_trends.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figure05_state_trends.pdf


--------------------------------------------------------------------------------
/figures/clean/figure06_smokePM_by_covariates.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figure06_smokePM_by_covariates.pdf


--------------------------------------------------------------------------------
/figures/clean/figure07_extreme_days.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figure07_extreme_days.pdf


--------------------------------------------------------------------------------
/figures/clean/figureS01_hysplit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS01_hysplit.png


--------------------------------------------------------------------------------
/figures/clean/figureS02_smoke_days.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS02_smoke_days.png


--------------------------------------------------------------------------------
/figures/clean/figureS03_plume_size_by_date.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS03_plume_size_by_date.png


--------------------------------------------------------------------------------
/figures/clean/figureS04_hms_missing_dates.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS04_hms_missing_dates.pdf


--------------------------------------------------------------------------------
/figures/clean/figureS05_station_timeseries.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS05_station_timeseries.png


--------------------------------------------------------------------------------
/figures/clean/figureS06_improve_csn_speciated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS06_improve_csn_speciated.png


--------------------------------------------------------------------------------
/figures/clean/figureS07_aod_training_locations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS07_aod_training_locations.png


--------------------------------------------------------------------------------
/figures/clean/figureS08_AOD_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS08_AOD_performance.png


--------------------------------------------------------------------------------
/figures/clean/figureS09_grid_to_station.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS09_grid_to_station.pdf


--------------------------------------------------------------------------------
/figures/clean/figureS10_hysplit_aod_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS10_hysplit_aod_comparison.png


--------------------------------------------------------------------------------
/figures/clean/figureS11_station_modis_landcover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS11_station_modis_landcover.png


--------------------------------------------------------------------------------
/figures/clean/figureS12_poor_performance_stations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS12_poor_performance_stations.png


--------------------------------------------------------------------------------
/figures/clean/figureS13_fire_smoke_dynamics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS13_fire_smoke_dynamics.png


--------------------------------------------------------------------------------
/figures/clean/figureS14_burned_area_trend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS14_burned_area_trend.png


--------------------------------------------------------------------------------
/figures/clean/figureS15_annual_fires.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS15_annual_fires.png


--------------------------------------------------------------------------------
/figures/clean/figureS16_state_fires.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS16_state_fires.png


--------------------------------------------------------------------------------
/figures/clean/figureS17_maps_avgs5yr_labeled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS17_maps_avgs5yr_labeled.png


--------------------------------------------------------------------------------
/figures/clean/figureS18_PurpleAir_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/clean/figureS18_PurpleAir_comparison.png


--------------------------------------------------------------------------------
/figures/raw/figure00a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure00a.png


--------------------------------------------------------------------------------
/figures/raw/figure00b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure00b.png


--------------------------------------------------------------------------------
/figures/raw/figure01a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure01a.png


--------------------------------------------------------------------------------
/figures/raw/figure01b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure01b.png


--------------------------------------------------------------------------------
/figures/raw/figure01c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure01c.png


--------------------------------------------------------------------------------
/figures/raw/figure01d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure01d.png


--------------------------------------------------------------------------------
/figures/raw/figure02a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure02a.png


--------------------------------------------------------------------------------
/figures/raw/figure02b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure02b.png


--------------------------------------------------------------------------------
/figures/raw/figure02c_histogram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure02c_histogram.png


--------------------------------------------------------------------------------
/figures/raw/figure02c_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure02c_map.png


--------------------------------------------------------------------------------
/figures/raw/figure02d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure02d.png


--------------------------------------------------------------------------------
/figures/raw/figure03a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure03a.png


--------------------------------------------------------------------------------
/figures/raw/figure03b-d_time_series.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure03b-d_time_series.png


--------------------------------------------------------------------------------
/figures/raw/figure03b_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure03b_map.png


--------------------------------------------------------------------------------
/figures/raw/figure03c_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure03c_map.png


--------------------------------------------------------------------------------
/figures/raw/figure03d_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure03d_map.png


--------------------------------------------------------------------------------
/figures/raw/figure04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure04.png


--------------------------------------------------------------------------------
/figures/raw/figure05a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure05a.png


--------------------------------------------------------------------------------
/figures/raw/figure05b_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure05b_map.png


--------------------------------------------------------------------------------
/figures/raw/figure05b_time_series.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure05b_time_series.png


--------------------------------------------------------------------------------
/figures/raw/figure06a-d_maps.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure06a-d_maps.png


--------------------------------------------------------------------------------
/figures/raw/figure06e.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure06e.png


--------------------------------------------------------------------------------
/figures/raw/figure07a-b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure07a-b.png


--------------------------------------------------------------------------------
/figures/raw/figure07c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figure07c.png


--------------------------------------------------------------------------------
/figures/raw/figureS01a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS01a.png


--------------------------------------------------------------------------------
/figures/raw/figureS01b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS01b.png


--------------------------------------------------------------------------------
/figures/raw/figureS01c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS01c.png


--------------------------------------------------------------------------------
/figures/raw/figureS02a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS02a.png


--------------------------------------------------------------------------------
/figures/raw/figureS02b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS02b.png


--------------------------------------------------------------------------------
/figures/raw/figureS02c.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS02c.jpg


--------------------------------------------------------------------------------
/figures/raw/figureS02d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS02d.png


--------------------------------------------------------------------------------
/figures/raw/figureS02e.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS02e.png


--------------------------------------------------------------------------------
/figures/raw/figureS02f.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS02f.png


--------------------------------------------------------------------------------
/figures/raw/figureS03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS03.png


--------------------------------------------------------------------------------
/figures/raw/figureS04.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS04.pdf


--------------------------------------------------------------------------------
/figures/raw/figureS05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS05.png


--------------------------------------------------------------------------------
/figures/raw/figureS06a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS06a.png


--------------------------------------------------------------------------------
/figures/raw/figureS06b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS06b.png


--------------------------------------------------------------------------------
/figures/raw/figureS07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS07.png


--------------------------------------------------------------------------------
/figures/raw/figureS08a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS08a.png


--------------------------------------------------------------------------------
/figures/raw/figureS08b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS08b.png


--------------------------------------------------------------------------------
/figures/raw/figureS09.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS09.pdf


--------------------------------------------------------------------------------
/figures/raw/figureS10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS10.png


--------------------------------------------------------------------------------
/figures/raw/figureS11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS11.png


--------------------------------------------------------------------------------
/figures/raw/figureS12a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS12a.png


--------------------------------------------------------------------------------
/figures/raw/figureS12b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS12b.png


--------------------------------------------------------------------------------
/figures/raw/figureS12c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS12c.png


--------------------------------------------------------------------------------
/figures/raw/figureS13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS13.png


--------------------------------------------------------------------------------
/figures/raw/figureS14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS14.png


--------------------------------------------------------------------------------
/figures/raw/figureS15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS15.png


--------------------------------------------------------------------------------
/figures/raw/figureS16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS16.png


--------------------------------------------------------------------------------
/figures/raw/figureS17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS17.png


--------------------------------------------------------------------------------
/figures/raw/figureS18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/echolab-stanford/daily-10km-smokePM/26d52f20048353b2dd4c394715b5df7e9a558bcc/figures/raw/figureS18.png


--------------------------------------------------------------------------------
/scripts/main/01_01_train_AOD_Sherlock.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Trains anomalous AOD model.
  8 | # ------------------------------------------------------------------------------
  9 | txt_progress_file <- file.path(path_output, "anomAOD", "model", "aod_xgb_progress.txt")
 10 | max_xgb_rounds <- 10000
 11 | bayes_opt_n_init <- 24
 12 | bayes_opt_n_iter <- 24
 13 | 
 14 | if (Sys.getenv('SLURM_JOB_ID') != ""){
 15 |   usable.cores <- Sys.getenv("SLURM_NTASKS_PER_NODE")
 16 | } else {
 17 |   usable.cores <- 2
 18 | }
 19 | 
 20 | mod_data <- readRDS(file.path(path_data, "4_clean", "aod_training.rds"))
 21 | 
 22 | xgb_opt_fun <- function(eta, gamma, max_depth, subsample, colsample_bytree, 
 23 |                         min_child_weight, xgb_mat, cv_ind = NULL, 
 24 |                         rounds_max = 100, nfold = 4, nthread = 2,
 25 |                         progress_file = ""){
 26 |   set.seed(10001)
 27 |   mod_xgb_cv <- xgb.cv(
 28 |     params = list(booster = "gbtree",
 29 |                   eta = eta,
 30 |                   gamma = gamma,
 31 |                   max_depth = max_depth,
 32 |                   subsample = subsample,
 33 |                   colsample_bytree = colsample_bytree,
 34 |                   min_child_weight = min_child_weight, 
 35 |                   objective = "reg:squarederror",
 36 |                   eval_metric = "rmse"),
 37 |     data = xgb_mat,
 38 |     nrounds = rounds_max,
 39 |     nthread = nthread,
 40 |     nfold = nfold,
 41 |     folds = cv_ind,
 42 |     early_stopping_rounds = 10,
 43 |     print_every_n = 20,
 44 |     verbose = TRUE)
 45 |   
 46 |   if(progress_file != ""){
 47 |     write(paste0("nrounds = ", mod_xgb_cv$best_iteration, 
 48 |                  ", eta = ", eta, 
 49 |                  ", gamma = ", gamma, 
 50 |                  ", max_depth = ", max_depth, 
 51 |                  ", colsample_bytree = ", colsample_bytree, 
 52 |                  ", subsample = ", subsample, 
 53 |                  ", min_child_weight = ", min_child_weight, 
 54 |                  ", RMSE = ", mod_xgb_cv$evaluation_log %>% 
 55 |                    filter(iter == mod_xgb_cv$best_iteration) %>% 
 56 |                    pull(test_rmse_mean),
 57 |                  "\n"),
 58 |           file = progress_file,
 59 |           append = T)}
 60 |   list(Score = mod_xgb_cv$evaluation_log %>% 
 61 |          filter(iter == mod_xgb_cv$best_iteration) %>% 
 62 |          pull(test_rmse_mean) %>% 
 63 |          multiply_by(-1), 
 64 |        Pred = mod_xgb_cv$best_iteration) %>% 
 65 |     return
 66 | }
 67 | 
 68 | xgb_train_mat <- xgb.DMatrix(
 69 |   data = model.matrix.lm(~.-1, 
 70 |                          data = mod_data %>% 
 71 |                            mutate(month = as.factor(month)) %>%
 72 |                            select(month, lat, lon, aot_anom, 
 73 |                                   aot_anom_lag1, aot_anom_lag2, aot_anom_lag3, 
 74 |                                   fire_dist_km, closest_fire_area, closest_fire_num_points,
 75 |                                   pbl_min, pbl_max, pbl_mean,
 76 |                                   wind_u, wind_v, 
 77 |                                   dewpoint_temp_2m, temp_2m,  
 78 |                                   sea_level_pressure, surface_pressure, precip, 
 79 |                                   elevation_mean, elevation_stdDev, 
 80 |                                   developed, barren, forest, shrubland, cultivated, 
 81 |                                   wetlands, herbaceous, water),
 82 |                          na.action = "na.pass"), 
 83 |   label = mod_data %>% 
 84 |     pull(aod_anom))
 85 | 
 86 | # define folds
 87 | cv_folds = mod_data %>% 
 88 |   mutate(row_no = row_number()) %>% 
 89 |   group_by(fold) %>% 
 90 |   summarise(row_nos = list(row_no)) %>% 
 91 |   pull(row_nos) 
 92 | 
 93 | 
 94 | # run bayesian optimization to find the best hyperparameters
 95 | write("", txt_progress_file)
 96 | 
 97 | tic <- Sys.time()
 98 | print(paste0("Starting bayesian optimization at ", tic))
 99 | set.seed(10001)
100 | bayes_opt_params <- BayesianOptimization(
101 |   function(eta, gamma, max_depth, subsample,
102 |            colsample_bytree, min_child_weight){
103 |     xgb_opt_fun(eta, gamma, max_depth, subsample, 
104 |                 colsample_bytree, min_child_weight, 
105 |                 xgb_train_mat, cv_ind = cv_folds, 
106 |                 rounds_max = max_xgb_rounds, nfold = length(cv_folds), 
107 |                 nthread = usable.cores, 
108 |                 progress_file = txt_progress_file)},
109 |   bounds = list(
110 |     eta = c(0.0001, 0.2),
111 |     gamma = c(0, 100),
112 |     max_depth = c(2L, 50L),
113 |     colsample_bytree = c(0.5, 1),
114 |     subsample = c(0.25, 1),
115 |     min_child_weight = c(1L, 50L)),
116 |   init_points = bayes_opt_n_init, # heuristic on how many points? 
117 |   n_iter = bayes_opt_n_iter)
118 | toc <- Sys.time()
119 | print(paste0("bayesian optimization completed at ", toc))
120 | print(toc - tic)
121 | 
122 | mod_gb_final <- xgb.train(
123 |   params = c(as.list(bayes_opt_params$Best_Par),
124 |              booster = "gbtree",
125 |              objective = "reg:squarederror",
126 |              eval_metric = "rmse"),
127 |   data = xgb_train_mat,
128 |   nrounds = bayes_opt_params$History %>%
129 |     filter(Value == bayes_opt_params$Best_Value) %>%
130 |     pull(Round) %>%
131 |     magrittr::extract(unlist(bayes_opt_params$Pred), .),
132 |   verbose = 1)
133 | 
134 | saveRDS(bayes_opt_params, file.path(path_output, "anomAOD", "model", "aod_bayes_opt_params.rds"))
135 | xgb.save(mod_gb_final, file.path(path_output, "anomAOD", "model", "aod_mod.xgb"))
136 | 
137 | # save variable importance information 
138 | feat_names <- model.matrix.lm(~.-1, 
139 |                               data = mod_data %>% 
140 |                                 mutate(month = as.factor(month)) %>%
141 |                                 select(month, lat, lon, aot_anom, 
142 |                                        aot_anom_lag1, aot_anom_lag2, aot_anom_lag3, 
143 |                                        fire_dist_km, closest_fire_area, closest_fire_num_points,
144 |                                        pbl_min, pbl_max, pbl_mean,
145 |                                        wind_u, wind_v, 
146 |                                        dewpoint_temp_2m, temp_2m,  
147 |                                        sea_level_pressure, surface_pressure, precip, 
148 |                                        elevation_mean, elevation_stdDev, 
149 |                                        developed, barren, forest, shrubland, cultivated, 
150 |                                        wetlands, herbaceous, water),
151 |                               na.action = "na.pass") %>% colnames
152 | 
153 | var_import <- xgb.importance(model = mod_gb_final)
154 | 
155 | saveRDS(list(variable_importance = var_import, 
156 |              feature_names = feat_names), 
157 |         file.path(path_output, "anomAOD", "model", "AOD_var_importance.rds"))
158 | 


--------------------------------------------------------------------------------
/scripts/main/01_02_predict_AOD_Sherlock.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | #SBATCH --job-name=predict_AOD
 4 | #
 5 | #SBATCH --partition=serc
 6 | #SBATCH --nodes=1
 7 | #SBATCH --ntasks-per-node=1
 8 | #SBATCH --cpus-per-task=33
 9 | #SBATCH --time=1-06:00:00
10 | #SBATCH --mem-per-cpu=20G
11 | #SBATCH --output=predict_AOD.log
12 | #SBATCH --mail-type=ALL
13 | 
14 | # load modules
15 | ml physics gdal/2.2.1 udunits proj/4.9.3 geos 
16 | ml R/4.0.2
17 | 
18 | # execute script
19 | Rscript scripts/main/01_02_predict_AOD_Sherlock.R
20 | 


--------------------------------------------------------------------------------
/scripts/main/02_01_define_smokePM_training_jobs.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | #-------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Defines smoke PM2.5 training jobs.
 8 | #-------------------------------------------------------------------------------
 9 | # csv with jobs to submit 
10 | expand.grid(cv_fold_num = c(0:4,99),
11 |             drop_vars = c("", "traj_points", "aod_anom_pred")) %>%
12 |   write.table(file.path(path_data, "smokePM_training_jobs.csv"), 
13 |                             row.names = FALSE, col.names = FALSE, sep = ",")
14 | 


--------------------------------------------------------------------------------
/scripts/main/02_02_smokePM_training.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | export IFS=","
 3 | 
 4 | cat data/smokePM_training_jobs.csv | while read a b; do 
 5 | 
 6 | job_file="train_smokePM_fold${a}_drop${b}.job"
 7 | 
 8 | echo "#!/bin/bash
 9 | 
10 | #SBATCH  -p serc
11 | #SBATCH --job-name=train_smokePM_fold${a}_drop${b}
12 | #SBATCH --nodes=1              
13 | #SBATCH --ntasks-per-node=20
14 | #SBATCH --mem-per-cpu=20GB
15 | #SBATCH --time=5-00:00:00
16 | #SBATCH --mail-type=ALL
17 | #SBATCH --output=logs/train_smokePM_fold${a}_drop${b}.log
18 | 
19 | 
20 | ml R/4.0.2
21 | 
22 | Rscript scripts/main/02_02_train_smokePM_Sherlock.R "$a" "$b" " > $job_file
23 | 
24 |     sbatch $job_file
25 | 
26 | done
27 | 
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/scripts/main/02_02_train_smokePM_Sherlock.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | #-------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Trains smoke PM2.5 model.
  8 | #-------------------------------------------------------------------------------
  9 | args <- commandArgs(TRUE)
 10 | cv_fold_num <- as.numeric(args[1])
 11 | drop_vars <- strsplit(args[2], split = "-")[[1]] # second arg should be any character strings of variables to be dropped separated by dashes, matches are on containment of that string in the column name
 12 | 
 13 | print(paste0("tuning model, excluding fold ", cv_fold_num))
 14 | print(paste0("excluding variables with names containing ",
 15 |              paste0(drop_vars, collapse = " or ")))
 16 | 
 17 | txt_progress_file <- file.path(path_output, "smokePM", "model", 
 18 |                                paste0("smokePM_xgb_progress_fold", cv_fold_num, 
 19 |                                       paste0(c("_drop", drop_vars), collapse = "-"), ".txt"))
 20 | max_xgb_rounds <- 10000
 21 | bayes_opt_n_init <- 24
 22 | bayes_opt_n_iter <- 16
 23 | 
 24 | param_bounds <- list(
 25 |   eta = c(0.001, 0.1),
 26 |   gamma = c(0, 50),
 27 |   max_depth = c(2L, 25L),
 28 |   colsample_bytree = c(0.5, 1),
 29 |   subsample = c(0.25, 1),
 30 |   min_child_weight = c(1L, 50L))
 31 | 
 32 | if (Sys.getenv('SLURM_JOB_ID') != "") {
 33 |   usable.cores <- Sys.getenv("SLURM_NTASKS_PER_NODE")
 34 | } else {
 35 |   usable.cores <- 2
 36 | }
 37 | 
 38 | print(paste0("there are ", usable.cores, " usable cores"))
 39 | 
 40 | mod_data <- readRDS(file.path(path_data, "smokePM_training.rds")) %>% 
 41 |   filter(fold != cv_fold_num)
 42 | 
 43 | xgb_train_mat <- xgb.DMatrix(
 44 |   data = model.matrix.lm(~.-1, 
 45 |                          data = mod_data %>% 
 46 |                            select(month, lat, lon, 
 47 |                                   aot_anom, aot_anom_lag1, aot_anom_lag2, aot_anom_lag3,
 48 |                                   aod_anom_pred_0.00, aod_anom_pred_0.25, aod_anom_pred_0.50,
 49 |                                   aod_anom_pred_0.75, aod_anom_pred_1.00, aod_anom_pred_mean,
 50 |                                   AODmissing,
 51 |                                   num_traj_points_height_1, num_traj_points_height_2,
 52 |                                   num_traj_points_height_3, num_traj_points_height_4, 
 53 |                                   num_traj_points_height_5,
 54 |                                   fire_dist_km, closest_fire_area, closest_fire_num_points,
 55 |                                   pbl_min, pbl_max, pbl_mean,
 56 |                                   wind_u, wind_v, 
 57 |                                   dewpoint_temp_2m, temp_2m,  
 58 |                                   sea_level_pressure, surface_pressure, precip, 
 59 |                                   elevation_mean, elevation_stdDev, 
 60 |                                   developed, barren, forest, shrubland, cultivated, 
 61 |                                   wetlands, herbaceous, water) %>% 
 62 |                            select(-contains(drop_vars)),
 63 |                          na.action = "na.pass"), 
 64 |   label = mod_data %>% 
 65 |     pull(smokePM))
 66 | 
 67 | 
 68 | xgb_opt_fun <- function(eta, gamma, max_depth, subsample, colsample_bytree, 
 69 |                         min_child_weight, xgb_mat, cv_ind = NULL, 
 70 |                         rounds_max = 100, nfold = 4, nthread = 2,
 71 |                         progress_file = ""){
 72 |   if(progress_file != ""){
 73 |     write(paste0("eta = ", eta, 
 74 |                  ", gamma = ", gamma, 
 75 |                  ", max_depth = ", max_depth, 
 76 |                  ", colsample_bytree = ", colsample_bytree, 
 77 |                  ", subsample = ", subsample, 
 78 |                  ", min_child_weight = ", min_child_weight),
 79 |           file = progress_file,
 80 |           append = T)}
 81 |   
 82 |   set.seed(10001)
 83 |   mod_xgb_cv <- xgb.cv(
 84 |     params = list(booster = "gbtree",
 85 |                   eta = eta,
 86 |                   gamma = gamma,
 87 |                   max_depth = max_depth,
 88 |                   subsample = subsample,
 89 |                   colsample_bytree = colsample_bytree,
 90 |                   min_child_weight = min_child_weight, 
 91 |                   objective = "reg:squarederror",
 92 |                   eval_metric = "rmse"),
 93 |     data = xgb_mat,
 94 |     nrounds = rounds_max,
 95 |     nthread = nthread,
 96 |     nfold = nfold,
 97 |     folds = cv_ind,
 98 |     early_stopping_rounds = 10,
 99 |     print_every_n = 20,
100 |     verbose = TRUE)
101 |   
102 |   if (progress_file != ""){
103 |     write(paste0("nrounds = ", mod_xgb_cv$best_iteration, 
104 |                  ", RMSE = ", mod_xgb_cv$evaluation_log %>% 
105 |                    filter(iter == mod_xgb_cv$best_iteration) %>% 
106 |                    pull(test_rmse_mean),
107 |                  "\n"),
108 |           file = progress_file,
109 |           append = T)}
110 |   
111 |   list(Score = mod_xgb_cv$evaluation_log %>% 
112 |          filter(iter == mod_xgb_cv$best_iteration) %>% 
113 |          pull(test_rmse_mean) %>% 
114 |          multiply_by(-1), 
115 |        Pred = mod_xgb_cv$best_iteration) %>% 
116 |     return
117 | }
118 | 
119 | 
120 | # define folds
121 | cv_folds = mod_data %>% 
122 |   mutate(row_no = row_number()) %>% 
123 |   group_by(fold) %>% 
124 |   summarise(row_nos = list(row_no)) %>% 
125 |   pull(row_nos) 
126 | 
127 | 
128 | # run bayesian optimization to find the best hyperparameters
129 | write("", txt_progress_file)
130 | 
131 | tic <- Sys.time()
132 | set.seed(10001)
133 | bayes_opt_params <- BayesianOptimization(
134 |   function(eta, gamma, max_depth, subsample,
135 |            colsample_bytree, min_child_weight){
136 |     xgb_opt_fun(eta, gamma, max_depth, subsample, 
137 |                 colsample_bytree, min_child_weight, 
138 |                 xgb_train_mat, cv_ind = cv_folds, 
139 |                 rounds_max = max_xgb_rounds, nfold = length(cv_folds), 
140 |                 nthread = min(usable.cores, 8),
141 |                 progress_file = txt_progress_file)},
142 |   bounds = param_bounds,
143 |   init_points = bayes_opt_n_init, # heuristic on how many points? 
144 |   n_iter = bayes_opt_n_iter)
145 | toc <- Sys.time()
146 | toc - tic
147 | 
148 | saveRDS(bayes_opt_params, 
149 |         file.path(path_output, "smokePM", "model", 
150 |                   paste0("smokePM_bayes_opt_params_fold", cv_fold_num, 
151 |                          paste0(c("_drop", drop_vars), collapse = "-"),
152 |                          ".rds")))
153 | 
154 | mod_gb_final <- xgb.train(
155 |   params = c(as.list(bayes_opt_params$Best_Par),
156 |              booster = "gbtree",
157 |              objective = "reg:squarederror",
158 |              eval_metric = "rmse"),
159 |   data = xgb_train_mat,
160 |   nrounds = bayes_opt_params$History %>%
161 |     filter(Value == bayes_opt_params$Best_Value) %>%
162 |     pull(Round) %>%
163 |     magrittr::extract(unlist(bayes_opt_params$Pred), .),
164 |   verbose = 1)
165 | 
166 | xgb.save(mod_gb_final,
167 |          file.path(path_output, "smokePM", "model", 
168 |                    paste0("smokePM_mod_fold", cv_fold_num, 
169 |                           paste0(c("_drop", drop_vars), collapse = "-"),
170 |                           ".xgb")))
171 | 
172 | # predict for full data set and save predictions
173 | pred_data <- readRDS(file.path(path_data, "smokePM_training.rds"))
174 | 
175 | xgb_pred_mat <- xgb.DMatrix(
176 |   data = model.matrix.lm(~.-1, 
177 |                          data = pred_data %>% 
178 |                            select(month, lat, lon, 
179 |                                   aot_anom, aot_anom_lag1, aot_anom_lag2, aot_anom_lag3,
180 |                                   aod_anom_pred_0.00, aod_anom_pred_0.25, aod_anom_pred_0.50,
181 |                                   aod_anom_pred_0.75, aod_anom_pred_1.00, aod_anom_pred_mean,
182 |                                   AODmissing,
183 |                                   num_traj_points_height_1, num_traj_points_height_2,
184 |                                   num_traj_points_height_3, num_traj_points_height_4, 
185 |                                   num_traj_points_height_5,
186 |                                   fire_dist_km, closest_fire_area, closest_fire_num_points,
187 |                                   pbl_min, pbl_max, pbl_mean,
188 |                                   wind_u, wind_v, 
189 |                                   dewpoint_temp_2m, temp_2m,  
190 |                                   sea_level_pressure, surface_pressure, precip, 
191 |                                   elevation_mean, elevation_stdDev, 
192 |                                   developed, barren, forest, shrubland, cultivated, 
193 |                                   wetlands, herbaceous, water) %>% 
194 |                            select(-contains(drop_vars)),
195 |                          na.action = "na.pass"))
196 | 
197 | preds <- pred_data %>% 
198 |   select(id, date, fold) %>% 
199 |   cbind(smokePM_pred = predict(mod_gb_final, xgb_pred_mat))
200 | 
201 | saveRDS(preds, 
202 |         file.path(path_output, "smokePM", "model", paste0("smokePM_pred_fold", cv_fold_num, 
203 |                                       paste0(c("_drop", drop_vars), collapse = "-"), 
204 |                                       ".rds")))
205 | 
206 | # time permitting, calculate and save the variable importance
207 | var_import <-  xgb.importance(model = mod_gb_final)
208 | feat_names <- model.matrix.lm(~.-1, 
209 |                               data = mod_data %>% 
210 |                                 select(month, lat, lon, 
211 |                                        aot_anom, aot_anom_lag1, aot_anom_lag2, aot_anom_lag3,
212 |                                        aod_anom_pred_0.00, aod_anom_pred_0.25, aod_anom_pred_0.50,
213 |                                        aod_anom_pred_0.75, aod_anom_pred_1.00, aod_anom_pred_mean,
214 |                                        AODmissing,
215 |                                        num_traj_points_height_1, num_traj_points_height_2,
216 |                                        num_traj_points_height_3, num_traj_points_height_4, 
217 |                                        num_traj_points_height_5,
218 |                                        fire_dist_km, closest_fire_area, closest_fire_num_points,
219 |                                        pbl_min, pbl_max, pbl_mean,
220 |                                        wind_u, wind_v, 
221 |                                        dewpoint_temp_2m, temp_2m,  
222 |                                        sea_level_pressure, surface_pressure, precip, 
223 |                                        elevation_mean, elevation_stdDev, 
224 |                                        developed, barren, forest, shrubland, cultivated, 
225 |                                        wetlands, herbaceous, water) %>% 
226 |                                 select(-contains(drop_vars)),
227 |                               na.action = "na.pass") %>% colnames
228 | 
229 | saveRDS(list(variable_importance = var_import, 
230 |              feature_names = feat_names), 
231 |         file.path(path_output, "smokePM", "model", 
232 |                   paste0("smokePM_var_importance_fold", cv_fold_num, 
233 |                          paste0(c("_drop", drop_vars), collapse = "-"),
234 |                          ".rds")))
235 | 


--------------------------------------------------------------------------------
/scripts/main/02_03_predict_smokePM_Sherlock.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | #SBATCH --job-name=predict_smokePM
 4 | #
 5 | #SBATCH --partition=serc
 6 | #SBATCH --nodes=1
 7 | #SBATCH --ntasks-per-node=1
 8 | #SBATCH --cpus-per-task=30
 9 | #SBATCH --time=1-06:00:00
10 | #SBATCH --mem-per-cpu=16G
11 | #SBATCH --output=logs/predict_smokePM.log
12 | #SBATCH --mail-type=ALL
13 | 
14 | # load modules
15 | ml physics gdal/2.2.1 udunits proj/4.9.3 geos 
16 | ml R/4.0.2
17 | 
18 | # execute script
19 | Rscript scripts/main/02_03_predict_smokePM_Sherlock.R
20 | 


--------------------------------------------------------------------------------
/scripts/main/02_04_combine_preds.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | #-------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Combines smoke PM2.5 predictions.
 8 | #-------------------------------------------------------------------------------
 9 | smokePM_pred <- list.files(
10 |   file.path(path_output, "smokePM", "predictions", "10km_smoke_days"), 
11 |   full.names = TRUE
12 | ) %>% 
13 |   map_dfr(readRDS)
14 | 
15 | smokePM_pred %<>% mutate(smokePM_pred = pmax(0, smokePM_pred))
16 | 
17 | saveRDS(smokePM_pred, 
18 |         file.path(path_output, "smokePM", "predictions", "combined", 
19 |                   paste0("smokePM_predictions_", 
20 |                          format(min(smokePM_pred$date), "%Y%m%d"),
21 |                          "_",
22 |                          format(max(smokePM_pred$date), "%Y%m%d"),
23 |                          ".rds")))
24 | 


--------------------------------------------------------------------------------
/scripts/main/02_05_finalize_10km_grid.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Jessica Li
 7 | # Drops 10 km grid cells that we do not make predictions for. Saves predictions 
 8 | # and 10 km grid in root folder. 
 9 | # ------------------------------------------------------------------------------
10 | # Load 10 km grid
11 | grid_10km = read_sf(file.path(path_data, "1_grids", "grid_10km_wgs84"))
12 | 
13 | # Get grid cell IDs for which predictions are not made
14 | no_pred_cells = readRDS(list.files(file.path(
15 |   path_data, "ERA5_variables", "Land", "surface_pressure", "USA", "10km_grid", 
16 |   "UTC-0600", "daily_mean_of_1-hourly"), full.names = T)[1]) %>% 
17 |   filter(is.na(surface_pressure)) %>% 
18 |   pull(id_grid) %>% 
19 |   unique()
20 | 
21 | # Subset grid
22 | grid_10km = grid_10km %>% filter(!(ID %in% no_pred_cells))
23 | 
24 | # Save
25 | if (!dir.exists(file.path(path_final, "10km_grid", "10km_grid_wgs84"))) dir.create(file.path(path_final, "10km_grid", "10km_grid_wgs84"))
26 | write_sf(grid_10km, file.path(path_final, "10km_grid", "10km_grid_wgs84", "10km_grid_wgs84.shp"))
27 | 


--------------------------------------------------------------------------------
/scripts/main/02_06_gridded_predictions_to_county.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Aggregates 10 km grid smokePM predictions to county level.
 8 | # ------------------------------------------------------------------------------
 9 | unit <- "county" # alternatively, "tract"
10 | # load shapefile, plus 10km grid transformed to match the crs
11 | if(unit == "county"){
12 |   unit_sf <- counties() %>% 
13 |     filter(STATEFP %in% nonContig_stateFIPS == F)
14 | } else if(unit == "tract"){
15 |   unit_sf <- states() %>% 
16 |     filter(STATEFP %in% nonContig_stateFIPS == F) %>% 
17 |     pull(STATEFP) %>% 
18 |     map_dfr(function(x){
19 |       tracts(x, year = 2019) %>% select(STATEFP, GEOID)
20 |     })
21 | } else{
22 |   stop("only allowed units are \"tract\" or \"county\"")
23 | }
24 | 
25 | # read in the grid
26 | grid_10km <- st_read(file.path(path_final, "10km_grid", "10km_grid_wgs84", "10km_grid_wgs84.shp")) %>% 
27 |   st_transform(st_crs(unit_sf))
28 | 
29 | # make a crosswalk with intersection area with grid cells
30 | unit_cross = st_intersection(unit_sf,
31 |                              grid_10km) %>% 
32 |   select(GEOID, grid_id_10km = ID) %>% 
33 |   {cbind(st_drop_geometry(.), 
34 |          area = st_area(.))} 
35 | 
36 | # save the crosswalk since it takes a while to make 
37 | saveRDS(unit_cross, file.path(path_data, paste0(unit, "_grid_area_crosswalk.rds")))
38 | 
39 | # population by grid cell
40 | pop <- list.files(file.path(path_data, "2_from_EE", "populationDensity_10km_subgrid"),
41 |                   full.names = T) %>% purrr::map_dfr(read.csv)
42 | 
43 | # smoke PM predictions 
44 | smokePM <- readRDS(file.path(path_output, "smokePM", "predictions", "combined", "smokePM_predictions_20060101_20201231.rds"))
45 | 
46 | # lets only save predictions if there's a smoke day in the unit, start by identifying smoke-days per unit
47 | unit_smoke_days = smokePM %>% # 51434138 rows
48 |   # add unit information, this will duplicate any rows that are in multiple counties
49 |   left_join(unit_cross %>% select(grid_id_10km, GEOID),
50 |             by = "grid_id_10km") %>% # 76009188 rows for county 
51 |   filter(!is.na(GEOID)) %>% # drop grid cells that don't match to a unit
52 |   # full set of unit-days with smoke 
53 |   select(date, GEOID) %>% 
54 |   unique  # 2308941 rows (should actually be less after dropping NAs)
55 | 
56 | unit_smokePM <- unit_smoke_days %>% 
57 |   # join in all grid-cells for each unit
58 |   left_join(unit_cross, by = "GEOID") %>% # 119622779 rows
59 |   # join in population and smoke PM predictions 
60 |   left_join(pop %>% select(grid_id_10km = ID, grid_pop_per_m2 = mean)) %>%
61 |   left_join(smokePM) # should still be 119622779 rows
62 | 
63 | # fill missings with 0s
64 | # calculate pop-weighted avg (density * area) over grid cells in each unit
65 | avg_unit_smokePM <- unit_smokePM %>% 
66 |   replace_na(list(smokePM_pred = 0)) %>%
67 |   mutate(area = unclass(area), 
68 |          pop = grid_pop_per_m2*area) %>%
69 |   group_by(GEOID, date) %>% 
70 |   summarise(smokePM_pred = weighted.mean(smokePM_pred, pop)) %>% 
71 |   ungroup
72 | 
73 | saveRDS(avg_unit_smokePM, 
74 |         file.path(path_output, "smokePM", "predictions", "combined", 
75 |           paste0(unit, "_smokePM_predictions_20060101_20201231.rds")))
76 | 


--------------------------------------------------------------------------------
/scripts/main/02_07_gridded_predictions_to_zip.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Aggregates 10 km grid smokePM predictions to zip level.
  8 | # ------------------------------------------------------------------------------
  9 | unit <- "zcta" # alternatively, "county"
 10 | # load shapefile, plus 10km grid transformed to match the crs
 11 | if (unit == "county") {
 12 |   unit_sf <- counties() %>% 
 13 |     filter(STATEFP %in% nonContig_stateFIPS == F)
 14 | } else if (unit == "tract") {
 15 |   unit_sf <- states() %>% 
 16 |     filter(STATEFP %in% nonContig_stateFIPS == F) %>% 
 17 |     pull(STATEFP) %>% 
 18 |     map_dfr(function(x){
 19 |       tracts(x, year = 2019) %>% select(STATEFP, GEOID)
 20 |     })
 21 | } else if (unit == "zcta") {
 22 |   unit_sf <- states() %>% 
 23 |     filter(STATEFP %in% nonContig_stateFIPS == F) %>% 
 24 |     pull(STATEFP) %>% 
 25 |     map_dfr(function(x){
 26 |       zctas(x, year = 2019) %>% select(ZCTA5CE10, GEOID10)
 27 |     })
 28 | } else {
 29 |   stop("only allowed units are \"tract\" or \"zcta\" or \"county\"")
 30 | }
 31 | 
 32 | # read in the grid
 33 | grid_10km <- st_read(file.path(path_final, "10km_grid", "10km_grid_wgs84", "10km_grid_wgs84.shp")) %>% 
 34 |   st_transform(st_crs(unit_sf))
 35 | 
 36 | # make a crosswalk with intersection area with grid cells
 37 | if (unit %in% c("county", "tract")) {
 38 |   unit_cross = st_intersection(unit_sf,
 39 |                                grid_10km) %>% 
 40 |     select(GEOID, grid_id_10km = ID) %>% 
 41 |     {cbind(st_drop_geometry(.), 
 42 |            area = st_area(.))} 
 43 | } else if (unit == "zcta") {
 44 |   unit_cross = st_intersection(unit_sf,
 45 |                                grid_10km) %>% 
 46 |     select(GEOID10, grid_id_10km = ID) %>% 
 47 |     {cbind(st_drop_geometry(.), 
 48 |            area = st_area(.))} 
 49 | }
 50 | # save the crosswalk since it takes a while to make 
 51 | saveRDS(unit_cross, file.path(path_data, paste0(unit, "_grid_area_crosswalk.rds")))
 52 | # unit_cross <- readRDS(file.path(path_data, paste0(unit, "_grid_area_crosswalk.rds")))
 53 | 
 54 | # smoke PM predictions 
 55 | smokePM <- readRDS(file.path(path_output, "smokePM", "predictions", "combined", "smokePM_predictions_20060101_20201231.rds"))
 56 | 
 57 | # population by grid cell
 58 | pop <- list.files(file.path(path_data, "2_from_EE", "populationDensity_10km_subgrid"),
 59 |                   full.names = T) %>% purrr::map_dfr(read.csv)
 60 | 
 61 | # loop through work with one date at a time? rbind the data frames from the dates together
 62 | if (unit == "zcta") {
 63 |   unit_cross = unit_cross %>% rename(GEOID = GEOID10)
 64 | }
 65 | avg_unit_smokePM <- smokePM %>% 
 66 |   pull(date) %>% 
 67 |   unique %>% 
 68 |   purrr::map_dfr(function(i){
 69 |     print(i)
 70 |     # filter to the date of interest
 71 |     smokePM %>% 
 72 |       filter(date == i) %>%
 73 |       # add on all the tract/county unit IDs that are among the grid cells with smoke 
 74 |       left_join(unit_cross %>% select(grid_id_10km, GEOID),
 75 |                 by = "grid_id_10km") %>% 
 76 |       filter(!is.na(GEOID)) %>% # drop grid cells that don't match to a unit
 77 |       # full set of unit-days with smoke 
 78 |       select(GEOID, date) %>% 
 79 |       unique %>%
 80 |       # join in all grid-cells for each unit
 81 |       left_join(unit_cross, by = "GEOID") %>% 
 82 |       # join in population and smoke PM predictions
 83 |       left_join(pop %>% select(grid_id_10km = ID, grid_pop_per_m2 = mean),
 84 |                 by = "grid_id_10km") %>% 
 85 |       left_join(smokePM %>% filter(date == i), 
 86 |                 by = c("grid_id_10km", "date")) %>%
 87 |       # fill missing smoke PM values with zero
 88 |       replace_na(list(smokePM_pred = 0)) %>%
 89 |       mutate(area = unclass(area),
 90 |              pop = grid_pop_per_m2*area) %>%
 91 |       # for each unit-date calculate pop-weighted avg 
 92 |       group_by(GEOID, date) %>%
 93 |       summarise(smokePM_pred = weighted.mean(smokePM_pred, pop),
 94 |                 .groups = "drop") 
 95 |   })
 96 | if (unit == "zcta") {
 97 |   avg_unit_smokePM = avg_unit_smokePM %>% rename(GEOID10 = GEOID)
 98 | }
 99 | 
100 | saveRDS(avg_unit_smokePM, 
101 |         file.path(path_output, "smokePM", "predictions", "combined", 
102 |                   paste0(unit, "_smokePM_predictions_20060101_20201231.rds")))
103 | 


--------------------------------------------------------------------------------
/scripts/main/02_08_gridded_predictions_to_tract.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Aggregates 10 km grid smokePM predictions to census tract level.
 8 | # ------------------------------------------------------------------------------
 9 | unit <- "tract" # alternatively, "county"
10 | # load shapefile, plus 10km grid transformed to match the crs
11 | if(unit == "county"){
12 |   unit_sf <- counties() %>% 
13 |     filter(STATEFP %in% nonContig_stateFIPS == F)
14 | } else if(unit == "tract"){
15 |   unit_sf <- states() %>% 
16 |     filter(STATEFP %in% nonContig_stateFIPS == F) %>% 
17 |     pull(STATEFP) %>% 
18 |     map_dfr(function(x){
19 |       tracts(x, year = 2019) %>% select(STATEFP, GEOID)
20 |     })
21 | } else{
22 |   stop("only allowed units are \"tract\" or \"county\"")
23 | }
24 | 
25 | # read in the grid
26 | grid_10km <- st_read(file.path(path_final, "10km_grid", "10km_grid_wgs84", "10km_grid_wgs84.shp")) %>% 
27 |   st_transform(st_crs(unit_sf))
28 | 
29 | # make a crosswalk with intersection area with grid cells
30 | unit_cross = st_intersection(unit_sf,
31 |                              grid_10km) %>% 
32 |   select(GEOID, grid_id_10km = ID) %>% 
33 |   {cbind(st_drop_geometry(.), 
34 |          area = st_area(.))} 
35 | 
36 | # save the crosswalk since it takes a while to make 
37 | saveRDS(unit_cross, file.path(path_data, paste0(unit, "_grid_area_crosswalk.rds")))
38 | # unit_cross <- readRDS(file.path(path_data, paste0(unit, "_grid_area_crosswalk.rds")))
39 | 
40 | # smoke PM predictions 
41 | smokePM <- readRDS(file.path(path_output, "smokePM", "predictions", "combined", "smokePM_predictions_20060101_20201231.rds"))
42 | 
43 | # population by grid cell
44 | pop <- list.files(file.path(path_data, "2_from_EE", "populationDensity_10km_subgrid"),
45 |                   full.names = T) %>% purrr::map_dfr(read.csv)
46 | 
47 | # loop through work with one date at a time? rbind the data frames from the dates together
48 | avg_unit_smokePM <- smokePM %>% 
49 |   pull(date) %>% 
50 |   unique %>% 
51 |   purrr::map_dfr(function(i){
52 |     print(i)
53 |     # filter to the date of interest
54 |     smokePM %>% 
55 |       filter(date == i) %>%
56 |       # add on all the tract/county unit IDs that are among the grid cells with smoke 
57 |       left_join(unit_cross %>% select(grid_id_10km, GEOID),
58 |                 by = "grid_id_10km") %>% 
59 |       filter(!is.na(GEOID)) %>% # drop grid cells that don't match to a unit
60 |       # full set of unit-days with smoke 
61 |       select(GEOID, date) %>% 
62 |       unique %>%
63 |       # join in all grid-cells for each unit
64 |       left_join(unit_cross, by = "GEOID") %>% 
65 |       # join in population and smoke PM predictions
66 |       left_join(pop %>% select(grid_id_10km = ID, grid_pop_per_m2 = mean),
67 |                 by = "grid_id_10km") %>% 
68 |       left_join(smokePM %>% filter(date == i), 
69 |                 by = c("grid_id_10km", "date")) %>%
70 |       # fill missing smoke PM values with zero
71 |       replace_na(list(smokePM_pred = 0)) %>%
72 |       group_by(GEOID, date) %>%
73 |       mutate(grid_pop_per_m2 = ifelse(rep(all(grid_pop_per_m2 == 0), n()), 1, grid_pop_per_m2)) %>%
74 |       ungroup() %>%
75 |       mutate(area = unclass(area),
76 |              pop = grid_pop_per_m2*area) %>%
77 |       # for each unit-date calculate pop-weighted avg 
78 |       group_by(GEOID, date) %>%
79 |       summarise(smokePM_pred = weighted.mean(smokePM_pred, pop),
80 |                 .groups = "drop") 
81 |   })
82 | 
83 | saveRDS(avg_unit_smokePM, 
84 |         file.path(path_output, "smokePM", "predictions", "combined", 
85 |           paste0(unit, "_smokePM_predictions_20060101_20201231.rds")))
86 | 


--------------------------------------------------------------------------------
/scripts/main/02_09_finalize_preds.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Jessica Li
 7 | # Saves smokePM aggregates in final folder.
 8 | # ------------------------------------------------------------------------------
 9 | # Load predictions
10 | preds = readRDS(file.path(path_output, "smokePM", "predictions", "combined", "smokePM_predictions_20060101_20201231.rds"))
11 | 
12 | # Save
13 | saveRDS(preds, file.path(path_final, "10km_grid", "smokePM2pt5_predictions_daily_10km_20060101-20201231.rds"))
14 | 
15 | # Convert date to character
16 | preds = preds %>% mutate(date = format(date, "%Y%m%d"))
17 | 
18 | # Save
19 | write.csv(preds, file.path(path_final, "10km_grid", "smokePM2pt5_predictions_daily_10km_20060101-20201231.csv"), row.names = F)
20 | 
21 | # ------------------------------------------------------------------------------
22 | # Load predictions aggregated to county level
23 | preds = readRDS(file.path(path_output, "smokePM", "predictions", "combined", "county_smokePM_predictions_20060101_20201231.rds"))
24 | 
25 | # Save
26 | saveRDS(preds, file.path(path_final, "county", "smokePM2pt5_predictions_daily_county_20060101-20201231.rds"))
27 | 
28 | # Convert date to character
29 | preds = preds %>% mutate(date = format(date, "%Y%m%d"))
30 | 
31 | # Save
32 | write.csv(preds, file.path(path_final, "county", "smokePM2pt5_predictions_daily_county_20060101-20201231.csv"), row.names = F)
33 | 
34 | # ------------------------------------------------------------------------------
35 | # Load predictions aggregated to zip level
36 | preds = readRDS(file.path(path_output, "smokePM", "predictions", "combined", "zcta_smokePM_predictions_20060101_20201231.rds"))
37 | 
38 | # Save
39 | saveRDS(preds, file.path(path_final, "zcta", "smokePM2pt5_predictions_daily_zcta_20060101-20201231.rds"))
40 | 
41 | # Convert date to character
42 | preds = preds %>% mutate(date = format(date, "%Y%m%d"))
43 | 
44 | # Save
45 | write.csv(preds, file.path(path_final, "zcta", "smokePM2pt5_predictions_daily_zcta_20060101-20201231.csv"), row.names = F)
46 | 
47 | # ------------------------------------------------------------------------------
48 | # Load predictions aggregated to census tract level
49 | preds = readRDS(file.path(path_output, "smokePM", "predictions", "combined", "tract_smokePM_predictions_20060101_20201231.rds"))
50 | 
51 | # Save
52 | saveRDS(preds, file.path(path_final, "tract", "smokePM2pt5_predictions_daily_tract_20060101-20201231.rds"))
53 | 
54 | # Convert date to character
55 | preds = preds %>% mutate(date = format(date, "%Y%m%d"))
56 | 
57 | # Save
58 | write.csv(preds, file.path(path_final, "tract", "smokePM2pt5_predictions_daily_tract_20060101-20201231.csv"), row.names = F)
59 | 


--------------------------------------------------------------------------------
/scripts/main/03_01_figure00.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Plots graphical abstract.
  8 | # ------------------------------------------------------------------------------
  9 | # Load smokePM prediction
 10 | smokePM_preds <- readRDS(file.path(path_output, "smokePM", "predictions", "combined", "smokePM_predictions_20060101_20201231.rds"))
 11 | 
 12 | # Load population density
 13 | pop <- list.files(file.path(path_data, "2_from_EE", "populationDensity_10km_subgrid"),
 14 |                   full.names = T) %>% purrr::map_dfr(read.csv)
 15 | 
 16 | # Load 10 km grid
 17 | grid_10km <- st_read(file.path(path_data, "1_grids", "grid_10km_wgs84")) %>% 
 18 |   mutate(area = st_area(geometry) %>% unclass) # area in m^2
 19 | 
 20 | # 
 21 | extremes_by_year <- smokePM_preds %>%
 22 |   mutate(year = lubridate::year(date)) %>% 
 23 |   {full_join(group_by(., year) %>% 
 24 |                summarise(days_over50 = sum(smokePM_pred > 50),
 25 |                          days_over100 = sum(smokePM_pred > 100), 
 26 |                          days_over200 = sum(smokePM_pred > 200), 
 27 |                          .groups = "drop"),
 28 |              group_by(., year, grid_id_10km) %>%  
 29 |                summarise(pop_over50 = any(smokePM_pred >50),
 30 |                          pop_over100 = any(smokePM_pred > 100), 
 31 |                          pop_over200 = any(smokePM_pred > 200),
 32 |                          .groups = "drop") %>% 
 33 |                left_join(pop %>% left_join(grid_10km) %>%
 34 |                            transmute(grid_id_10km = ID, pop = mean*area)) %>% 
 35 |                group_by(year) %>% 
 36 |                summarise(across(starts_with("pop_over"), ~sum(.x*pop)), 
 37 |                          .groups = "drop"))}
 38 | # change in extremes
 39 | extremes_by_year %>% 
 40 |   mutate(period = case_when(year >= 2006 & year <=2010 ~ "2006 - 2010", 
 41 |                             year >= 2016 & year <=2020 ~ "2016 - 2020",
 42 |                             T ~ as.character(NA))) %>% 
 43 |   filter(!is.na(period)) %>% 
 44 |   group_by(period) %>% 
 45 |   summarise(across(contains("over"), mean)) %>% 
 46 |   pivot_longer(contains("over")) %>% 
 47 |   separate(name, into = c("panel", "cutoff"), sep = "_") %>%
 48 |   mutate(panel = dplyr::recode_factor(panel, 
 49 |                                       "days" = "average annual number of grid cell-days",
 50 |                                       "pop" = "average annual population exposed\nto at least one day",
 51 |                                       .ordered = TRUE)) %>%
 52 |   {ggplot(data = ., 
 53 |           aes(x = period, y = value, group = cutoff, 
 54 |               color = cutoff)) + 
 55 |       geom_point() + 
 56 |       geom_line() + 
 57 |       geom_text(data = group_by(., panel, cutoff) %>% 
 58 |                   summarise(asinh_mid = sinh(mean(asinh(value))),
 59 |                             .groups = "drop") %>% 
 60 |                   mutate(cutoff_num = gsub("over", "", cutoff), 
 61 |                          label_top = paste0("paste(\'Days > ", cutoff_num, "\', mu, \'g\')")),
 62 |                 aes(y = asinh_mid, label = label_top,
 63 |                     x = 2.1 + ifelse(cutoff == "over100", 0.03, 0) + 
 64 |                       ifelse(cutoff == "over50", 0.06, 0)), 
 65 |                 color = "black", hjust = 0, vjust = 0, parse = TRUE) + 
 66 |       geom_text(data = group_by(., panel, cutoff) %>% 
 67 |                   summarise(asinh_mid = sinh(mean(asinh(value))),
 68 |                             mult = max(value)/min(value),
 69 |                             .groups = "drop") %>% 
 70 |                   mutate(mult_round = formatC(round(signif(mult, 2), 0),
 71 |                                               format="d", big.mark=","), 
 72 |                          label_bottom = paste0(mult_round, "x increase")),
 73 |                 aes(y = asinh_mid, label = label_bottom, 
 74 |                     x = 2.1 + ifelse(cutoff == "over100", 0.03, 0) + 
 75 |                       ifelse(cutoff == "over50", 0.06, 0)), 
 76 |                 color = "black", hjust = 0, vjust = 1) + 
 77 |       geom_linerange(data = group_by(., panel, cutoff) %>% 
 78 |                        summarise(min = min(value), 
 79 |                                  max = max(value), 
 80 |                                  .groups = "drop"), 
 81 |                      mapping = aes(ymin = min, ymax = max, 
 82 |                                    x = 2.05 + ifelse(cutoff == "over100", 0.03, 0) + 
 83 |                                      ifelse(cutoff == "over50", 0.06, 0), 
 84 |                                    color = cutoff), 
 85 |                      inherit.aes = FALSE) + 
 86 |       facet_wrap(~panel, scales = "free", nrow = 1, strip.position = "left") + 
 87 |       scale_y_continuous(trans = "pseudo_log", 
 88 |                          expand = expansion(mult = c(0.07, 0.05)),
 89 |                          breaks = c(0, 1, 5, 10, 50, 100, 500, 
 90 |                                     1000, 5000, 10000, 50000, 100000, 
 91 |                                     500000, 1000000, 5000000, 10000000, 
 92 |                                     20000000),
 93 |                          labels = c("0", "1", "5", "10", "50", "100", "500", 
 94 |                                     "1k", "5k", "10k", "50k", "100k", 
 95 |                                     "500k", "1m", "5m", "10m", "20m")) +
 96 |       scale_x_discrete(expand = expansion(mult = c(0.1, 0.05))) + 
 97 |       coord_cartesian(clip = "off") +
 98 |       scale_color_manual(values = c("#e8993b", "#913818", "#4a571e"), 
 99 |                          guide = "none") + 
100 |       xlab("") + ylab("") + 
101 |       theme_classic() + 
102 |       theme(plot.margin = margin(0.5,5.5,0,-1, "lines"), 
103 |             text = element_text(size = 15),
104 |             axis.text.x = element_text(vjust = -0.5),
105 |             strip.placement = "outside", 
106 |             strip.background = element_blank(),
107 |             strip.text = element_text(size = 12),
108 |             panel.spacing.x = unit(6.25, "lines"))} %>% 
109 |   ggsave(file.path(path_figures, "figure00.png"), ., 
110 |          width = 8, height = 3.95)
111 | 


--------------------------------------------------------------------------------
/scripts/main/03_04_figure03.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Plots California fires 2018-2020.
  8 | 
  9 | # calfire from https://frap.fire.ca.gov/frap-projects/fire-perimeters/
 10 | # ------------------------------------------------------------------------------
 11 | # Load smokePM predictions
 12 | smokePM_preds <- list.files(
 13 |   file.path(path_output, "smokePM", "predictions", "10km_smoke_days"), 
 14 |   full.names = TRUE
 15 | ) %>% 
 16 |   map_dfr(readRDS) %>% 
 17 |   mutate(smokePM_pred = pmax(smokePM_pred, 0))
 18 | 
 19 | # Load 10 km grid
 20 | grid_10km <- st_read(file.path(path_data, "1_grids", "grid_10km_wgs84"))
 21 | 
 22 | # Load population density
 23 | pop <- list.files(
 24 |   file.path(path_data, "2_from_EE", "populationDensity_10km_subgrid"),
 25 |   full.names = T
 26 | ) %>% 
 27 |   purrr::map_dfr(read.csv) %>% 
 28 |   rename(pop_density = mean)
 29 | 
 30 | # Define fire panels
 31 | fire_panels <- data.frame(
 32 |   panel_name = c("November 2018", "October - November 2019", "Fall 2020"), 
 33 |   start_date = as.Date(c("2018-11-01", "2019-10-20", "2020-08-10")), 
 34 |   end_date = as.Date(c("2018-11-30", "2019-11-15", "2020-11-15"))
 35 | )
 36 | 
 37 | # Load CAL FIRE data
 38 | calfire <- st_read(file.path(path_data, "CAL FIRE FRAP", "fire20_1.gdb"), layer = "firep20_1") %>% 
 39 |   mutate(Fire = recode(FIRE_NAME, 
 40 |                        "CAMP" = "Camp Fire", 
 41 |                        "KINCADE" = "Kincade Fire", 
 42 |                        "AUGUST COMPLEX FIRES" = "August Complex", 
 43 |                        "CZU LIGHTNING COMPLEX" = "CZU Lightning Complex"),
 44 |          panel_name = recode(FIRE_NAME, 
 45 |                              "CAMP" = "November 2018",  
 46 |                              "KINCADE" = "October - November 2019",
 47 |                              "AUGUST COMPLEX FIRES" = "Fall 2020", 
 48 |                              "CZU LIGHTNING COMPLEX" = "Fall 2020"), 
 49 |          panel_fire_no = recode(FIRE_NAME, 
 50 |                                 "CAMP" = 1,  
 51 |                                 "KINCADE" = 1,
 52 |                                 "AUGUST COMPLEX FIRES" = 2, 
 53 |                                 "CZU LIGHTNING COMPLEX" = 1, 
 54 |                                 .default = as.numeric(NA)), 
 55 |          select_fires = ((FIRE_NAME == "CAMP" & YEAR_ == "2018" & ALARM_DATE > as.Date("2018-11-01")) |
 56 |                            (FIRE_NAME == "KINCADE" & YEAR_ == "2019") |
 57 |                            FIRE_NAME  %in% c("AUGUST COMPLEX FIRES", "CZU LIGHTNING COMPLEX")))
 58 | 
 59 | # Load counties
 60 | counties <- tigris::counties()
 61 | 
 62 | # Load states
 63 | states <- tigris::states()
 64 | states %<>% st_transform(crs = st_crs(grid_10km))
 65 | 
 66 | # Limit to California
 67 | CA_cells <- st_intersects(grid_10km, states %>% filter(NAME == "California"))
 68 | CA_cells <- grid_10km$ID[CA_cells %>% 
 69 |                            purrr::map_lgl(function(x){length(x) > 0})]
 70 | 
 71 | # Match to counties
 72 | county_cells <- counties %>% 
 73 |   filter(NAME %in% c("Sacramento", "Santa Clara", "Fresno", "Sonoma")) %>% 
 74 |   select(NAME, geometry) %>% 
 75 |   purrr::pmap_dfr(function(NAME, geometry){
 76 |     data.frame(county = NAME, 
 77 |                grid_id_10km = grid_10km$ID[st_intersects(grid_10km, geometry) %>% 
 78 |                                              purrr::map_lgl(function(x){length(x) > 0})]) %>% 
 79 |       return()
 80 |   })
 81 | 
 82 | # Sampling of fires from 2018 to 2020 in CA
 83 | grid_fire_preds <-  fire_panels %>% 
 84 |   purrr::pmap_dfr(function(panel_name, start_date, end_date){
 85 |     smokePM_preds %>% filter(grid_id_10km %in% county_cells$grid_id_10km) %>% 
 86 |       {right_join(., 
 87 |                   expand.grid(grid_id_10km = pull(., grid_id_10km) %>% unique,
 88 |                               date = seq.Date(from = start_date, 
 89 |                                               to = end_date, 
 90 |                                               by = "day")))} %>% 
 91 |       replace_na(list(smokePM_pred = 0)) %>% 
 92 |       left_join(county_cells) %>% 
 93 |       mutate(panel_name = panel_name) %>% 
 94 |       return
 95 |   }) %>% 
 96 |   {mutate(., panel_name = factor(panel_name, levels = unique(.$panel_name),
 97 |                                  ordered = TRUE))} 
 98 | 
 99 | grid_fire_avgs <- group_by(grid_fire_preds, grid_id_10km, panel_name, county) %>% 
100 |   summarise(total_smokePM = mean(smokePM_pred),
101 |             .groups = "drop") %>% 
102 |   group_by(panel_name, county) %>% 
103 |   mutate(nobs = n()) %>% 
104 |   ungroup %>% 
105 |   mutate(type = factor("average", levels = c("time_series","average"), ordered = T),
106 |          county_no = as.numeric(as.factor(county)))
107 | 
108 | # Plot time series
109 | CA_fires_ts <- ggplot(mapping = aes(x = date, y = smokePM_pred, color = county)) + 
110 |   # Add lines for each grid cell
111 |   geom_line(data = grid_fire_preds %>% 
112 |               mutate(type = factor("time_series", levels = c("time_series", "average"), ordered = T)) %>% 
113 |               group_by(county) %>% 
114 |               mutate(n_cell = length(unique(grid_id_10km))),
115 |             aes(group = grid_id_10km, 
116 |                 alpha = I(20/n_cell))) + 
117 |   # Add mean lines for each county
118 |   geom_line(data = left_join(grid_fire_preds, pop, by = c("grid_id_10km" = "ID")) %>%
119 |               group_by(date, county, panel_name) %>%
120 |               summarise(smokePM_pred = weighted.mean(smokePM_pred, pop_density),
121 |                         .groups = "drop") %>%
122 |               mutate(type = factor("time_series", levels = c("time_series", "average"), ordered = T)),
123 |             size = 1.5) +
124 |   # Add dashes for average smoke PM for grid cell
125 |   geom_linerange(data = grid_fire_avgs, 
126 |                  aes(y = total_smokePM, color = county, 
127 |                      xmin = as.Date(county_no*2, origin = "2000-01-01"), 
128 |                      xmax = as.Date(county_no*2+1.5, origin = "2000-01-01"),
129 |                      alpha = I(20/nobs)),
130 |                  inherit.aes = FALSE) + 
131 |   # Add dashes for average smokePM for county
132 |   geom_linerange(data = left_join(grid_fire_avgs, pop, by = c("grid_id_10km" = "ID")) %>%
133 |                    group_by(panel_name, county, county_no, type) %>%
134 |                    summarise(total_smokePM = weighted.mean(total_smokePM, pop_density)),
135 |                  aes(y = total_smokePM, color = county,
136 |                      xmin = as.Date(county_no*2, origin = "2000-01-01"),
137 |                      xmax = as.Date(county_no*2+1.5, origin = "2000-01-01")),
138 |                  size = 1.5,
139 |                  inherit.aes = FALSE) +
140 |   # Add alpha = 0 points to average plots to get the zeros to align
141 |   geom_point(data = left_join(calfire %>% filter(select_fires),
142 |                               grid_fire_avgs %>% group_by(panel_name) %>% summarise(max = max(total_smokePM))),
143 |              aes(y = -panel_fire_no*0.075*max), 
144 |              x = as.Date(4, origin = "2000-01-01"), 
145 |              inherit.aes = FALSE,
146 |              alpha = 0) +
147 |   # Add line ranges on the bottom for the fires' start and end 
148 |   geom_linerange(data = left_join(calfire %>% filter(select_fires),
149 |                                   grid_fire_preds %>% group_by(panel_name) %>% summarise(max = max(smokePM_pred))) %>% 
150 |                    mutate(panel_name = factor(panel_name, levels = levels(grid_fire_preds$panel_name), ordered = T),
151 |                           type = factor("time_series", levels = c("time_series", "average"), ordered = T)), 
152 |                  aes(xmin = as.Date(ALARM_DATE), 
153 |                      xmax = as.Date(CONT_DATE), 
154 |                      y = -panel_fire_no*0.075*max), 
155 |                  inherit.aes = FALSE) + 
156 |   facet_wrap(panel_name~type, scales = "free", nrow =3, ncol = 2,
157 |              labeller = labeller(.multi_line = FALSE)) +
158 |   scale_color_manual(values = MetBrewer::met.brewer("Juarez")[c(1,6,5,3)],
159 |                      guide = "none") +
160 |   scale_y_continuous(sec.axis = sec_axis(~., 
161 |                                          name=expression(paste("Daily average (",mu, "g/", m^3, ")")))) +
162 |   theme_classic() + xlab("") + ylab(expression(paste("smoke ", PM[2.5]," (",mu, "g/", m^3, ")"))) +
163 |   theme(strip.background = element_blank(), 
164 |         strip.text.x = element_blank(),
165 |         panel.spacing.y = unit(1.5, "lines"), 
166 |         panel.spacing.x = unit(-2, "lines"))
167 | 
168 | # Make it into a gtable so we can edit
169 | CA_fires_ts <- ggplot_gtable(ggplot_build(CA_fires_ts))
170 | 
171 | # Make the average column narrower 
172 | CA_fires_ts$widths[9] <- CA_fires_ts$widths[9]*0.25 
173 | 
174 | # Remove right y axis on left panels and left y axis on right panels and x-axis from 2nd column
175 | CA_fires_ts %<>% gtable_remove_grobs(c("axis-l-1-2", "axis-l-2-2", "axis-l-3-2", 
176 |                                        "axis-r-1-1", "axis-r-2-1", "axis-r-3-1",
177 |                                        "axis-b-2-1", "axis-b-2-2", "axis-b-2-3")) 
178 | 
179 | # Repeat outside y axes on all plots 
180 | CA_fires_ts %>% 
181 |   gtable::gtable_add_grob(gtable::gtable_filter(CA_fires_ts, pattern = "ylab-l"),
182 |                           l = 3, t = 7, b = 8,
183 |                           name = "top-ylab-l") %>% 
184 |   gtable::gtable_add_grob(gtable::gtable_filter(CA_fires_ts, pattern = "ylab-l"),
185 |                           l = 3, t = 17, b = 18,
186 |                           name = "bottom-ylab-l") %>% 
187 |   gtable::gtable_add_grob(gtable::gtable_filter(CA_fires_ts, pattern = "ylab-r"),
188 |                           l = 11, t = 7, b = 8,
189 |                           name = "top-ylab-r") %>% 
190 |   gtable::gtable_add_grob(gtable::gtable_filter(CA_fires_ts, pattern = "ylab-r"),
191 |                           l = 11, t = 17, b = 18,
192 |                           name = "bottom-ylab-r") %>% 
193 |   ggsave(file.path(path_figures, "figure03b-d_time_series.png"), ., width = 6, height = 7)
194 | 
195 | # Plot map of select CA fires
196 | {ggplot() +
197 |     geom_sf(data = states %>% filter(NAME == "California")) +
198 |     geom_sf(data = counties %>%
199 |               filter(NAME %in% c("Sacramento", "Santa Clara", "Fresno", "Sonoma")),
200 |             aes(color = NAME, fill = NAME)) +
201 |     scale_color_manual(values = MetBrewer::met.brewer("Juarez")[c(1,6,5,3)],
202 |                        aesthetics = c("color", "fill")) +
203 |     ggnewscale::new_scale_fill() +
204 |     ggnewscale::new_scale_color() +
205 |     geom_sf(data = calfire %>% filter(select_fires),
206 |             color = NA,
207 |             fill = "black") +
208 |     theme_void()} %>%
209 |   ggsave(file.path(path_figures, "figure03a.png"), ., width = 4, height = 5)
210 | 
211 | # Plot all calfires during the time series
212 | fire_panels %>%
213 |   purrr::pmap(function(panel_name, start_date, end_date){
214 |     panel_label = c("b", "c", "d")[which(fire_panels$panel_name == panel_name)]
215 |     fig_name <- file.path(path_figures, paste0("figure03", panel_label, "_map.png"))
216 |     # print(fig_name)
217 |     calfire %>%
218 |       mutate(ALARM_DATE = as.Date(ALARM_DATE),
219 |              CONT_DATE = as.Date(CONT_DATE)) %>%
220 |       rowwise %>%
221 |       mutate(max_start = max(c(ALARM_DATE, start_date)),
222 |              min_end = min(c(CONT_DATE, end_date))) %>%
223 |       ungroup %>%
224 |       filter(max_start < min_end) %>%
225 |       {ggplot(data = .,) +
226 |           geom_sf(data = states %>% filter(NAME == "California")) +
227 |           geom_sf(color = NA,
228 |                   fill = "black") +
229 |           theme_void()} %>%
230 |       ggsave(fig_name,., width = 4, height = 5)
231 |     
232 |   })
233 | 


--------------------------------------------------------------------------------
/scripts/main/03_05_figure04.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Plots annual smoke PM.
 8 | # ------------------------------------------------------------------------------
 9 | # Load smoke PM predictions
10 | smokePM_preds <- readRDS(file.path(path_output, "smokePM", "predictions", "combined", "smokePM_predictions_20060101_20201231.rds")) %>% 
11 |   mutate(smokePM_pred = pmax(smokePM_pred, 0))
12 | 
13 | grid_10km <- st_read(file.path(path_data, "1_grids", "grid_10km_wgs84"))
14 | 
15 | annual_smokePM <- smokePM_preds %>% 
16 |   mutate(year = lubridate::year(date)) %>% 
17 |   group_by(grid_id_10km, year) %>% 
18 |   summarise(smokePM = sum(smokePM_pred), .groups = "drop") %>% 
19 |   # pivot wide, then back to long, to fill any missing years with zeros
20 |   pivot_wider(values_from = smokePM, names_from = year, names_prefix = "year_", values_fill = 0) %>% 
21 |   pivot_longer(starts_with("year"), values_to = "smokePM", names_to = "year", names_prefix = "year_") %>% 
22 |   # identify leap years for dividing by 365
23 |   mutate(year = as.numeric(year)) %>% 
24 |   mutate(y_days = 365 + lubridate::leap_year(year)*1, 
25 |          smokePM = smokePM/y_days)
26 | 
27 | # small miniatures of annual smoke PM
28 | right_join(grid_10km,
29 |            annual_smokePM %>% 
30 |              mutate(smokePM = pmin(smokePM, 5)),
31 |            by = c("ID" = "grid_id_10km")) %>%
32 |   {ggplot(data = ., aes(color = smokePM, fill = smokePM)) +
33 |       geom_sf() + 
34 |       facet_wrap(~year, nrow = 3, ncol = 5) +
35 |       scale_color_gradientn(colors = viridis::inferno(100, begin = 0, end = 1, direction = 1),
36 |                             name = expression(paste(mu, "g/", m^3)),
37 |                             aesthetics = c("fill", "color"), 
38 |                             values = scales::rescale(sinh(seq(0, 3, length.out = 100))),
39 |                             breaks = seq(0, 5, by = 1),
40 |                             labels = c(seq(0, 4, by = 1), ">5"),
41 |                             guide = guide_colorbar(barheight = 10, 
42 |                                                    title.theme = element_text(size = 16))) +
43 |       theme_void() + 
44 |       theme(text = element_text(size = 20))} %>% 
45 |   ggsave(file.path(path_figures, "figure04.png"), ., width = 15, height = 7)
46 | 


--------------------------------------------------------------------------------
/scripts/main/03_06_figure05.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Plots state trends.
  8 | # ------------------------------------------------------------------------------
  9 | # read in grid and load state geometries (a detailed one for calcs and generalized one for plotting)
 10 | grid_10km <- st_read(file.path(path_data, "1_grids", "grid_10km_wgs84"))
 11 | states <- tigris::states(cb = FALSE) %>% 
 12 |   filter(!(STATEFP %in% nonContig_stateFIPS)) %>% 
 13 |   st_transform(crs = st_crs(grid_10km))
 14 | simple_states <- tigris::states(cb = TRUE) %>% 
 15 |   filter(!(STATEFP %in% nonContig_stateFIPS)) %>% 
 16 |   st_transform(crs = st_crs(grid_10km))
 17 | 
 18 | # load predictions and population data
 19 | smokePM_preds <- readRDS(file.path(path_output, "smokePM", "predictions", "combined", "smokePM_predictions_20060101_20201231.rds")) %>% 
 20 |   mutate(smokePM_pred = pmax(smokePM_pred, 0))
 21 | 
 22 | pop <- list.files(file.path(path_data, "2_from_EE", "populationDensity_10km_subgrid"),
 23 |                   full.names = T) %>% purrr::map_dfr(read.csv)
 24 | 
 25 | # calculate decadal change in average smoke PM 
 26 | decadal_change <- smokePM_preds %>% 
 27 |   mutate(year = lubridate::year(date)) %>% 
 28 |   # calculate annual avg contribution of smokePM to daily PM
 29 |   group_by(year, grid_id_10km) %>% 
 30 |   summarise(annual_total_smokePM = sum(smokePM_pred), 
 31 |             .groups = "drop") %>% 
 32 |   mutate(y_days = 365 + leap_year(year)*1, 
 33 |          annual_daily_smokePM = annual_total_smokePM/y_days) %>% 
 34 |   select(-annual_total_smokePM) %>%
 35 |   # join with full set of years and grid cells to fill in zeros
 36 |   {left_join(expand.grid(year = unique(.$year),
 37 |                          grid_id_10km = unique(.$grid_id_10km)), 
 38 |              .)} %>% 
 39 |   replace_na(list(annual_daily_smokePM = 0)) %>% 
 40 |   # add the time periods to summarise over 
 41 |   mutate(period = case_when(year >= 2006 & year <=2010 ~ "years2006_2010", 
 42 |                             year >= 2016 & year <=2020 ~ "years2016_2020",
 43 |                             T ~ as.character(NA))) %>% 
 44 |   filter(!is.na(period)) %>% 
 45 |   # average the annual daily smokePM to the period
 46 |   group_by(period, grid_id_10km) %>% 
 47 |   summarise(annual_daily_smokePM = mean(annual_daily_smokePM),
 48 |             .groups = "drop") %>% 
 49 |   # make a  column for each period, and difference the columns
 50 |   pivot_wider(names_from = period, values_from = annual_daily_smokePM) %>% 
 51 |   mutate(decade_change = years2016_2020 - years2006_2010) 
 52 | 
 53 | # plot a map of the decadal change
 54 | grid_10km %>% 
 55 |   left_join(decadal_change, 
 56 |             by = c("ID" = "grid_id_10km")) %>% 
 57 |   mutate(decade_change = pmin(5, decade_change)) %>%
 58 |   {ggplot(data = .) + 
 59 |       geom_sf(aes(color = decade_change, fill = decade_change)) + 
 60 |       geom_sf(data = simple_states, 
 61 |               color = "grey20", lwd = 0.1, fill = NA) + 
 62 |       scale_color_gradientn(aesthetics = c("color", "fill"), 
 63 |                             name = expression(paste(mu, "g/", m^3)), 
 64 |                             colors =  cmocean::cmocean("balance",
 65 |                                                        start = 0.05, 
 66 |                                                        end = 0.95)(101),
 67 |                             values = scales::rescale(sinh(seq(-2, 2, length.out = 101))),
 68 |                             rescaler = mid_rescaler(0),
 69 |                             guide = guide_colorbar(barheight = 6),
 70 |                             breaks = seq(0, 5, by = 1),
 71 |                             labels = c(seq(0, 4, by = 1), ">5"),) + 
 72 |       theme_void() + 
 73 |       theme(legend.position = c(.92, 0.32),
 74 |             legend.justification = "center")} %>% 
 75 |   ggsave(file.path(path_figures, "figure05a.png"), ., width = 5, height = 4)
 76 | 
 77 | 
 78 | state_annual_cells <- states %>% 
 79 |   # identify the cells intersecting with each state (the result is the index of the grid cell insecting each state)
 80 |   {mutate(., 
 81 |           grid_id_10km_ind = st_intersects(., grid_10km))} %>%
 82 |   st_drop_geometry() %>% 
 83 |   unnest(grid_id_10km_ind) %>%
 84 |   # join in the grid cell identifiers based on grid cell index 
 85 |   left_join(grid_10km %>% mutate(grid_id_10km_ind = 1:n()) %>% 
 86 |               st_drop_geometry()) %>% 
 87 |   # join in population density for each 10km grid cell
 88 |   left_join(pop %>% 
 89 |               mutate(pop_density = mean)) %>% 
 90 |   # join in annual smoke averages
 91 |   left_join(smokePM_preds %>%
 92 |               mutate(year = lubridate::year(date)) %>% 
 93 |               group_by(year, grid_id_10km) %>%
 94 |               # calculate annual total smoke PM, then divide by days in the year
 95 |               summarise(annual_total_smokePM = sum(smokePM_pred), 
 96 |                         .groups = "drop") %>% 
 97 |               mutate(y_days = 365 + leap_year(year)*1, 
 98 |                      annual_daily_smokePM = annual_total_smokePM/y_days) %>% 
 99 |               # join with full set of years and grid cells to fill in zeros
100 |               {left_join(expand.grid(year = unique(.$year),
101 |                                      grid_id_10km = unique(.$grid_id_10km)), 
102 |                          .)} %>% 
103 |               replace_na(list(annual_daily_smokePM = 0)), 
104 |             by = c("ID" = "grid_id_10km"))
105 | 
106 | state_annual_cells %>% 
107 |   group_by(NAME, STUSPS, year) %>% 
108 |   summarise(annual_daily_smokePM = weighted.mean(annual_daily_smokePM, pop_density),
109 |             .groups = "drop") %>% 
110 |   mutate(states = ifelse(NAME %in% c("California", "Florida", 
111 |                                      "New York", "Oregon", "Michigan"), 
112 |                          NAME, 
113 |                          "rest")) %>% 
114 |   {ggplot(mapping = aes(x = year, y = annual_daily_smokePM, 
115 |                         group = NAME)) + 
116 |       geom_line(data = filter(., states == "rest"), 
117 |                 color = "grey30", lwd = 0.25, alpha = 0.5) +
118 |       geom_line(data = filter(., states!= "rest"),
119 |                 mapping = aes(color = states), 
120 |                 lwd = 1.5, alpha = 1) + 
121 |       ylab(expression(paste("smoke ", PM[2.5]," (", mu, "g/", m^3, ")"))) +
122 |       scale_color_manual(values = MetBrewer::met.brewer("VanGogh2", 15)[c(1,3, 7, 9, 15)],
123 |                          guide = "none") +
124 |       theme_classic() } %>% 
125 |   ggsave(file.path(path_figures, "figure05b_time_series.png"),
126 |          ., width = 5, height = 3.5)
127 | 
128 | simple_states %>%
129 |   mutate(states = ifelse(NAME %in% c("California", "Florida", 
130 |                                      "New York", "Oregon", "Michigan"), 
131 |                          NAME, 
132 |                          "rest")) %>% 
133 |   {ggplot() +
134 |       geom_sf(data = filter(., states == "rest"),
135 |               lwd = 0.1) + 
136 |       geom_sf(data = filter(., states != "rest"),
137 |               mapping = aes(color = states, fill = states),
138 |               lwd = 0.1) + 
139 |       scale_color_manual(values = MetBrewer::met.brewer("VanGogh2", 15)[c(1,3, 7, 9, 15)], #MetBrewer::met.brewer("VanGogh2", 15)[c(1, 4, 9, 14)], 
140 |                          guide = "none", 
141 |                          aesthetics = c("fill", "color")) +
142 |       theme_void()} %>% 
143 |   ggsave(file.path(path_figures, "figure05b_map.png"), 
144 |          ., width = 3, height = 2.5)
145 | 


--------------------------------------------------------------------------------
/scripts/main/03_08_figure07.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Plots extreme days.
  8 | # ------------------------------------------------------------------------------
  9 | smokePM_preds <- readRDS(file.path(path_output, "smokePM", "predictions", "combined", "smokePM_predictions_20060101_20201231.rds")) %>% 
 10 |   mutate(smokePM_pred = pmax(smokePM_pred, 0))
 11 | 
 12 | pop <- list.files(file.path(path_data, "2_from_EE", "populationDensity_10km_subgrid"),
 13 |                   full.names = T) %>% purrr::map_dfr(read.csv)
 14 | 
 15 | grid_10km <- st_read(file.path(path_data, "1_grids", "grid_10km_wgs84")) %>% 
 16 |   mutate(area = st_area(geometry) %>% unclass) # area in m^2
 17 | 
 18 | # check the population adds up 
 19 | pop %>% 
 20 |   rename(pop_density = mean) %>% # people per m2
 21 |   left_join(grid_10km) %>% 
 22 |   mutate(pop = pop_density*area) %>% 
 23 |   summarise(pop = sum(pop))
 24 | 
 25 | extremes_by_year <- smokePM_preds %>%
 26 |   mutate(year = lubridate::year(date)) %>% 
 27 |   {full_join(group_by(., year) %>% 
 28 |                summarise(days_over50 = sum(smokePM_pred > 50),
 29 |                          days_over100 = sum(smokePM_pred > 100), 
 30 |                          days_over200 = sum(smokePM_pred > 200), 
 31 |                          .groups = "drop"),
 32 |              group_by(., year, grid_id_10km) %>%  
 33 |                summarise(pop_over50 = any(smokePM_pred >50),
 34 |                          pop_over100 = any(smokePM_pred > 100), 
 35 |                          pop_over200 = any(smokePM_pred > 200),
 36 |                          .groups = "drop") %>% 
 37 |                left_join(pop %>% left_join(grid_10km) %>%
 38 |                            transmute(grid_id_10km = ID, pop = mean*area)) %>% 
 39 |                group_by(year) %>% 
 40 |                summarise(across(starts_with("pop_over"), ~sum(.x*pop)), 
 41 |                          .groups = "drop"))}
 42 | 
 43 | # change in extremes
 44 | extremes_by_year %>% 
 45 |   mutate(period = case_when(year >= 2006 & year <=2010 ~ "2006 - 2010", 
 46 |                             year >= 2016 & year <=2020 ~ "2016 - 2020",
 47 |                             T ~ as.character(NA))) %>% 
 48 |   filter(!is.na(period)) %>% 
 49 |   group_by(period) %>% 
 50 |   summarise(across(contains("over"), mean)) %>% 
 51 |   pivot_longer(contains("over")) %>% 
 52 |   separate(name, into = c("panel", "cutoff"), sep = "_") %>%
 53 |   mutate(panel = dplyr::recode_factor(panel, 
 54 |                                       "days" = "avg annual number of grid cell-days",
 55 |                                       "pop" = "avg annual population exposed to 1+ days",
 56 |                                       .ordered = TRUE)) %>%
 57 |   {ggplot(data = ., 
 58 |           aes(x = period, y = value, group = cutoff, 
 59 |              color = cutoff)) + 
 60 |       geom_point() + 
 61 |       geom_line() + 
 62 |       geom_text(data = group_by(., panel, cutoff) %>% 
 63 |                   summarise(asinh_mid = sinh(mean(asinh(value))),
 64 |                             .groups = "drop") %>% 
 65 |                   mutate(cutoff_num = gsub("over", "", cutoff), 
 66 |                          label_top = paste0("paste(\'Days > ", cutoff_num, "\', mu, \'g\')")),
 67 |                 aes(y = asinh_mid, label = label_top,
 68 |                     x = 2.1 + ifelse(cutoff == "over100", 0.03, 0) + 
 69 |                       ifelse(cutoff == "over50", 0.06, 0)), 
 70 |                 color = "black", hjust = 0, vjust = 0, parse = TRUE) + 
 71 |       geom_text(data = group_by(., panel, cutoff) %>% 
 72 |                   summarise(asinh_mid = sinh(mean(asinh(value))),
 73 |                             mult = max(value)/min(value),
 74 |                             .groups = "drop") %>% 
 75 |                   mutate(mult_round = formatC(round(signif(mult, 2), 0),
 76 |                                               format="d", big.mark=","), 
 77 |                          label_bottom = paste0(mult_round, "x increase")),
 78 |                 aes(y = asinh_mid, label = label_bottom, 
 79 |                     x = 2.1 + ifelse(cutoff == "over100", 0.03, 0) + 
 80 |                       ifelse(cutoff == "over50", 0.06, 0)), 
 81 |                 color = "black", hjust = 0, vjust = 1) + 
 82 |       geom_linerange(data = group_by(., panel, cutoff) %>% 
 83 |                        summarise(min = min(value), 
 84 |                                  max = max(value), 
 85 |                                  .groups = "drop"), 
 86 |                      mapping = aes(ymin = min, ymax = max, 
 87 |                                    x = 2.05 + ifelse(cutoff == "over100", 0.03, 0) + 
 88 |                                      ifelse(cutoff == "over50", 0.06, 0), 
 89 |                                    color = cutoff), 
 90 |                      inherit.aes = FALSE) + 
 91 |       facet_wrap(~panel, scales = "free", nrow = 1, strip.position = "left") + 
 92 |       scale_y_continuous(trans = "pseudo_log", 
 93 |                          breaks = c(0, 1, 5, 10, 50, 100, 500, 
 94 |                                     1000, 5000, 10000, 50000, 100000, 
 95 |                                     500000, 1000000, 5000000, 10000000, 
 96 |                                     20000000),
 97 |                          labels = c("0", "1", "5", "10", "50", "100", "500", 
 98 |                                     "1k", "5k", "10k", "50k", "100k", 
 99 |                                     "500k", "1m", "5m", "10m", "20m")) +
100 |       scale_x_discrete(expand = expansion(mult = c(0.1, 0.05))) + 
101 |       coord_cartesian(clip = "off") +
102 |       scale_color_manual(values = c("#e8993b", "#913818", "#4a571e"), 
103 |                          guide = "none") + 
104 |       xlab("") + ylab("") + 
105 |       theme_classic() + 
106 |       theme(plot.margin = margin(0.5,6,0,0, "lines"), 
107 |             strip.placement = "outside", 
108 |             strip.background = element_blank(),
109 |             strip.text = element_text(size = 10),
110 |             panel.spacing.x = unit(5, "lines"))} %>% 
111 |   ggsave(file.path(path_figures, "figure07a-b.png"), ., width = 8, height = 3.25)
112 | 
113 | # map of changing extremes
114 | simple_states <- tigris::states(cb = TRUE) %>% 
115 |   filter(!(STATEFP %in% nonContig_stateFIPS)) %>% 
116 |   st_transform(crs = st_crs(grid_10km))
117 | 
118 | decadal_change <- smokePM_preds %>% 
119 |   mutate(year = lubridate::year(date)) %>% 
120 |   # calculate annual avg contribution of smokePM to daily PM
121 |   group_by(year, grid_id_10km) %>% 
122 |   summarise(annual_total_smokePM = sum(smokePM_pred), 
123 |             annual_days_over50 = sum(smokePM_pred > 50), 
124 |             annual_days_over100 = sum(smokePM_pred > 100), 
125 |             .groups = "drop") %>% 
126 |   mutate(y_days = 365 + leap_year(year)*1, 
127 |          annual_daily_smokePM = annual_total_smokePM/y_days) %>%
128 |   select(-annual_total_smokePM) %>% 
129 |   # join with full set of years and grid cells to fill in zeros
130 |   {left_join(expand.grid(year = unique(.$year),
131 |                          grid_id_10km = unique(.$grid_id_10km)), 
132 |              .)} %>% 
133 |   replace_na(list(annual_daily_smokePM = 0, 
134 |                   annual_days_over50 = 0, 
135 |                   annual_days_over100 = 0)) %>% 
136 |   # add the time periods to summarise over 
137 |   mutate(period = case_when(year >= 2006 & year <=2010 ~ "years2006_2010", 
138 |                             year >= 2016 & year <=2020 ~ "years2016_2020",
139 |                             T ~ as.character(NA))) %>% 
140 |   filter(!is.na(period)) %>% 
141 |   # average the annual daily smokePM to the period
142 |   group_by(period, grid_id_10km) %>% 
143 |   summarise(across(starts_with("annual"), mean),
144 |             .groups = "drop") %>%  
145 |   # make a row for each metric, then column for each period, and difference the columns
146 |   pivot_longer(starts_with("annual")) %>% 
147 |   pivot_wider(names_from = period, values_from = value) %>% 
148 |   mutate(decade_change = years2016_2020 - years2006_2010) %>%
149 |   select(-starts_with("years")) %>% 
150 |   pivot_wider(names_from = name, values_from = decade_change)
151 | 
152 | decadal_change %>% 
153 |   left_join(grid_10km %>% select(grid_id_10km = ID)) %>% 
154 |   st_as_sf %>% 
155 |   mutate(annual_days_over50 = pmin(10, annual_days_over50)) %>%
156 |   {ggplot(data = ., 
157 |           aes(color = annual_days_over50,
158 |               fill = annual_days_over50)) + 
159 |       geom_sf() + 
160 |       geom_sf(data = simple_states, 
161 |               color = "grey20", lwd = 0.1, fill = NA) + 
162 |       scale_color_gradientn(aesthetics = c("color", "fill"), 
163 |                             name = "",
164 |                             colors = rev(colorRampPalette(RColorBrewer::brewer.pal(11, "BrBG"))(101)),
165 |                             values = scales::rescale(sinh(seq(-2, 2, length.out = 101))),
166 |                             rescaler = mid_rescaler(0),
167 |                             guide = guide_colorbar(barheight = 5),
168 |                             breaks = seq(-5, 10, by = 5),
169 |                             labels = c(seq(-5, 5, by = 5), ">10")) +
170 |       theme_void() + 
171 |       theme(legend.position = c(.87, 0.28),
172 |             legend.justification = "left", 
173 |             legend.title = element_text(size = 10))} %>% 
174 |   ggsave(file.path(path_figures, "figure07c.png"),. , width = 5, height = 3)
175 | 


--------------------------------------------------------------------------------
/scripts/main/04_01_manuscript_calculations.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Calculates statistics in manuscript.
 8 | # ------------------------------------------------------------------------------
 9 | epa_data <- readRDS(file.path(path_data, "3_intermediate", "station_smokePM.rds"))
10 | 
11 | # how many stations do we have data at 
12 | epa_data$id %>% unique %>% length
13 | 
14 | # how many stations have observations for at least 5 years 
15 | epa_data %>% 
16 |   group_by(id) %>% 
17 |   summarise(start_date = min(date), 
18 |             end_date = max(date),
19 |             .groups = "drop") %>% 
20 |   mutate(time_range = difftime(end_date, start_date, units = "days")) %>% 
21 |   filter(time_range > (365*5)) %>% 
22 |   nrow
23 | 
24 | # how many obs for aod model 
25 | test <- readRDS(file.path(path_data, "4_clean", "aod_training.rds"))
26 | nrow(test)
27 | 
28 | # stats on extreme smoke days 
29 | smokePM_preds <- readRDS(file.path(path_output, "smokePM", "predictions", "combined", "smokePM_predictions_20060101_20201231.rds"))
30 | 
31 | pop <- list.files(file.path(path_data, "2_from_EE", "populationDensity_10km_subgrid"),
32 |                   full.names = T) %>% purrr::map_dfr(read.csv)
33 | 
34 | grid_10km <- st_read(file.path(path_data, "1_grids", "grid_10km_wgs84")) %>% 
35 |   mutate(area = st_area(geometry) %>% unclass) # area in m^2
36 | 
37 | # This may require a large amount of memory
38 | extremes_by_year <- smokePM_preds %>%
39 |   mutate(year = lubridate::year(date)) %>% 
40 |   {full_join(group_by(., year) %>% 
41 |                summarise(days_over50 = sum(smokePM_pred > 50),
42 |                          days_over100 = sum(smokePM_pred > 100), 
43 |                          days_over200 = sum(smokePM_pred > 200), 
44 |                          .groups = "drop"),
45 |              group_by(., year, grid_id_10km) %>%  
46 |                summarise(pop_over50 = any(smokePM_pred >50),
47 |                          pop_over100 = any(smokePM_pred > 100), 
48 |                          pop_over200 = any(smokePM_pred > 200),
49 |                          .groups = "drop") %>% 
50 |                left_join(pop %>% left_join(grid_10km) %>%
51 |                            transmute(grid_id_10km = ID, pop = mean*area)) %>% 
52 |                group_by(year) %>% 
53 |                summarise(across(starts_with("pop_over"), ~sum(.x*pop)), 
54 |                          .groups = "drop"))}
55 | saveRDS(extremes_by_year, file.path(path_data, "extremes_by_year.rds"))
56 | # extremes_by_year = readRDS(file.path(path_data, "extremes_by_year.rds"))
57 | 
58 | # how far off are we on the really low smoke days PM
59 | smokePM_preds <- list.files(file.path(path_output, "smokePM", "model"), 
60 |                             pattern = "smokePM_pred", 
61 |                             full.names = TRUE) %>% 
62 |   grep("drop\\.", ., value = TRUE) %>% 
63 |   map_dfr(function(x){
64 |     readRDS(x) %>% 
65 |       mutate(test_fold = as.numeric(gsub("^smokePM_pred_fold|_drop-?.*\\.rds$", "", basename(x))))
66 |   }) %>%
67 |   mutate(smokePM_pred = pmax(smokePM_pred, 0))
68 | 
69 | smokePM_data <- readRDS(file.path(path_data, "4_clean", "smokePM_training.rds"))
70 | 
71 | comp <- smokePM_preds %>% 
72 |   mutate(test = ifelse(fold == test_fold, "test", "train")) %>%
73 |   filter(test == "test") %>%
74 |   left_join(smokePM_data %>% select(id, date, smokePM)) 
75 | 
76 | comp %>% 
77 |   filter(smokePM < 1) %>% 
78 |   mutate(pred_bin = cut(smokePM_pred, c(0, 5, 10, Inf), 
79 |                         right = FALSE,
80 |                         include.lowest = T)) %>% 
81 |   group_by(pred_bin) %>% 
82 |   summarise(n = n()) %>% 
83 |   mutate(pct = n/sum(n))
84 | 


--------------------------------------------------------------------------------
/scripts/main/05_01_figureS01.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Plots supplemental figure 1.
  8 | # ------------------------------------------------------------------------------
  9 | hysplit_date <- "2020-09-20"
 10 | 
 11 | states <- tigris::states(cb = TRUE) %>% 
 12 |   filter(!(STATEFP %in% nonContig_stateFIPS))
 13 | 
 14 | traj <- list.files(file.path(path_data, "HYSPLIT", "trajectories", hysplit_date), full.names = T)[1] %>% 
 15 |   list.files(full.names = T) %>% 
 16 |   magrittr::extract(4:6) %>% 
 17 |   map_dfr(function(x){readRDS(x) %>% mutate(file = x)})
 18 | 
 19 | height_data <- readRDS(file.path(path_data, "HYSPLIT", "miscellaneous", "heights.rds"))
 20 | 
 21 | # panel A, example hysplit traj from single point and hour 
 22 | traj %>% 
 23 |   mutate(init_height = gsub("^traj-traj-fwd-[0-2][0-9]-[0-1][0-9]-[0-3][0-9]-[0-2][0-9]-1lat_[0-9]{2}p[0-9]{2}_lon_-[0-9]?[0-9]{2}p[0-9]{2}-hgt_|-144h\\.rds$", "", basename(file))) %>% 
 24 |   {ggplot(data = .) + 
 25 |       geom_sf(data = states, inherit.aes = FALSE) + 
 26 |       geom_point(mapping = aes(x = lon, y = lat, 
 27 |                                color = height), size = 1) + 
 28 |       geom_path(aes(x = lon, y = lat, group = init_height), size = 0.2) +
 29 |       geom_text(data = filter(., hour_along == max(hour_along)), 
 30 |                 aes(x = lon + ifelse(init_height == "2500", 0.5, -0.5),
 31 |                     y = lat, 
 32 |                     label = paste0(init_height, " m"),
 33 |                     hjust = ifelse(init_height == "2500", 0, 1))) +
 34 |       scale_color_viridis_c("height above\nground level (m)", 
 35 |                             option = "magma", direction = -1) + 
 36 |       theme_void()} %>% 
 37 |   ggsave(filename = file.path(path_figures, "figureS01a.png"), .,
 38 |          width = 5, height = 3)
 39 | 
 40 | # panel B, distribution of heights for quintiles
 41 | if (!file.exists(file.path(path_data, "HYSPLIT", "miscellaneous", "height_quant.rds"))) {
 42 |   height_quant <- quantile(height_data, probs = seq(0, 1, by = 0.2))
 43 |   saveRDS(height_quant, file.path(path_data, "HYSPLIT", "miscellaneous", "height_quant.rds"))
 44 | } else {
 45 |   height_quant = readRDS(file.path(path_data, "HYSPLIT", "miscellaneous", "height_quant.rds"))
 46 | }
 47 | 
 48 | if (!file.existsfile.path(path_data, "HYSPLIT", "miscellaneous", "height_quant_df.rds")) {
 49 |   height_quant_df <- data.frame(height = height_quant/1e3, 
 50 |                                 quant = names(height_quant)) %>% 
 51 |     mutate(label = paste0(quant, "\n", round(height, 1), " km"))
 52 |   saveRDS(height_quant_df, file.path(path_data, "HYSPLIT", "miscellaneous", "height_quant_df.rds"))
 53 | } else {
 54 |   height_quant_df = readRDS(file.path(path_data, "HYSPLIT", "miscellaneous", "height_quant_df.rds"))
 55 | }
 56 | 
 57 | if (!file.exists(file.path(path_data, "HYSPLIT", "miscellaneous", "height_samp.rds"))) {
 58 |   set.seed(10001)
 59 |   height_samp <- sample(1:length(height_data), size = 1e6, replace = F)
 60 |   saveRDS(height_samp, file.path(path_data, "HYSPLIT", "miscellaneous", "height_samp.rds"))
 61 | } else {
 62 |   height_sample = readRDS(file.path(path_data, "HYSPLIT", "miscellaneous", "height_samp.rds"))
 63 | }
 64 | 
 65 | height_hist <- data.frame(height = height_data[height_samp]/1e3) %>% 
 66 |   # slice_sample(n = 1e6) %>%
 67 |   ggplot(aes(x = height, y = ..density..)) + 
 68 |   geom_histogram(boundary = 0) + 
 69 |   geom_vline(data = height_quant_df %>% filter(quant != "100%"), 
 70 |              aes(xintercept = height), 
 71 |              color = "#2b614e") + 
 72 |   geom_label(data = height_quant_df %>% filter(quant != "100%"), 
 73 |              aes(x = height, y = .3, label = label), 
 74 |              position = position_dodge(width = 0.2),
 75 |              label.size = 0, size = 2, label.padding = unit(0.1, "lines"),
 76 |              color = "#2b614e") + 
 77 |   xlab("height above ground level (km)") + ylab("density") + 
 78 |   scale_x_continuous(limits = c(NA, 12), expand = c(0.05, 0)) + 
 79 |   theme_classic()
 80 | 
 81 | ggsave(filename = file.path(path_figures, "figureS01b.png"), height_hist,
 82 |        width = 5, height = 3)
 83 | 
 84 | # panel C, counts in each height quintile in the 50km buffer 
 85 | hysplit <- list.files(
 86 |   file.path(path_data, "HYSPLIT", "10km_grid_2006-2020"),
 87 |   pattern = paste0(format(as.Date(hysplit_date), "%Y"), 
 88 |                    "_",
 89 |                    format(as.Date(hysplit_date), "%m")),
 90 |   full.names = T
 91 | ) %>% 
 92 |   readRDS %>% 
 93 |   mutate(date = as.Date(date, format = "%Y%m%d")) %>% 
 94 |   filter(date == as.Date(hysplit_date))
 95 | 
 96 | grid_10km <- st_read(file.path(path_data, "1_grids", "grid_10km_wgs84"))
 97 | 
 98 | hysplit %>% 
 99 |   select(ID = id_grid, contains("num_traj_points_height")) %>% 
100 |   pivot_longer(starts_with("num")) %>% 
101 |   separate(name, into = c(NA, NA, NA, NA, "n"), sep = "_", remove = FALSE, 
102 |            convert = T) %>% 
103 |   left_join(data.frame(height = unname(height_quant)/1e3) %>% 
104 |               mutate(n = 1:n(),
105 |                      label = paste0(round(height, 1), 
106 |                                     " - ", 
107 |                                     lead(round(height, 1)), 
108 |                                     " km above ground level")) %>% 
109 |               select(n, label)) %>% 
110 |   left_join(grid_10km %>% select(ID)) %>% 
111 |   st_as_sf %>% 
112 |   {ggplot(data = .) + 
113 |       geom_sf(aes(fill = value, color = value)) + 
114 |       facet_wrap(~label, nrow = 1) + 
115 |       scale_color_gradientn(colors = colorspace::sequential_hcl(palette = "BuPu", n = 100, rev = T),
116 |                             name = "trajectory points\nwithin 50km",
117 |                             aesthetics = c("fill", "color"), 
118 |                             values = scales::rescale(sinh(seq(0, 5, length.out = 100))),
119 |                             guide = guide_colorbar(title.theme = element_text(size = 7),
120 |                                                    label.theme = element_text(size = 6.5), 
121 |                                                    barheight = 4, barwidth = 1)) +
122 |       theme_void() + 
123 |       theme(strip.text = element_text(size = 8),
124 |             panel.spacing = unit(-0.5, "lines"))} %>% 
125 |   ggsave(filename = file.path(path_figures, "figureS01c.png"),. ,
126 |          height = 3, width = 12)
127 | 


--------------------------------------------------------------------------------
/scripts/main/05_02_figureS02.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Jessica Li and Marissa Childs
  7 | # Plots supplemental figure 2.
  8 | # ------------------------------------------------------------------------------
  9 | chosen_date = "20181111"
 10 | chosen_year = year(ymd(chosen_date))
 11 | chosen_month = month(ymd(chosen_date))
 12 | 
 13 | # Get smoke plume shapes
 14 | smoke = readRDS(file.path(path_data, "smoke", "smoke_plumes_sfdf.rds"))
 15 | smoke = smoke %>% filter(date == chosen_date)
 16 | 
 17 | # Get CONUS background
 18 | usa = states()
 19 | conus = usa %>% filter(!(STUSPS %in% c("AK", "AS", "GU", "HI", "MP", "PR", "VI")))
 20 | 
 21 | # Plot plumes
 22 | p = ggplot() + 
 23 |   geom_sf(data = conus) +
 24 |   geom_sf(data = smoke, fill = "orange", alpha = 0.3) + 
 25 |   theme_void()
 26 | 
 27 | # Save
 28 | ggsave(file.path(path_figures, "figureS02a.png"), 
 29 |        plot = p, width = 12, height = 8)
 30 | 
 31 | #-------------------------------------------------------------------------------
 32 | # Load 10 km grid
 33 | project_grid = read_sf(file.path(path_data, "1_grids", "10km_grid"))
 34 | project_grid = st_as_sf(project_grid) %>% st_transform(st_crs(smoke)) %>% select(id_grid = ID)
 35 | 
 36 | # Get gridded smoke day
 37 | smoke_day = readRDS(file.path(path_data, "smoke_days", paste0("grid_smoke_day_", chosen_year, "_", chosen_month, ".rds")))
 38 | smoke_day = smoke_day %>% 
 39 |   filter(date == ymd(chosen_date)) %>% 
 40 |   mutate(smoke_day = factor(smoke_day, levels = 0:1))
 41 | smoke_day = project_grid %>% left_join(smoke_day, by = "id_grid")
 42 | 
 43 | # Plot gridded smoke day
 44 | p = ggplot(smoke_day, aes(fill = smoke_day)) + 
 45 |   geom_sf(color = NA) +  
 46 |   theme_void() + 
 47 |   scale_fill_manual(values = c("gray50", "orange")) + 
 48 |   labs(fill = "Smoke Day")
 49 | 
 50 | # Save
 51 | ggsave(file.path(path_figures, "figureS02b.png"), 
 52 |        plot = p, width = 12, height = 8)
 53 | 
 54 | #-------------------------------------------------------------------------------
 55 | # AOD Missingness
 56 | aod_na = list.files(file.path(path_data, "2_from_EE", "maiac_AODmissings"),
 57 |                     pattern = sprintf("^aod_pctMissing_10km_subgrid_.*_%s.*\\.csv$", chosen_year),
 58 |                     full.names = T) %>% 
 59 |   map_dfr(read.csv) %>% 
 60 |   rename(id_grid = ID, date = start_date, perc_aod_missing = mean) %>% 
 61 |   filter(date == chosen_date)
 62 | aod_na = project_grid %>% left_join(aod_na, by = "id_grid")
 63 | 
 64 | # Plot
 65 | p = ggplot(aod_na %>% mutate(perc_aod_missing = perc_aod_missing*100), aes(fill = perc_aod_missing)) + 
 66 |   geom_sf(color = NA) +  
 67 |   theme_void() + 
 68 |   labs(fill = "% AOD\nMissing")
 69 | 
 70 | # Save
 71 | ggsave(file.path(path_figures, "figureS02d.png"), 
 72 |        plot = p, width = 12, height = 8)
 73 | 
 74 | #-------------------------------------------------------------------------------
 75 | hysplit <- list.files(file.path(path_data, "HYSPLIT", "10km_grid_2006-2020"),
 76 |                       pattern = paste0(format(ymd(chosen_date), "%Y"), 
 77 |                                        "_",
 78 |                                        format(ymd(chosen_date), "%m")),
 79 |                       full.names = T) %>% 
 80 |   readRDS %>% 
 81 |   mutate(date = as.Date(date, format = "%Y%m%d")) %>% 
 82 |   filter(date == ymd(chosen_date))
 83 | 
 84 | grid_10km <- st_read(file.path(path_data, "1_grids", "grid_10km_wgs84"))
 85 | 
 86 | hysplit %>% 
 87 |   select(ID = id_grid, num_traj_points_height_1) %>% 
 88 |   left_join(grid_10km %>% select(ID)) %>% 
 89 |   st_as_sf %>% 
 90 |   {ggplot(data = .) + 
 91 |       geom_sf(aes(fill = num_traj_points_height_1, color = num_traj_points_height_1)) + 
 92 |       scale_color_gradientn(colors = c(cmocean::cmocean(name = "oxy", start = 0.2, end = 0.79, direction = -1)(20),
 93 |                                        cmocean::cmocean(name = "oxy", start = 0, end = 0.19, direction = -1)(20)),
 94 |                             name = "trajectory points\nwithin 50km",
 95 |                             aesthetics = c("fill", "color"), 
 96 |                             rescaler = mid_rescaler(50),
 97 |                             guide = guide_colorbar(title.theme = element_text(size = 7),
 98 |                                                    label.theme = element_text(size = 6.5), 
 99 |                                                    barheight = 4, barwidth = 1)) +
100 |       theme_void() + 
101 |       theme(legend.position = c(0.93, 0.35))} %>% 
102 |   ggsave(filename = file.path(path_figures, "figureS02e.png"), 
103 |          ., height = 3.5, width = 5)
104 | 
105 | #-------------------------------------------------------------------------------
106 | # Final Gridded Smoke Days
107 | # Get filled smoke day grid
108 | filled_smoke_day = readRDS(file.path(path_data, "3_intermediate", "all_smoke_days_incl_cloudy.rds"))
109 | filled_smoke_day = filled_smoke_day %>% 
110 |   filter(date == ymd(chosen_date)) %>% 
111 |   select(id_grid = grid_id_10km, date, smoke_day)
112 | filled_smoke_day = project_grid %>% 
113 |   left_join(filled_smoke_day, by = "id_grid") %>% 
114 |   replace_na(list(date = ymd(chosen_date), smoke_day = 0)) %>% 
115 |   mutate(smoke_day = factor(smoke_day, levels = 0:1))
116 | 
117 | # Plot gridded smoke day
118 | p = ggplot(filled_smoke_day, aes(fill = smoke_day)) + 
119 |   geom_sf(color = NA) +  
120 |   theme_void() + 
121 |   scale_fill_manual(values = c("gray50", "orange")) + 
122 |   labs(fill = "Smoke Day")
123 | 
124 | # Save
125 | ggsave(file.path(path_figures, "figureS02f.png"), 
126 |        plot = p, width = 12, height = 8)
127 | 


--------------------------------------------------------------------------------
/scripts/main/05_03_figureS03.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Plots supplemental figure 3.
 8 | # ------------------------------------------------------------------------------
 9 | # trends in plumes
10 | plumes <- readRDS(file.path(path_data, "smoke", "smoke_plumes_sfdf_20050805_20220711.RDS"))
11 | plumes <- readRDS(file.path(path_data, "smoke", "smoke_plumes_sfdf.RDS"))
12 | 
13 | plume_area <- st_area(plumes)
14 | 
15 | plumes %<>% st_drop_geometry() %>% 
16 |   cbind(area = unclass(plume_area)/1e6)
17 | 
18 | plumes %>% 
19 |   filter(area > 0) %>% 
20 |   mutate(year_month = substr(date, 1, 6),
21 |          date = as.Date(date, format = "%Y%m%d"), 
22 |          month = lubridate::month(date), 
23 |          year = lubridate::year(date)) %>% 
24 |   filter(year >= 2006 & year <= 2020) %>%
25 |   group_by(year_month) %>% 
26 |   summarise(start_date = min(date),
27 |             qminimum = min(area), 
28 |             q5 = quantile(area, 0.05), 
29 |             q10 = quantile(area, 0.10), 
30 |             .groups = "drop") %>%
31 |   pivot_longer(starts_with("q"), names_prefix = "q") %>% 
32 |   mutate(name = ifelse(name %in% c("minimum", "max"), name, paste0(name, "th percentile"))) %>% 
33 |   {ggplot() + 
34 |       geom_vline(xintercept = as.Date("2017-12-28"),
35 |                  color = "grey") + 
36 |       geom_label(aes(x = as.Date("2017-12-28"),
37 |                     y = pull(., value) %>% max,
38 |                     label = "GOES-16"), 
39 |                 nudge_y = -5,
40 |                 nudge_x = -65,
41 |                 vjust = 1,
42 |                 fill = "white",
43 |                 color = "grey", 
44 |                 label.size = NA) + 
45 |       geom_vline(xintercept = as.Date("2019-02-28"), 
46 |                  color = "grey") +
47 |       geom_label(aes(x = as.Date("2019-02-28"),
48 |                     y = Inf,
49 |                     label = "GOES-17"), 
50 |                 vjust = 1, 
51 |                 nudge_y = 20,
52 |                 nudge_x = 65,
53 |                 fill = "white",
54 |                 color = "grey", 
55 |                 label.size = NA) + 
56 |       geom_line(data = .,
57 |                 aes(x = start_date,
58 |                     y = value,
59 |                     group = name,
60 |                     color = name)) +
61 |       geom_text(data = group_by(., name) %>% 
62 |                   summarise(mean_val = last(value)) %>% 
63 |                   cbind(start_date = as.Date("2021-01-01")), 
64 |                 aes(x = start_date, 
65 |                     y = mean_val, 
66 |                     label = name, 
67 |                     color = name), 
68 |                 hjust = 0) +
69 |       theme_classic() + 
70 |       coord_cartesian(clip = "off") + 
71 |       scale_color_manual(values = c("black", "darkgreen", "purple")) + 
72 |       xlab("Date") + ylab(bquote("Plume size "(km^2))) +
73 |       theme(legend.position = "none", 
74 |             plot.margin = unit(c(0.75, 2.5, 0.5, 0.25), "cm"))} %>% 
75 |   ggsave(file.path(path_figures, "figureS03.png"), ., 
76 |          width = 6, height = 4)
77 | 


--------------------------------------------------------------------------------
/scripts/main/05_04_figureS04.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Jessica Li
  7 | # Plots supplemental figure 4.
  8 | # ------------------------------------------------------------------------------
  9 | max_nchar = 35
 10 | 
 11 | start_date = "20060101"
 12 | end_date = "20201231"
 13 | all_dates = seq.Date(ymd(start_date), ymd(end_date), by = "day")
 14 | duration_days = 6
 15 | 
 16 | # Fire points
 17 | fire_dates_not_online = readRDS(file.path(path_data, "fire", "fire_dates_not_online.rds"))
 18 | fire_dates_empty_data = readRDS(file.path(path_data, "fire", "fire_dates_empty_data.rds"))
 19 | fire_dates_clusters_too_small = readRDS(file.path(path_data, "fire", "fire_dates_clusters_too_small.rds"))
 20 | 
 21 | # HYSPLIT points
 22 | hysplit_dates_without_start_duration = seq.Date(ymd(start_date), ymd("20060418"), by = "day")
 23 | hysplit_dates_not_online = ymd(readRDS(file.path(path_data, "HYSPLIT", "miscellaneous", "hysplit_dates_not_online.rds")))
 24 | hysplit_dates_gis_corrupt = ymd(readRDS(file.path(path_data, "HYSPLIT", "miscellaneous", "hysplit_dates_gis_corrupt.rds")))
 25 | hysplit_dates_oddly_empty = ymd(readRDS(file.path(path_data, "HYSPLIT", "miscellaneous", "hysplit_dates_oddly_empty.rds")))
 26 | hysplit_na_dates_original = c(hysplit_dates_without_start_duration, hysplit_dates_not_online, hysplit_dates_gis_corrupt, hysplit_dates_oddly_empty)
 27 | hysplit_na_dates = unique(unlist(lapply(hysplit_na_dates_original, function(x) seq.Date(x, x + days(duration_days), by = "day"))))
 28 | class(hysplit_na_dates) = "Date"
 29 | hysplit_na_dates_affected = setdiff(hysplit_na_dates, hysplit_na_dates_original)
 30 | class(hysplit_na_dates_affected) = "Date"
 31 | hysplit_dates_without_start_duration = setdiff(hysplit_dates_without_start_duration, hysplit_dates_not_online)
 32 | class(hysplit_dates_without_start_duration) = "Date"
 33 | 
 34 | # Smoke plumes
 35 | smoke_dates_not_online = ymd(readRDS(file.path(path_data, "smoke", "smoke_dates_not_online.rds")))
 36 | smoke_dates_empty_data = ymd(readRDS(file.path(path_data, "smoke", "smoke_dates_empty_data.rds")))
 37 | smoke_dates_repaired_geometry = ymd(readRDS(file.path(path_data, "smoke", "smoke_dates_repaired_geometry.rds")))
 38 | 
 39 | # Datasets
 40 | dataset = create_node_df(
 41 |   n = 3,
 42 |   label = c("Fire points", "HYSPLIT points", "Smoke plumes")
 43 | )
 44 | 
 45 | # Reasons for missingness
 46 | reasons = create_node_df(
 47 |   n = 9,
 48 |   label = str_wrap(c("Not online", "Empty data", "Clusters too small", 
 49 |                      "No start time nor duration", "Not online", 
 50 |                      "Corrupt shapefile", "Oddly empty", "Affected by previous missing date", 
 51 |                      "Not online"), max_nchar) %>% 
 52 |     paste0("\n", 
 53 |            sapply(list(fire_dates_not_online, fire_dates_empty_data, fire_dates_clusters_too_small,
 54 |                        hysplit_dates_without_start_duration, hysplit_dates_not_online,
 55 |                        hysplit_dates_gis_corrupt, hysplit_dates_oddly_empty, hysplit_na_dates_affected,
 56 |                        smoke_dates_not_online), length))
 57 | )
 58 | 
 59 | # How missingness gets treated
 60 | treatments = create_node_df(
 61 |   n = 4,
 62 |   label = c("Set distance = NA,\narea = 0, num_points = 0",
 63 |             "Exclude from training;\nfill using temporal NN for prediction",
 64 |             "Treat trajectory point counts as MIA",
 65 |             "Exclude from medians and training;\nfill using temporal NN for prediction") %>% 
 66 |     paste0("\n", 
 67 |            c(length(setdiff(fire_dates_not_online, smoke_dates_not_online)) + length(fire_dates_empty_data) + length(fire_dates_clusters_too_small), 
 68 |              length(intersect(fire_dates_not_online, smoke_dates_not_online)),
 69 |              sum(sapply(list(hysplit_dates_without_start_duration, hysplit_dates_not_online, 
 70 |                              hysplit_dates_gis_corrupt, hysplit_dates_oddly_empty, 
 71 |                              hysplit_na_dates_affected),
 72 |                         length)),
 73 |              length(smoke_dates_not_online)))
 74 | )
 75 | nodes = combine_ndfs(dataset, reasons, treatments)
 76 | 
 77 | # Connect nodes
 78 | edges = create_edge_df(
 79 |   from = c(1, 1, 1, 2, 2, 2, 2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12),
 80 |   to = c(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13, 13, 15, 15, 15, 15, 15, 16)
 81 | )
 82 | 
 83 | # Make graph
 84 | g = create_graph(
 85 |   nodes_df = nodes,
 86 |   edges_df = edges,
 87 |   attr_theme = "lr"
 88 | ) %>% 
 89 |   set_node_attrs("shape", "rectangle") %>% 
 90 |   set_node_attrs("fillcolor", "white") %>% 
 91 |   set_node_attrs("fixedsize", F) %>% 
 92 |   set_node_attrs("fontcolor", "black") %>% 
 93 |   set_node_attrs("color", "black") %>% 
 94 |   set_edge_attrs("color", "black")
 95 | 
 96 | # Preview
 97 | render_graph(g)
 98 | 
 99 | # Save
100 | export_graph(g, file_name = file.path(path_figures, "figureS04.pdf"), file_type = "pdf")
101 | 


--------------------------------------------------------------------------------
/scripts/main/05_05_figureS05.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Plots supplemental figure 5.
 8 | # ------------------------------------------------------------------------------
 9 | plumes <- readRDS(file.path(path_data, "3_intermediate", "all_smoke_days.rds"))
10 | smoke <- readRDS(file.path(path_data, "3_intermediate", "all_smoke_days_incl_cloudy.rds"))
11 | epa_data <- readRDS(file.path(path_data, "3_intermediate", "station_smokePM.rds")) %>% ungroup
12 | epa_ll <- st_read(file.path(path_data, "epa_station_locations")) %>% 
13 |   rename(grid_id_10km = grid_10km)
14 | 
15 | counties <- tigris::counties(cb = TRUE)
16 | states <- tigris::states(cb = TRUE)
17 | 
18 | # select stations to show timeseries for
19 | set.seed(20202)
20 | station_set <- epa_data %>%
21 |   filter(date >= as.Date("2020-01-01")) %>% 
22 |   group_by(id) %>% 
23 |   summarise(n = n(), 
24 |             max_PM = max(pm25)) %>% 
25 |   filter(n > 100, max_PM > 50) %>% 
26 |   slice_sample(n = 4) %>% 
27 |   pull(id)
28 | 
29 | # identify the counties that the stations fall in for labeling 
30 | station_loc <- cbind(id = epa_ll %>% 
31 |                        filter(id %in% station_set) %>% 
32 |                        pull(id), 
33 |                      counties[st_intersects(epa_ll %>% 
34 |                                               filter(id %in% station_set), 
35 |                                             counties %>% st_transform(st_crs(epa_ll))) %>% 
36 |                                 unlist, ]) %>% 
37 |   left_join(states %>% select(STATEFP, STUSPS) %>% st_drop_geometry) %>% 
38 |   select(id, NAME, STUSPS)
39 | 
40 | epa_data %>% 
41 |   filter(id %in% station_set) %>% 
42 |   filter(date >= as.Date("2020-01-01")) %>% 
43 |   left_join(plumes %>% 
44 |               rename(plume = smoke_day) %>% 
45 |               filter(grid_id_10km %in% unique(epa_ll$grid_id_10km))) %>% 
46 |   mutate(smoke_def = case_when(smoke_day == 1 & plume == 1 ~ "plume", 
47 |                                smoke_day == 1 & is.na(plume) ~ "hysplit + aod", 
48 |                                smoke_day == 0 ~ as.character(NA))) %>% 
49 |   left_join(station_loc %>% st_drop_geometry()) %>% 
50 |   mutate(panel = paste0(NAME, " County, ", STUSPS)) %>% 
51 |   group_by(id) %>% 
52 |   mutate(max_pm = max(pm25)) %>%
53 |   {ggplot(data = ., aes(x = date, y = pm25)) + 
54 |       geom_line() +
55 |       geom_point(data = filter(., smoke_day == 1), 
56 |                  mapping = aes(x = date, y = -0.05*max_pm, 
57 |                                color = smoke_def),
58 |                  size = 0.9) + 
59 |       geom_point(data = filter(., panel == "Douglas County, NE" & date == as.Date("2020-07-04")), 
60 |                  color = "blue", size = 0.9) + 
61 |       scale_color_manual(name = "", values = c("grey", "#BD3106")) +
62 |       facet_wrap(~panel, ncol = 1, scales = "free_y") + 
63 |       xlab("") + ylab(expression(PM[2.5])) +
64 |       theme_classic()} %>% 
65 |   ggsave(file.path(path_figures, "figureS05.png"), ., 
66 |          width = 5, height = 6)
67 | 


--------------------------------------------------------------------------------
/scripts/main/05_06_figureS06.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Plots supplemental figure 6.
  8 | # ------------------------------------------------------------------------------
  9 | # restart using the data from Kara and Jen 
 10 | dt <- read.csv(file.path(path_data, "IMPROVE", "IMPROVE_CSN_smokeday_data.csv")) %>% 
 11 |   mutate(date = as.Date(paste0(year, "-", month, "-", day), 
 12 |                         format = "%Y-%m-%d")) %>% 
 13 |   filter(date >= as.Date("2006-01-01"))
 14 | 
 15 | species <- c('PM2.5', 'OC','EC', 'SO4', 'Dust') 
 16 | 
 17 | # calculate pm25 anomalies and species anomalies, 
 18 | # how much of the pm25 anomalies are explained/accounted for by the OC anomalies? 
 19 | species_anom <- dt %>% 
 20 |   select(Network, Site, date, all_of(species), smoke_day) %>% 
 21 |   pivot_longer(all_of(species)) %>% 
 22 |   filter(!is.na(value)) %>% 
 23 |   filter(!is.na(smoke_day)) %>%
 24 |   mutate(month = lubridate::month(date), 
 25 |          year = lubridate::year(date), 
 26 |          name = gsub("\\.", "", name)) %>%
 27 |   unite(net_site_species, Network, Site, name) %>% 
 28 |   {left_join(., 
 29 |              nonsmoke_medians(., value, smoke_day, net_site_species, month, year), 
 30 |              by = c("net_site_species", "month", "year"))} %>% 
 31 |   mutate(anom = value - value_med_3yr) %>% 
 32 |   separate(net_site_species, into = c("Network", "Site", "Species"), sep = "_") 
 33 | 
 34 | site_nonsmokePM <- dt %>% 
 35 |   filter(!is.na(smoke_day)) %>%
 36 |   group_by(Network, Site, State, Latitude, Longitude, smoke_day) %>% 
 37 |   summarise(baselinePM = median(`PM2.5`, na.rm = T), 
 38 |             n = n(),
 39 |             .groups = "drop") %>% 
 40 |   pivot_wider(values_from = c(baselinePM, n), 
 41 |               names_from = smoke_day, 
 42 |               names_prefix  = "smokeday") %>% 
 43 |   filter(!is.na(n_smokeday1)) %>% 
 44 |   select(-starts_with("n_smokeday"), -baselinePM_smokeday1) %>% 
 45 |   rename(baselinePM = baselinePM_smokeday0) %>% 
 46 |   filter(State %in% c("HI", "AK") == F)
 47 | 
 48 | pct_anom <- species_anom %>%
 49 |   select(-nobs_3yr) %>%
 50 |   filter(Site != "240239000") %>% # has duplicate obs
 51 |   pivot_wider(id_cols = c(Network, Site, date, smoke_day), 
 52 |               values_from = c(anom, value, value_med_3yr), 
 53 |               names_from = Species) %>% 
 54 |   mutate(#pct_anom_NH4 = anom_NH4/anom_PM25,
 55 |     pct_anom_SO4 = anom_SO4/anom_PM25,
 56 |     pct_anom_OC = anom_OC/anom_PM25,
 57 |     pct_anom_EC = anom_EC/anom_PM25,
 58 |     pct_anom_dust = anom_Dust/anom_PM25) %>%
 59 |   filter(!is.na(value_med_3yr_PM25)) %>%  
 60 |   select(starts_with("pct_anom"), anom_PM25, smoke_day, date, Site, Network) %>% 
 61 |   left_join(site_nonsmokePM %>% mutate(Site = as.character(Site))) %>% 
 62 |   mutate(baselinePM = ifelse(baselinePM > 8, "High", "Low")) %>%
 63 |   pivot_longer(starts_with("pct_anom")) %>% 
 64 |   mutate(day = case_when(smoke_day == 0  ~ "non-smoke day", 
 65 |                          smoke_day == 1 & anom_PM25 <= 0 ~ "smoke day,\nanom < 0", 
 66 |                          smoke_day == 1 & anom_PM25 <= 25 ~ "smoke day,\n0 < anom <= 25 ", 
 67 |                          smoke_day == 1 & anom_PM25 > 25 ~ "smoke day,\n25 < anom"),
 68 |          day = factor(day, levels = c("non-smoke day", "smoke day,\nanom < 0",
 69 |                                       "smoke day,\n0 < anom <= 25 ", "smoke day,\n25 < anom"),
 70 |                       ordered = T),
 71 |          name = gsub("pct_anom_", "", name))
 72 | 
 73 | pct_anom %>% 
 74 |   filter(anom_PM25 != 0) %>%
 75 |   filter(!is.na(baselinePM)) %>%
 76 |   filter(day != "smoke day,\nanom < 0") %>%
 77 |   filter(!is.na(value)) %>% 
 78 |   mutate(name = recode_factor(name, 
 79 |                               "dust" = "Dust", 
 80 |                               "EC" = "Elemental~carbon", 
 81 |                               "SO4" = "SO[4]", 
 82 |                               "OC" = "Organic~carbon", 
 83 |                               .ordered = T)) %>%
 84 |   {ggplot(data =., 
 85 |           aes(x = value,
 86 |               y = ..ndensity..,
 87 |               group = day,
 88 |               color = day,
 89 |               fill = day)) +
 90 |       geom_density(alpha = 0.1, alpha = 0.5, adjust = 2) +
 91 |       theme_classic() +
 92 |       facet_grid(paste0(baselinePM, "~baseline~PM[2.5]") ~ name,
 93 |                  scales = "free",
 94 |                  labeller = label_parsed) +
 95 |       geom_vline(xintercept = 0, color = "grey30", linetype = "dotted") +
 96 |       scale_x_continuous(limits = c(-0.5, 1), # results in 46580/635396 (~7%) obs being dropped
 97 |                          labels = scales::percent) +
 98 |       scale_color_manual(values = c("black", "blue", "red"),
 99 |                          aesthetics = c("color", "fill")) +
100 |       labs(color = "",
101 |            fill = "",
102 |            y = "density") + 
103 |       xlab(bquote(Percent~PM[2.5]~anomaly)) + 
104 |       theme(panel.spacing.x = unit(1, "lines"), 
105 |             strip.text = element_text(size = 12),
106 |             strip.background = element_blank())} %>% 
107 |   ggsave(file.path(path_figures, "figureS06b.png"), ., 
108 |          width = 10, height = 6)
109 | 
110 | states <- tigris::states(cb = TRUE) %>% 
111 |   st_transform(crs = 4326)
112 | 
113 | site_nonsmokePM %>% 
114 |   filter(!is.na(baselinePM)) %>% 
115 |   st_as_sf(coords = c("Longitude", "Latitude"))  %>%
116 |   st_set_crs(4326) %>%
117 |   mutate(baselinePM_class = ifelse(baselinePM > 8, "high", "low"), 
118 |          baselinePM = pmin(baselinePM, 15)) %>%  
119 |   {ggplot(data = ., 
120 |           aes(color = baselinePM,
121 |               fill = baselinePM,
122 |               shape = Network)) + 
123 |       geom_sf(data = states %>% filter(STATEFP %in% nonContig_stateFIPS == F), 
124 |               fill = "white", inherit.aes = FALSE) +
125 |       geom_sf() + 
126 |       scale_color_gradientn(colors = c(cmocean::cmocean(name = "balance", start = 0, end = 0.4)(20),
127 |                                        cmocean::cmocean(name = "balance", start = 0.6, end = 1)(20)),
128 |                             name = "",
129 |                             aesthetics = c("fill", "color"), 
130 |                             rescaler = mid_rescaler(8), 
131 |                             breaks = c(5, 10, 15), 
132 |                             labels = c("5", "10", ">15"),
133 |                             guide = guide_colorbar(label.theme = element_text(size = 7), 
134 |                                                    barheight = 4, barwidth = 1)) + 
135 |       theme_void() + 
136 |       theme(legend.position = c(0.93, 0.25))} %>% 
137 |   ggsave(file.path(path_figures, "figureS06a.png"), 
138 |          ., width = 6, height = 4)
139 | 


--------------------------------------------------------------------------------
/scripts/main/05_07_figureS07.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Plots supplemental figure 7.
 8 | # ------------------------------------------------------------------------------
 9 | aod_df <- readRDS(file.path(path_data, "4_clean", "aod_training.rds"))
10 | grid_orig <- st_read(file.path(path_data, "1_grids", "1km_aod_grid_wgs84_training")) %>% 
11 |   mutate(train_cells = (grid_id %in% unique(aod_df$grid_id_1km)))
12 | 
13 | states <- tigris::states(cb = TRUE) %>% 
14 |   filter(!(STATEFP %in% nonContig_stateFIPS))
15 | 
16 | {ggplot() + 
17 |     geom_sf(data = states, 
18 |             color = "grey30", 
19 |             lwd = 0.1) + 
20 |     geom_sf(data = grid_orig, 
21 |             aes(color = train_cells, 
22 |                 fill = train_cells)) + 
23 |     scale_color_manual(name = "", 
24 |                        values = c("black", "blue"), 
25 |                        labels = c("original 5,000 cells", 
26 |                                   "training locations"), 
27 |                        aesthetics = c("color", "fill")) + 
28 |     theme_void() + 
29 |     theme(legend.position = c(0.75, 0.3), 
30 |           legend.justification = "left", 
31 |           legend.key.size = unit(0.4, 'cm'))} %>% 
32 |   ggsave(file.path(path_figures, "figureS07.png"), 
33 |          ., width = 5, height = 3.5)
34 | 


--------------------------------------------------------------------------------
/scripts/main/05_08_figureS08.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Plots supplemental figure 8.
  8 | # ------------------------------------------------------------------------------
  9 | crosswalk <- readRDS(file.path(path_data, "1_grids", "grid_crosswalk_1km_10km.rds"))
 10 | 
 11 | aod_train <- readRDS(file.path(path_data, "4_clean", "aod_training.rds"))
 12 | 
 13 | aod_train %<>% select(starts_with("grid"), date, month, aod_anom, lat, lon, fold)
 14 | 
 15 | aod_train_cells <- aod_train$grid_id_1km %>% unique
 16 | 
 17 | # use the extracted aod that we didn't train on to test with
 18 | smoke_missing_dates = readRDS(file.path(path_data, "smoke", "smoke_dates_not_online.rds"))
 19 | smoke_missing_dates = ymd(smoke_missing_dates)
 20 | 
 21 | smoke_days <- readRDS(file.path(path_data, "3_intermediate", "all_smoke_days_incl_cloudy.rds"))
 22 | 
 23 | aod_null_value <- -999999
 24 | 
 25 | aod_test <- map_dfr(list.files(file.path(path_data, "2_from_EE", "maiac_AOD_training"), full.names = TRUE), 
 26 |                     function(x) read_csv(x) %>% filter(!(grid_id %in% aod_train_cells))) %>% 
 27 |   transmute(grid_id_1km = grid_id, 
 28 |             date = as.Date(as.character(start_date), format = "%Y%m%d"),
 29 |             aod = median) %>% 
 30 |   filter(date >= as.Date("2006-01-01")) %>%
 31 |   mutate(month = lubridate::month(date), 
 32 |          year = lubridate::year(date)) %>% 
 33 |   left_join(crosswalk, by = "grid_id_1km") %>%
 34 |   left_join(smoke_days %>% select(-note_smoke_date_not_online), 
 35 |             by = c("date", "grid_id_10km")) %>%
 36 |   replace_na(list(smoke_day = 0)) %>% 
 37 |   filter(aod != aod_null_value) %>%
 38 |   {left_join(., 
 39 |              nonsmoke_medians(filter(., !(date %in% smoke_missing_dates)), 
 40 |                               aod, smoke_day, grid_id_1km, month, year), 
 41 |              by = c("grid_id_1km", "month", "year"))} %>% 
 42 |   mutate(aod_anom = aod - aod_med_3yr) 
 43 | 
 44 | aod_test_cells <- aod_test$grid_id_1km %>% unique()
 45 | 
 46 | all_aod_cells <- c(aod_test_cells, aod_train_cells)
 47 | 
 48 | aod_preds <- readRDS(file.path(path_output, "smokePM", "model", "test_aod_preds.rds"))
 49 | # predicted vs observed, in and out of sample 
 50 | aod_comp<- rbind(aod_train %>% select(grid_id_1km, date, aod_anom) %>% mutate(test_train = "train"), 
 51 |                  aod_test %>% filter(smoke_day == 1) %>% select(grid_id_1km, date, aod_anom) %>% mutate(test_train = "test")) %>% 
 52 |   left_join(aod_preds)
 53 | 
 54 | aod_comp %>% 
 55 |   arrange(test_train) %>%
 56 |   {ggplot(data = ., aes(x= aod_anom, y = aod_anom_pred, color = test_train)) + 
 57 |   geom_abline(slope = 1, intercept = 0, color = "red") + 
 58 |   geom_point(alpha = 0.3) + 
 59 |   ggpubr::stat_cor(aes(label = ..rr.label..), digits = 4,
 60 |                    show_guide = FALSE) +
 61 |   scale_color_manual(values = c("black", "blue"), name = "") +
 62 |   xlab("observed AOD anomalies") + ylab("predicted AOD anomalies") + 
 63 |   theme_classic()} %>% 
 64 |   ggsave(file.path(path_figures, "figureS08a.png"), 
 65 |          ., width = 6, height = 6)
 66 | 
 67 | # variable importance 
 68 | var_import <- readRDS(file.path(path_output, "smokePM", "model", "AOD_var_importance.rds"))
 69 | var_import$variable_importance %>% 
 70 |   left_join(data.frame(feat_name = var_import$feature_names) %>% 
 71 |               mutate(Feature = paste0("f", (1:n()) - 1)))  %>% 
 72 |   mutate(group = case_when(grepl("aod_anom|aot_anom", feat_name) ~ "aerosols",
 73 |                            grepl("fire", feat_name) ~ "fire", 
 74 |                            grepl("traj", feat_name) ~ "hysplit",
 75 |                            feat_name %in% c("lat", "lon") ~ "cross sectional",
 76 |                            feat_name %in% c("dewpoint_temp_2m", "temp_2m", "wind_u", "wind_v", "precip", "pbl_mean",
 77 |                                           "surface_pressure", "sea_level_pressure") ~ "meteorology",
 78 |                            T ~ as.character(NA))) %>% 
 79 |   mutate(feat_name = factor(feat_name, levels = rev(.$feat_name), ordered = T)) %>% 
 80 |   mutate(feat_name = recode(feat_name, 
 81 |                             "lon" = "Longitude", 
 82 |                             "lat" = "Latitude",
 83 |                             "closest_fire_num_points" = "Points in closest fire cluster",
 84 |                             "dewpoint_temp_2m" = "Dewpoint temperature", 
 85 |                             "fire_dist_km" = "Distance to closest fire cluster", 
 86 |                             "closest_fire_area" = "Area of closest fire cluster",
 87 |                             "sea_level_pressure" = "Sea level pressure",
 88 |                             "wind_v" = "Wind speed (eastward)", 
 89 |                             "wind_u" = "Wind speed (westward)", 
 90 |                             "precip" = "Total precipitation", 
 91 |                             "pbl_mean" = "Planetary Boundary Layer (mean)",
 92 |                             "temp_2m" = "Temperature", 
 93 |                             "aot_anom" = "AOT anomaly (current)",
 94 |                             "aot_anom_lag1" = "AOT anomaly (1-day lag)",
 95 |                             "aot_anom_lag2" = "AOT anomaly (2-day lag)")) %>%
 96 |   magrittr::extract(1:15,) %>%
 97 |   {ggplot(data = ., 
 98 |           aes(x = Gain, y = feat_name)) + 
 99 |       geom_segment(aes(yend = feat_name, color = group), xend = 0, lwd = 6) + 
100 |       xlim(0, NA) + 
101 |       scale_color_manual(values = MetBrewer::met.brewer("VanGogh2", 15)[c(15, 3, 7, 9)],
102 |                          name = "") +
103 |       theme_classic() + 
104 |       ylab("") + 
105 |       theme(legend.position = c(0.7, 0.3))} %>% 
106 |   ggsave(file.path(path_figures, "figureS08b.png"), ., 
107 |          width = 4, height = 4)
108 | 


--------------------------------------------------------------------------------
/scripts/main/05_09_figureS09.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Plots supplemental figure 9.
 8 | # ------------------------------------------------------------------------------
 9 | grid <- read_sf(file.path(path_data, "1_grids", "10km_grid"))
10 | pop <- list.files(file.path(path_data, "2_from_EE", "populationDensity_10km_subgrid"), 
11 |                   full.names = T) %>% 
12 |   purrr:::map_dfr(read.csv)
13 | 
14 | grid %<>% left_join(pop)
15 | 
16 | # smoke PM data 
17 | epa_stations <- readRDS(file.path(path_data, "3_intermediate", "station_smokePM.rds")) %>% 
18 |   pull(id) %>% 
19 |   unique
20 | 
21 | epa_ll <- read_sf(file.path(path_data, "epa_station_locations")) %>% 
22 |   st_transform(st_crs(grid)) %>% 
23 |   filter(id %in% epa_stations)
24 | 
25 | grid_station_dists <- st_distance(st_centroid(grid), epa_ll, by_element = FALSE)
26 | 
27 | grid$centroid_station_dist <- apply(grid_station_dists, 1, min)
28 | 
29 | median(grid$centroid_station_dist)/1e3
30 | weighted.mean(grid$centroid_station_dist, grid$mean)/1e3
31 | 
32 | # Plot
33 | pdf(file.path(path_figures, "figureS09.pdf"), 
34 |     width = 10, height = 5)
35 | par(mar = c(5, 5, 6, 2) + 0.1)
36 | hist(grid$centroid_station_dist/1e3, breaks = seq(0, 300, 10),
37 |      xlab = "minimum distance (km)", ylab = "",
38 |      main = "", las = 1)
39 | title(ylab = "Frequency", line = 3.5)
40 | title(main = "Distance from prediction grid centroid to EPA station", line = 4)
41 | 50 %>%
42 |   {abline(v = ., col = "black", lwd = 3)
43 |     text(x = ., y = 15200, label = "merra2\nresolution", col = "black", xpd = T)}
44 | 
45 | median(grid$centroid_station_dist/1e3) %>% 
46 |   {abline(v = ., col = "red", lty = "dashed")
47 |     text(x = .-4, y = 13800, label = "median",
48 |          col = "red", xpd = T)}
49 | 
50 | mean(grid$centroid_station_dist/1e3) %>% 
51 |   {abline(v = ., col = "blue")
52 |     text(x = .+4, y = 14000, label = "mean", col = "blue", xpd = T)}
53 | 
54 | weighted.mean(grid$centroid_station_dist/1e3, grid$mean) %>% 
55 |   {abline(v = ., col = "darkgreen")
56 |     text(x = .-8, y = 14000, label = "pop-weighted\nmean", 
57 |          col = "darkgreen", xpd = T)}
58 | 
59 | dev.off()
60 | 


--------------------------------------------------------------------------------
/scripts/main/05_10_figureS10.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Plots supplemental figure 10.
 8 | # ------------------------------------------------------------------------------
 9 | monitors <- st_read(file.path(path_data, "epa_station_locations")) 
10 | states <- tigris::states() %>% st_transform(st_crs(monitors))
11 | 
12 | monitors$state <- states$STUSPS[st_intersects(monitors, states) %>% unlist]
13 | 
14 | smokePM_data <- readRDS(file.path(path_data, "4_clean", "smokePM_training.rds")) %>% 
15 |   select(id, date, smokePM, fold)
16 | 
17 | cv_preds <- list.files(file.path(path_output, "smokePM", "model"), 
18 |                        pattern = "pred_fold", 
19 |                        full.names = TRUE) %>% 
20 |   purrr::map_dfr(function(x){
21 |     readRDS(x) %>%
22 |       mutate(test_fold = gsub("^smokePM_pred_fold|_drop|-aod_anom_pred|-traj_points|\\.rds$", "", basename(x)), 
23 |              drop_var = gsub("^smokePM_pred_fold[0-4]_drop-?|\\.rds$", "", basename(x))) %>%
24 |       return()
25 |   })
26 | 
27 | model_metrics <- cv_preds %>%
28 |   mutate(smokePM_pred = pmax(0, smokePM_pred)) %>%
29 |   left_join(smokePM_data, by = c("id", "date", "fold")) %>% 
30 |   mutate(test = fold == test_fold) %>%
31 |   eval_metrics(models = drop_var, test_tune = test, 
32 |                obs = smokePM, pred = smokePM_pred, loc_id = id) %>%
33 |   pivot_longer(contains("rank") | contains("value")) %>% 
34 |   separate(name, into = c("metric", "rank_value"), sep = "_(?=rank|value)") %>% 
35 |   pivot_wider(values_from = value, names_from = rank_value) %>%
36 |   separate(metric, into = c("metric", "subset"), sep = "_", extra = "merge") 
37 | 
38 | # comparison between full model, without AOD preds, and without HYSPLIT
39 | model_metrics %>%
40 |   filter(grepl("day", subset)) %>% 
41 |   filter(metric %in% c("rmse", "r2", "wr2")) %>% 
42 |   filter(test == TRUE) %>% 
43 |   mutate(subset = recode_factor(subset, 
44 |                                 "day" = "all days", 
45 |                                 "day_sub50" = "days with\n< 50ug",
46 |                                 "day_over50" = "days with\n> 50ug", 
47 |                                 .ordered = T), 
48 |          metric = recode_factor(metric, 
49 |                                 "r2" = "R^2", 
50 |                                 "wr2" = "within~R^2", 
51 |                                 "rmse" = "RMSE",
52 |                                 .ordered = TRUE),
53 |          drop_var = case_when(drop_var == "" ~ "full model", 
54 |                               drop_var =="aod_anom_pred" ~ "no AOD predictions", 
55 |                               drop_var =="traj_points" ~ "no HYSPLIT"),
56 |          drop_var = factor(drop_var, 
57 |                            levels = c("full model", "no HYSPLIT", "no AOD predictions"), 
58 |                            ordered = TRUE)) %>% 
59 |   {ggplot(data = ., 
60 |           aes(x = subset, y = value, color = drop_var,
61 |               group = drop_var)) + 
62 |       geom_point() + 
63 |       geom_line() + 
64 |       scale_x_discrete(expand = expansion(mult = 0.04)) + 
65 |       ylim(0, NA) + 
66 |       facet_wrap(~metric, scales = "free", strip.position = "left",
67 |                  labeller = label_parsed) + 
68 |       theme_classic() +
69 |       theme(strip.placement = "outside",
70 |             strip.background = element_blank(),
71 |             panel.spacing = unit(-0.1, "lines"),
72 |             strip.switch.pad.wrap = unit(-0.25, "lines")) + 
73 |       scale_color_manual(name = "", values = MetBrewer::met.brewer("Homer2", 5)[c(1,2,4)]) + 
74 |       xlab("") + ylab("")} %>% 
75 |   ggsave(file.path(path_figures, "figureS10.png"), ., 
76 |          width = 8, height= 3)
77 | 
78 | # change in model r2 from aod and hysplit inclusion
79 | model_metrics %>%
80 |   filter(subset == "day" & metric == "r2"  & test == TRUE) %>% 
81 |   mutate(diff = max(value) - value) %>% View
82 | 


--------------------------------------------------------------------------------
/scripts/main/05_11_figureS11.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Plots station MODIS landcover.
  8 | # ------------------------------------------------------------------------------
  9 | # Load smokePM predictions from held-out folds
 10 | smokePM_preds <- list.files(file.path(path_output, "smokePM", "model"),
 11 |                             pattern = "smokePM_pred",
 12 |                             full.names = TRUE) %>%
 13 |   grep("drop\\.", ., value = TRUE) %>%
 14 |   map_dfr(function(x){
 15 |     readRDS(x) %>% mutate(test_fold = as.numeric(
 16 |       substr(gsub("smokePM_pred_fold", "", basename(x)), 1, 1)))
 17 |   }) %>%
 18 |   mutate(smokePM_pred = pmax(smokePM_pred, 0))
 19 | 
 20 | smokePM_data <- readRDS(file.path(path_data, "4_clean", "smokePM_training.rds")) %>% 
 21 |   select(id, date, smokePM, water:wetlands)
 22 | 
 23 | epa_ll <- st_read(file.path(path_data, "epa_station_locations")) %>% 
 24 |   rename(grid_id_10km = grid_10km)
 25 | 
 26 | epa_data <- readRDS(file.path(path_data, "3_intermediate", "station_smokePM.rds")) 
 27 | 
 28 | states <- tigris::states(cb = TRUE)
 29 | 
 30 | station_performance <- smokePM_preds %>% 
 31 |   filter(fold == test_fold) %>% 
 32 |   left_join(smokePM_data %>% select(id, date, smokePM)) %>% 
 33 |   select(-test_fold, -date) %>%
 34 |   nest_by(id) %>% 
 35 |   mutate(n = nrow(data), 
 36 |          fold = unique(data$fold),
 37 |          n_unique_smokePM = length(unique(data$smokePM))) %>% 
 38 |   filter(n > 1 & n_unique_smokePM > 1) %>% 
 39 |   mutate(r2 = fixest::r2(fixest::feols(smokePM ~ smokePM_pred, 
 40 |                                        data = data), 
 41 |                          "r2") %>% unname,
 42 |          max_smokePM = max(data$smokePM),
 43 |          avg_smokePM = mean(data$smokePM),
 44 |          skew_smokePM = e1071::skewness(data$smokePM), 
 45 |          kurt_smokePM = e1071::kurtosis(data$smokePM),
 46 |          range_smokePM = range(data$smokePM) %>% diff,
 47 |          var_smokePM = var(data$smokePM)) %>% 
 48 |   {right_join(epa_ll ,.)} %>% 
 49 |   cbind(.,
 50 |         stations_50km = st_distance(.) %>% 
 51 |           as.matrix %>% 
 52 |           units::set_units(NULL) %>%  
 53 |           magrittr::is_less_than(50000) %>% 
 54 |           rowSums(),
 55 |         stations_100km = st_distance(.) %>% 
 56 |           as.matrix %>% 
 57 |           units::set_units(NULL) %>%  
 58 |           magrittr::is_less_than(100000) %>% 
 59 |           rowSums()) %>% 
 60 |   left_join(epa_data %>% 
 61 |               group_by(id, smoke_day) %>% 
 62 |               summarise(avg_totalPM = mean(pm25), 
 63 |                         var_totalPM = var(pm25),
 64 |                         skew_totalPM = e1071::skewness(pm25),
 65 |                         kurt_totalPM = e1071::kurtosis(pm25),
 66 |                         .groups = "drop") %>% 
 67 |               mutate(smoke_day = recode(smoke_day, 
 68 |                                         "1" = "smoke", 
 69 |                                         "0" = "non_smoke")) %>% 
 70 |               pivot_wider(values_from = ends_with("_totalPM"), 
 71 |                           names_sep = "_",
 72 |                           names_from = smoke_day)) %>% 
 73 |   left_join(smokePM_data %>% select(id, water:wetlands) %>% unique) %>%
 74 |   mutate(class = names(select(st_drop_geometry(.), water:wetlands))[max.col(select(st_drop_geometry(.), water:wetlands))])
 75 | 
 76 | ee_Initialize(email = gee_email)
 77 | 
 78 | lc = ee$ImageCollection("MODIS/006/MCD12Q1") %>% 
 79 |   ee$ImageCollection$filterDate("2013-01-01", "2014-01-01") %>% 
 80 |   ee$ImageCollection$first() %>% 
 81 |   ee$Image$select("LC_Type5")
 82 | 
 83 | if (!file.exists(file.path(path_data, "station_MODIS_landcover.rds"))) {
 84 |   # Takes ~12 minutes
 85 |   start_time = get_start_time()
 86 |   station_lc = map_dfr(1:nrow(station_performance), function(i) {
 87 |     print(i)
 88 |     ee_extract(lc, station_performance[i,], 
 89 |                scale = lc$projection()$nominalScale()$getInfo())
 90 |   }) %>% 
 91 |     select(id, Type5 = LC_Type5)
 92 |   print_time(start_time)
 93 |   saveRDS(station_lc, file.path(path_data, "station_MODIS_landcover.rds"))
 94 | } else {
 95 |   station_lc = readRDS(file.path(path_data, "station_MODIS_landcover.rds"))
 96 | }
 97 | 
 98 | # plot station land cover spatially 
 99 | epa_ll %>% 
100 |   filter(id %in% station_performance$id) %>% 
101 |   left_join(station_lc) %>% 
102 |   {ggplot(data = .) + 
103 |       geom_sf(data = states  %>% filter(!(STATEFP %in% nonContig_stateFIPS)),
104 |               color = "grey10", fill = "grey95") + 
105 |       geom_sf(aes(color = case_when(Type5 == 11 ~ "barren",
106 |                                     Type5 == 5 ~ "shrubland", 
107 |                                     T ~ "other"),
108 |                   size = I(ifelse(Type5 %in% c(5, 11), 1.3, 0.9)))) + 
109 |       scale_color_manual(name = "landcover", 
110 |                          values = c("#6e948c", "black", "#c38f16")) + 
111 |       theme_void()} %>% 
112 |   ggsave(file.path(path_figures, "figureS11.png"), ., width = 6, height = 4)
113 | 
114 | # 0	1c0dff	Water Bodies: at least 60% of area is covered by permanent water bodies.
115 | # 1	05450a	Evergreen Needleleaf Trees: dominated by evergreen conifer trees (>2m). Tree cover >10%.
116 | # 2	086a10	Evergreen Broadleaf Trees: dominated by evergreen broadleaf and palmate trees (>2m). Tree cover >10%.
117 | # 3	54a708	Deciduous Needleleaf Trees: dominated by deciduous needleleaf (larch) trees (>2m). Tree cover >10%.
118 | # 4	78d203	Deciduous Broadleaf Trees: dominated by deciduous broadleaf trees (>2m). Tree cover >10%.
119 | # 5	dcd159	Shrub: Shrub (1-2m) cover >10%.
120 | # 6	b6ff05	Grass: dominated by herbaceous annuals (<2m) that are not cultivated.
121 | # 7	dade48	Cereal Croplands: dominated by herbaceous annuals (<2m). At least 60% cultivated cereal crops.
122 | # 8	c24f44	Broadleaf Croplands: dominated by herbaceous annuals (<2m). At least 60% cultivated broadleaf crops.
123 | # 9	a5a5a5	Urban and Built-up Lands: at least 30% impervious surface area including building materials, asphalt, and vehicles.
124 | # 11	f9ffa4	Non-Vegetated Lands: at least 60% of area is non-vegetated barren (sand, rock, soil) with less than 10% vegetation.
125 | 


--------------------------------------------------------------------------------
/scripts/main/05_12_figureS12.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | # ------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Plots supplemental figure 12.
  8 | # ------------------------------------------------------------------------------
  9 | states <- tigris::states(cb = TRUE)
 10 | 
 11 | epa_ll <- st_read(file.path(path_data, "epa_station_locations")) %>% 
 12 |   rename(grid_id_10km = grid_10km)
 13 | 
 14 | epa_data <- readRDS(file.path(path_data, "3_intermediate", "station_smokePM.rds"))
 15 | 
 16 | smokePM_preds <- list.files(file.path(path_output, "smokePM", "model"),
 17 |                             pattern = "smokePM_pred",
 18 |                             full.names = TRUE) %>%
 19 |   grep("drop\\.", ., value = TRUE) %>%
 20 |   map_dfr(function(x){
 21 |     readRDS(x) %>%
 22 |       mutate(test_fold = gsub("^smokePM_pred_fold|_drop|-aod_anom_pred|-traj_points|\\.rds$", "", basename(x)))
 23 |   }) %>%
 24 |   mutate(smokePM_pred = pmax(smokePM_pred, 0))
 25 | 
 26 | smokePM_data <- readRDS(file.path(path_data, "4_clean", "smokePM_training.rds")) %>% 
 27 |   select(id, date, smokePM)
 28 | 
 29 | station_performance <- smokePM_preds %>% 
 30 |   filter(fold == test_fold) %>% 
 31 |   left_join(smokePM_data %>% select(id, date, smokePM)) %>% 
 32 |   select(-test_fold, -date) %>%
 33 |   nest_by(id) %>% 
 34 |   mutate(n = nrow(data), 
 35 |          fold = unique(data$fold),
 36 |          n_unique_smokePM = length(unique(data$smokePM))) %>% 
 37 |   filter(n > 1 & n_unique_smokePM > 1) %>% 
 38 |   mutate(r2 = fixest::r2(fixest::feols(smokePM ~ smokePM_pred, 
 39 |                                        data = data), 
 40 |                          "r2") %>% unname,
 41 |          max_smokePM = max(data$smokePM),
 42 |          max_smokePM_pred = max(data$smokePM_pred))
 43 | 
 44 | set.seed(909)
 45 | station_set <- station_performance %>% 
 46 |   select(-data) %>%
 47 |   ungroup %>% 
 48 |   filter(n > 100) %>%
 49 |   filter(r2 < 0.1)
 50 | 
 51 | smokePM_preds %>% 
 52 |   filter(fold == test_fold) %>% 
 53 |   filter(id %in% station_set$id) %>% 
 54 |   left_join(station_set %>% select(id, r2)) %>% 
 55 |   left_join(smokePM_data %>% 
 56 |               select(id, date, smokePM)) %>% 
 57 |   group_by(id) %>% 
 58 |   mutate(max_error = abs(smokePM - smokePM_pred) == max(abs(smokePM - smokePM_pred))) %>%
 59 |   mutate(year = lubridate::year(date)) %>% 
 60 |   {ggplot(data = ., aes(x = smokePM, y = smokePM_pred)) + 
 61 |       geom_point(data = filter(., max_error), 
 62 |                  size = 3) +
 63 |       geom_point(aes(color = as.factor(id))) +
 64 |       geom_abline(intercept = 0, slope = 1, color = "grey") +
 65 |       geom_text(data = station_set, 
 66 |                 aes(x = 0.8*max_smokePM, 
 67 |                     y = 0.9*max_smokePM_pred,
 68 |                     label = paste("R^2==", signif(r2, 3), sep = "")),
 69 |                 parse = TRUE,
 70 |                 color = "black") + 
 71 |       geom_text(data = station_set, 
 72 |                 aes(x = 0.8*max_smokePM, 
 73 |                     y = 0.75*max_smokePM_pred,
 74 |                     label = paste0("n== ", n)),
 75 |                 parse = TRUE, 
 76 |                 color = "black") + 
 77 |       facet_wrap(~id, scales = "free", 
 78 |                  nrow = 2, ncol = 4) +
 79 |       theme_classic() + 
 80 |       xlab(expression(observed~smoke~PM[2.5])) + 
 81 |       ylab(expression(predicted~smoke~PM[2.5])) + 
 82 |       theme(legend.position = "none")} %>% 
 83 |   ggsave(file.path(path_figures, "figureS12b.png"), .,
 84 |          width = 10, height = 4)
 85 | 
 86 | worst_obs <- smokePM_preds %>% 
 87 |   filter(fold == test_fold) %>% 
 88 |   filter(id %in% station_set$id) %>% 
 89 |   left_join(smokePM_data %>% 
 90 |               select(id, date, smokePM)) %>% 
 91 |   group_by(id) %>% 
 92 |   mutate(max_error = abs(smokePM - smokePM_pred) == max(abs(smokePM - smokePM_pred)), 
 93 |          max_error_date = date[max_error == T],
 94 |          start_date = max_error_date - as.difftime(30, units = "days"), 
 95 |          end_date = max_error_date + as.difftime(30, units = "days"), 
 96 |          .groups = "drop") %>% 
 97 |   filter(max_error)
 98 | 
 99 | epa_data %>% 
100 |   ungroup %>% 
101 |   filter(id %in% station_set$id) %>% 
102 |   left_join(ungroup(worst_obs) %>% 
103 |               select(id, max_error_date, start_date, end_date)) %>% 
104 |   filter(date >= start_date & date <= end_date) %>%
105 |   group_by(id) %>% 
106 |   mutate(max_pm = max(pm25)) %>% 
107 |   ungroup %>% 
108 |   {ggplot(data = .) + 
109 |       geom_line(aes(x = date, y = pm25)) +
110 |       geom_line(aes(x = date, y = smokePM), color = "red") +
111 |       geom_point(data = filter(., date == max_error_date), 
112 |                  aes(x = date, y = pm25), color = "blue") +
113 |       geom_point(data = filter(., smoke_day == 1),
114 |                  mapping = aes(x = date, y = -0.05*max_pm),
115 |                  color = "grey", inherit.aes = FALSE) +
116 |       facet_wrap(~id, scales = "free", 
117 |                  nrow = 2, ncol = 4) +
118 |       ylab(expression(PM[2.5])) + xlab("") + 
119 |       theme_classic() + 
120 |       theme(legend.position = "none")} %>% 
121 |   ggsave(file.path(path_figures, "figureS12c.png"), ., 
122 |          width = 10, height = 4)
123 | 
124 | {ggplot() +
125 |     geom_sf(data = states %>% filter(!(STATEFP %in% nonContig_stateFIPS))) + 
126 |     geom_sf(data = epa_ll %>% filter(id %in% unique(smokePM_data$id)), 
127 |             size = 1) + 
128 |     geom_sf(data = epa_ll %>% filter(id %in% station_set$id),
129 |             mapping = aes(color = as.factor(id)), 
130 |             size = 3) + 
131 |     theme_void() + 
132 |     theme(legend.position = "none")} %>% 
133 |   ggsave(file.path(path_figures, "figureS12a.png"), .,
134 |          width = 5, height = 3.5)
135 | 
136 | smokePM_data %>% 
137 |   group_by(id) %>% 
138 |   summarise(n = n(), .groups = "drop") %>% 
139 |   summarise(mean_n = mean(n), 
140 |             med_n = median(n))
141 | 


--------------------------------------------------------------------------------
/scripts/main/05_13_figureS13.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Plots supplemental figure 13.
 8 | # ------------------------------------------------------------------------------
 9 | counties <- tigris::counties(cb = TRUE)
10 | epa_data <- readRDS(file.path(path_data, "3_intermediate", "station_smokePM.rds")) %>% ungroup
11 | epa_ll <- st_read(file.path(path_data, "epa_station_locations")) %>% 
12 |   filter(id %in% unique(epa_data$id)) %>%
13 |   rename(grid_id_10km = grid_10km) %>% 
14 |   mutate(county = counties$NAME[st_intersects(geometry, 
15 |                                               counties %>% st_transform(st_crs(geometry))) %>% 
16 |                                   unlist])
17 | 
18 | fire_panels <- data.frame(panel_name = c("November 2018", "October - November 2019", "Fall 2020"), 
19 |                           start_date = as.Date(c("2018-11-01", "2019-10-20", "2020-08-10")), 
20 |                           end_date = as.Date(c("2018-11-30", "2019-11-15", "2020-11-15"))) %>% 
21 |   mutate(panel_name = factor(panel_name, 
22 |                              levels = panel_name, 
23 |                              ordered = T))
24 | 
25 | calfire <- st_read(file.path(path_data, "CAL FIRE FRAP", "fire20_1.gdb"), layer = "firep20_1") %>% 
26 |   mutate(Fire = recode(FIRE_NAME, 
27 |                        "CAMP" = "Camp Fire", 
28 |                        "KINCADE" = "Kincade Fire", 
29 |                        "AUGUST COMPLEX FIRES" = "August Complex", 
30 |                        "CZU LIGHTNING COMPLEX" = "CZU Lightning Complex"),
31 |          panel_name = recode(FIRE_NAME, 
32 |                              "CAMP" = "November 2018",  
33 |                              "KINCADE" = "October - November 2019",
34 |                              "AUGUST COMPLEX FIRES" = "Fall 2020", 
35 |                              "CZU LIGHTNING COMPLEX" = "Fall 2020"), 
36 |          panel_fire_no = recode(FIRE_NAME, 
37 |                                 "CAMP" = 1,  
38 |                                 "KINCADE" = 1,
39 |                                 "AUGUST COMPLEX FIRES" = 2, 
40 |                                 "CZU LIGHTNING COMPLEX" = 1, 
41 |                                 .default = as.numeric(NA)), 
42 |          select_fires = ((FIRE_NAME == "CAMP" & YEAR_ == "2018" & ALARM_DATE > as.Date("2018-11-01")) |
43 |                            (FIRE_NAME == "KINCADE" & YEAR_ == "2019") |
44 |                            FIRE_NAME  %in% c("AUGUST COMPLEX FIRES", "CZU LIGHTNING COMPLEX")))
45 | 
46 | obs_data <- epa_data %>% 
47 |   left_join(epa_ll %>% 
48 |               st_drop_geometry() %>% 
49 |               select(id, county)) %>% 
50 |   filter(county %in% c("Santa Clara", "Fresno", "Sacramento", "Sonoma")) %>% 
51 |   left_join(fire_panels %>% 
52 |               rowwise %>% 
53 |               mutate(all_dates = list(seq.Date(start_date, end_date, by = "day"))) %>% 
54 |               unnest_longer(all_dates), 
55 |             by = c("date" = "all_dates")) %>% 
56 |   filter(!is.na(panel_name)) 
57 |   
58 | obs_data %>%   
59 |   mutate(id = as.factor(id)) %>%
60 |   {ggplot(data = ., aes(x = date, y = smokePM, group = id, color = county)) + 
61 |       geom_line() + 
62 |       geom_linerange(data = left_join(calfire %>% filter(select_fires),
63 |                                       obs_data %>% group_by(panel_name) %>% summarise(max = max(smokePM, na.rm = T))) %>% 
64 |                        mutate(panel_name = factor(panel_name, levels = unique(panel_name), ordered = T)), 
65 |                      aes(xmin = as.Date(ALARM_DATE), 
66 |                          xmax = as.Date(CONT_DATE), 
67 |                          y = -panel_fire_no*0.1*max), 
68 |                      inherit.aes = FALSE) + 
69 |       geom_label(data = left_join(calfire %>% filter(select_fires) %>% st_drop_geometry(),
70 |                                   obs_data %>% group_by(panel_name) %>% summarise(max = max(smokePM, na.rm = T))) %>%
71 |                    mutate(panel_name = factor(panel_name, levels = unique(panel_name), ordered = T),
72 |                           mid_date = as.Date(ALARM_DATE) + floor(difftime(as.Date(CONT_DATE), as.Date(ALARM_DATE), units = "days")/2)),
73 |                  aes(x = mid_date, y = -panel_fire_no*0.1*max, label = Fire), 
74 |                  inherit.aes = FALSE, size = 3, 
75 |                  label.size = NA, label.padding = unit(0.1, "lines")) +
76 |       facet_wrap(~panel_name, scales = "free", 
77 |                  nrow =3) + 
78 |       scale_color_manual(values = MetBrewer::met.brewer("Juarez")[c(1,6,5,3)]) +
79 |       theme_classic() + 
80 |       theme(strip.background = element_blank(), 
81 |             strip.text = element_text(size = 13, face = 2)) + 
82 |       ylab(expression(paste("observed smoke ", PM[2.5]," (",mu, "g/", m^3, ")"))) + 
83 |       xlab("")} %>% 
84 |   ggsave(file.path(path_figures, "figureS13.png"), .,
85 |          width = 6, height = 8)
86 | 


--------------------------------------------------------------------------------
/scripts/main/05_14_figureS14.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Plots supplemental figure 14.
 8 | # ------------------------------------------------------------------------------
 9 | states <- tigris::states(cb = T) %>% filter(STATEFP %in% nonContig_stateFIPS == FALSE)
10 | 
11 | mtbs <- read_sf(file.path(path_data, "mtbs_perimeter_data"))
12 | mtbs %<>% st_make_valid()
13 | 
14 | state_fires <- st_intersection(mtbs, states)
15 | 
16 | test <- state_fires
17 | 
18 | test$state_area = st_area(test$geometry)
19 | 
20 | state_fire_trend <- test %>% st_drop_geometry() %>%
21 |   mutate(year = lubridate::year(Ig_Date)) %>% 
22 |   group_by(year, STUSPS) %>% 
23 |   summarise(n_fire = n(), 
24 |             area_fire = sum(state_area)) %>% 
25 |   mutate(period = case_when(year >= 2006 & year <= 2010 ~ "2006_2010", 
26 |                             year >= 2016 & year <= 2020 ~ "2016_2020", 
27 |                             T ~ as.character(NA))) %>% 
28 |   filter(!is.na(period)) %>% 
29 |   group_by(period, STUSPS) %>%
30 |   summarise(n_fire = mean(n_fire), 
31 |             area_fire = mean(area_fire)) %>% 
32 |   pivot_wider(names_from = period, 
33 |               values_from = contains("fire")) %>% 
34 |   mutate(pct_change_fire_area = (area_fire_2016_2020 - area_fire_2006_2010)/area_fire_2006_2010)
35 | 
36 | states %>% 
37 |   left_join(state_fire_trend) %>% 
38 |   mutate(pct_change_fire_area = unclass(pct_change_fire_area)) %>%
39 |   {ggplot(data =., aes(fill = pct_change_fire_area)) + 
40 |       geom_sf() + 
41 |       scale_fill_gradient2(low = muted("blue"), high = muted("red")) +
42 |       labs(fill = "Percent change\nin burned area") +
43 |       theme_void() + 
44 |       theme(legend.position = c(0.92, 0.32))} %>% 
45 |   ggsave(file.path(path_figures, "figureS14.png"), ., 
46 |          width = 6.75, height = 4)
47 | 


--------------------------------------------------------------------------------
/scripts/main/05_15_figureS15.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Plots supplemental figure 15.
 8 | # ------------------------------------------------------------------------------
 9 | states <- tigris::states(cb = T) %>% filter(STATEFP %in% nonContig_stateFIPS == FALSE)
10 | 
11 | mtbs <- read_sf(file.path(path_data, "mtbs_perimeter_data"))
12 | mtbs %<>% st_make_valid()
13 | mtbs$area <- unclass(st_area(mtbs))/1e6
14 | 
15 | mtbs %>% 
16 |   st_simplify(dTolerance = 0.0005) %>%
17 |   mutate(year = lubridate::year(Ig_Date), 
18 |          state = substr(Event_ID, 1, 2)) %>%
19 |   filter(state %in% c("AK", "HI") == FALSE) %>%
20 |   filter(year >= 2006) %>% 
21 |   # filter(year <= 2010) %>%
22 |   group_by(year) %>% 
23 |   slice_max(BurnBndAc, n = 8) %>% 
24 |   # st_drop_geometry() %>% View
25 |   {ggplot(data = .) + 
26 |       geom_sf(data = states, lwd = 0.05) + 
27 |       geom_sf(color = "red", fill = "red") + 
28 |       facet_wrap(~year, nrow = 3, ncol = 5) + 
29 |       theme_void() + 
30 |       theme(text = element_text(size = 20))} %>% 
31 |   ggsave(file.path(path_figures, "figureS15.png"), 
32 |          ., width = 15, height = 7)
33 | 


--------------------------------------------------------------------------------
/scripts/main/05_16_figureS16.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Plots supplemental figure 16.
 8 | # ------------------------------------------------------------------------------
 9 | states <- tigris::states(cb = T) %>% filter(STATEFP %in% nonContig_stateFIPS == FALSE)
10 | 
11 | mtbs <- read_sf(file.path(path_data, "mtbs_perimeter_data"))
12 | mtbs %<>% st_make_valid()
13 | mtbs$area <- unclass(st_area(mtbs))/1e6
14 | 
15 | mtbs %>%
16 |   st_simplify(dTolerance = 0.0005) %>%
17 |   mutate(year = lubridate::year(Ig_Date),
18 |          state = substr(Event_ID, 1, 2)) %>%
19 |   filter(state %in% c("AK", "HI") == FALSE) %>%
20 |   filter(year >= 2006) %>%
21 |   group_by(state) %>%
22 |   slice_max(BurnBndAc, n = 3) %>%
23 |   {ggplot(data = .) + 
24 |       geom_sf(data = states, lwd = 0.05) + 
25 |       geom_sf(color = "red", fill = "red") + 
26 |       facet_wrap(~year, nrow = 3, ncol = 5) + 
27 |       theme_void() + 
28 |       theme(text = element_text(size = 20))} %>% 
29 |   ggsave(file.path(path_figures, "figureS16.png"), 
30 |          ., width = 15, height = 7)
31 | 


--------------------------------------------------------------------------------
/scripts/main/05_18_figureS18.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | #-------------------------------------------------------------------------------
  6 | # Written by: Jessica Li
  7 | # Compares smoke PM2.5 at PurpleAir monitors vs 10 km grid.
  8 | #-------------------------------------------------------------------------------
  9 | # Choose minimum number of hourly observations
 10 | min_hours = 1
 11 | 
 12 | #-------------------------------------------------------------------------------
 13 | #### Process PurpleAir PM2.5 to monitor-day ####
 14 | # Original hourly data file too large to load, so we run this step on cluster 
 15 | # and load from intermediate saved daily data file
 16 | 
 17 | # # Load clean PurpleAir outdoor monitor data
 18 | # # Have 79837229 observations
 19 | # purpleair = readRDS(file.path(path_data, "PurpleAir", "outdoor_monitor_data_clean_part1.rds")) %>%
 20 | #   mutate(
 21 | #     # bottom code at 0
 22 | #     pm25_out = pmax(pm25_out, 0),
 23 | #     # top code any values 500-1000 at 500
 24 | #     pm25_out = ifelse(pm25_out >= 500 & pm25_out <= 1000, 500, pm25_out),
 25 | #     # set any values > 1000 to NA
 26 | #     pm25_out = ifelse(pm25_out > 1000, NA, pm25_out)
 27 | #   ) %>%
 28 | #   # Drop missing hourly observations
 29 | #   # Have 58312646 observations (73.0%)
 30 | #   drop_na(pm25_out) %>%
 31 | #   # Filter to observations over time period for which we have smoke days
 32 | #   # Have 58279714 observations (99.9%)
 33 | #   # Have 6618 monitors
 34 | #   filter(year >= 2006, year <= 2020) %>%
 35 | #   select(purpleair_id = ID_out, lon = Lon_out, lat = Lat_out, year, month, day, hour, pm25 = pm25_out)
 36 | # 
 37 | # # Aggregate to daily level
 38 | # purpleair = purpleair %>%
 39 | #   # Have 2516222 observations
 40 | #   group_by(purpleair_id, lon, lat, year, month, day) %>%
 41 | #   summarize(pm25 = mean(pm25),
 42 | #             num_hourly_obs = n()) %>%
 43 | #   ungroup() %>%
 44 | #   # Filter to days with minimum number of hourly observations
 45 | #   # Have 2516222 observations (100%)
 46 | #   # Have 6618 monitors (100%)
 47 | #   filter(num_hourly_obs >= min_hours) %>%
 48 | #   group_by(purpleair_id) %>%
 49 | #   mutate(num_daily_obs = n()) %>%
 50 | #   ungroup()
 51 | # 
 52 | # saveRDS(purpleair, file.path(path_data, "PurpleAir", "outdoor_monitor_data_clean_part1_daily.rds"))
 53 | 
 54 | purpleair = readRDS(file.path(path_data, "PurpleAir", "outdoor_monitor_data_clean_part1_daily.rds"))
 55 | 
 56 | #-------------------------------------------------------------------------------
 57 | #### Match PurpleAir monitors and 10 km grid cells ####
 58 | # Load PurpleAir monitor locations
 59 | purpleair_loc = readRDS(file.path(path_data, "PurpleAir", "outdoor_monitor_locs.rds")) %>% 
 60 |   select(purpleair_id = ID_out, lon = Lon_out, lat = Lat_out) %>% 
 61 |   st_as_sf(coords = c("lon", "lat"), crs = 4326)
 62 | 
 63 | # Load 10 km grid
 64 | grid_10km = read_sf(file.path(path_data, "1_grids", "10km_grid")) %>% 
 65 |   select(grid_id_10km = ID)
 66 | 
 67 | # Match each PurpleAir monitor to the overlapping 10 km grid cell
 68 | o = purpleair_loc %>% 
 69 |   st_transform(st_crs(grid_10km)) %>% 
 70 |   st_join(grid_10km) %>% 
 71 |   st_drop_geometry()
 72 | 
 73 | #-------------------------------------------------------------------------------
 74 | #### Classify monitor-days as smoke day or non-smoke day ####
 75 | # Load smoke days
 76 | smoke_days = readRDS(file.path(path_data, "3_intermediate", "all_smoke_days_incl_cloudy.rds")) %>% 
 77 |   select(grid_id_10km, date, smoke_day)
 78 | 
 79 | # Join smoke day classification to PM2.5 observations by grid cell ID and date
 80 | purpleair = purpleair %>% 
 81 |   left_join(o, by = "purpleair_id") %>% 
 82 |   mutate(date = ymd(paste(year, month, day))) %>% 
 83 |   left_join(smoke_days, by = c("grid_id_10km", "date")) %>% 
 84 |   replace_na(list(smoke_day = 0))
 85 | 
 86 | #-------------------------------------------------------------------------------
 87 | #### Calculate smoke PM observations ####
 88 | # For each monitor-month, limit to non-smoke day observations at the monitor in 
 89 | # that calendar month in the three-year window
 90 | background_pm25 = purpleair %>% 
 91 |   filter(smoke_day == 0) %>% 
 92 |   group_by(purpleair_id, month, year) %>% 
 93 |   summarize(pm25 = list(pm25)) %>% 
 94 |   rowwise() %>% 
 95 |   mutate(nobs = length(pm25)) %>% 
 96 |   ungroup() %>% 
 97 |   arrange(purpleair_id, month, year) %>% 
 98 |   group_by(purpleair_id, month) %>% 
 99 |   mutate(pm25_lead = lead(pm25, n = 1, default = list(NA)),
100 |          pm25_lag = lag(pm25, n = 1, default = list(NA)),
101 |          nobs_lead = lead(nobs, n = 1, default = 0),
102 |          nobs_lag = lag(nobs, n = 1, default = 0)) %>% 
103 |   ungroup() %>% 
104 |   rowwise() %>% 
105 |   mutate(pm25_3yr = list(c(pm25, pm25_lag, pm25_lead)),
106 |          nobs_3yr = nobs + nobs_lead + nobs_lag) %>% 
107 |   rowwise() %>% 
108 |   # Obtain background PM2.5 by aggregating (taking the median)
109 |   mutate(pm25_med_3yr = median(unlist(pm25_3yr), na.rm = T)) %>% 
110 |   select(purpleair_id, year, month, pm25_med_3yr, nobs_3yr, nobs, nobs_lead, nobs_lag) %>% 
111 |   filter(nobs > 0, (nobs_lead > 0 | nobs_lag > 0))
112 | 
113 | # For each monitor-smoke day, match to background PM2.5 for the monitor-month
114 | purpleair = purpleair %>% 
115 |   # Have 325490 observations (12.9%)
116 |   # Have 5644 monitors (85.3%)
117 |   filter(smoke_day == 1) %>% 
118 |   # Drop observations with insufficient data for calculating background PM2.5
119 |   # Have 175312 observations (53.9%)
120 |   # Have 2550 monitors (45.2%)
121 |   inner_join(background_pm25, by = c("purpleair_id", "year", "month")) %>% 
122 |   # Subtract background PM2.5 from total PM2.5
123 |   mutate(smokePM = pmax(ifelse(smoke_day == 1, pm25 - pm25_med_3yr, 0), 0))
124 | 
125 | #-------------------------------------------------------------------------------
126 | #### Match to smoke PM predictions ####
127 | # Load 10 km grid smoke PM predictions
128 | preds = readRDS(file.path(path_output, "smokePM", "predictions", "combined", "smokePM_predictions_20060101_20201231.rds")) %>% 
129 |   mutate(smokePM_pred = pmax(smokePM_pred, 0))
130 | 
131 | # Join smoke PM predictions to observations by 10 km grid cell ID and date
132 | df = purpleair %>% 
133 |   left_join(preds, by = c("grid_id_10km", "date"))
134 | 
135 | #-------------------------------------------------------------------------------
136 | #### Compare smoke PM predictions and observations ####
137 | # Plot distribution of PurpleAir monitor-smoke days over time
138 | p_dates = ggplot(df, aes(x = date)) + 
139 |   # Each bin is approximately 2 weeks large
140 |   geom_histogram(bins = ((as.integer(max(df$date) - min(df$date)) + 1) %/% 14) + 1, 
141 |                  closed = "left") + 
142 |   # Enforce limits on date range
143 |   scale_x_date(limits = c(min(df$date) - days(1), max(df$date) + days(1)), 
144 |                oob = squish) + 
145 |   theme_classic() + 
146 |   labs(title = "b) temporal distribution of PurpleAir data",
147 |        y = "count of monitor-smoke days") + 
148 |   theme(plot.title = element_text(face = "bold", hjust = -0.22))
149 | 
150 | # Count smoke days at each monitor
151 | df_sf = df %>% 
152 |   group_by(lon, lat) %>% 
153 |   summarize(num_smoke_days = n()) %>% 
154 |   ungroup() %>% 
155 |   st_as_sf(coords = c("lon", "lat"), crs = 4326)
156 | conus = states() %>% 
157 |   filter(!(NAME %in% c("Alaska", 
158 |                        "American Samoa",
159 |                        "Guam",
160 |                        "Hawaii",
161 |                        "Commonwealth of the Northern Mariana Islands",
162 |                        "Puerto Rico",
163 |                        "United States Virgin Islands")))
164 | 
165 | # Plot map showing spatial distribution of PurpleAir monitors with smoke days
166 | p_map = ggplot() + 
167 |   geom_sf(data = conus) +
168 |   geom_sf(data = df_sf, mapping = aes(color = num_smoke_days), alpha = 0.5) +
169 |   theme_void() +
170 |   labs(title = "c) spatial distribution of PurpleAir data",
171 |        color = "count of smoke days") + 
172 |   theme(legend.position = "bottom",
173 |         plot.title = element_text(face = "bold", hjust = 0.023)) + 
174 |   guides(color = guide_colorbar(title.vjust = 0.8))
175 | 
176 | # Plot scatter and simple linear regression of predictions on observations
177 | p_comparison = ggplot(df, aes(x = smokePM, y = smokePM_pred)) + 
178 |   geom_bin2d(bins = 70) + 
179 |   geom_abline(intercept = 0, slope = 1, color = "grey30") +
180 |   theme_classic() +
181 |   scale_fill_continuous(type = "viridis",
182 |                         trans = "log", 
183 |                         breaks = c(1, 10, 100, 1000, 10000)) + 
184 |   scale_x_continuous(trans = "pseudo_log", 
185 |                      breaks = c(0, 1, 5, 10, 50, 100, 500),
186 |                      expand = c(0, 0)) + 
187 |   scale_y_continuous(trans = "pseudo_log", 
188 |                      breaks = c(0, 1, 5, 10, 50, 100, 500),
189 |                      expand = c(0, 0)) + 
190 |   labs(title = "a) predicted and observed smoke pollution",
191 |        x = expression(observed~smokePM[2.5]~(PurpleAir)),
192 |        y = expression(predicted~smokePM[2.5]),
193 |        caption = paste("count of monitors =", 
194 |                        prettyNum(length(unique(df$purpleair_id)), 
195 |                                  big.mark = ",", scientific = F))) + 
196 |   theme(plot.title = element_text(face = "bold", hjust = -0.3))
197 | 
198 | # Compose into one figure
199 | p = grid.arrange(p_comparison, p_dates, p_map,
200 |                  layout_matrix = matrix(c(1, 1, 1, 1, 2, 2, 2, 2, 
201 |                                           1, 1, 1, 1, 3, 3, 3, 3,  
202 |                                           1, 1, 1, 1, 3, 3, 3, 3),
203 |                                         nrow = 3, byrow = T))
204 | ggsave(file.path(path_figures, "figureS18.png"), 
205 |        plot = p, width = 14, height = 8)
206 | 
207 | # Calculate R2
208 | r2(feols(smokePM_pred ~ smokePM, df), "r2")
209 | 


--------------------------------------------------------------------------------
/scripts/main/06_01_tableS01.R:
--------------------------------------------------------------------------------
  1 | source("scripts/setup/00_01_load_packages.R")
  2 | source("scripts/setup/00_02_load_functions.R")
  3 | source("scripts/setup/00_03_load_settings.R")
  4 | 
  5 | #-------------------------------------------------------------------------------
  6 | # Written by: Marissa Childs
  7 | # Calculates supplementary table 1.
  8 | #-------------------------------------------------------------------------------
  9 | epa_data <- readRDS(file.path(path_data, "3_intermediate", "station_smokePM.rds")) %>% 
 10 |   ungroup
 11 | 
 12 | grid_10km <- st_read(file.path(path_data, "1_grids", "grid_10km_wgs84")) %>% 
 13 |   mutate(grid_index = 1:n())
 14 | monitors <- st_read(file.path(path_data, "epa_station_locations")) %>% 
 15 |   filter(id %in% unique(epa_data$id)) %>% 
 16 |   mutate(monitor_index = 1:n())
 17 | states <- tigris::states() %>% st_transform(st_crs(monitors))
 18 | 
 19 | monitors$state <- states$STUSPS[st_intersects(monitors, states) %>% unlist]
 20 | 
 21 | # Takes 13 minutes to run
 22 | Sys.time()
 23 | cell_monitor_dist <- data.frame(start = c(seq(0, nrow(grid_10km), by = 2000), nrow(grid_10km)) + 1) %>%
 24 |   mutate(end = lead(start) - 1) %>%
 25 |   filter(!is.na(end)) %>%
 26 |   purrr::pmap(function(start, end){
 27 |     print(Sys.time())
 28 |     st_distance(grid_10km[start:end,], monitors) %>%
 29 |       return
 30 |   }) %>%
 31 |   do.call(what = rbind)
 32 | Sys.time()
 33 | 
 34 | smoke_days <- readRDS(file.path(path_data, "3_intermediate", "all_smoke_days_incl_cloudy.rds"))  %>%
 35 |   left_join(readRDS(file.path(path_data, "3_intermediate", "all_smoke_days.rds")) %>%
 36 |               rename(plume_day = smoke_day) %>%
 37 |               select(-note_smoke_date_not_online)) %>%
 38 |   replace_na(list(plume_day = 0))
 39 | 
 40 | smoke_missing_dates = readRDS(file.path(path_data, "smoke", "smoke_dates_not_online.rds"))
 41 | smoke_missing_dates = ymd(smoke_missing_dates)
 42 | 
 43 | # add distance to smoke-day to epa data
 44 | monitor_smoke_dist <- epa_data %>%
 45 |   left_join(monitors %>% st_drop_geometry() %>% select(id, monitor_index)) %>%
 46 |   select(id, grid_id_10km, monitor_index, date, pm25) %>%
 47 |   nest_by(date) %>%
 48 |   rename(monitor_obs = data) %>%
 49 |   left_join(smoke_days %>%
 50 |               left_join(grid_10km %>% st_drop_geometry %>% select(grid_id_10km = ID, grid_index)) %>%
 51 |               nest_by(date) %>%
 52 |               rename(smoke_days = data)) %>%
 53 |   purrr::pmap_dfr(function(date, monitor_obs, smoke_days){
 54 |     if(is.null(smoke_days)){
 55 |       plume_dist = Inf
 56 |       smoke_dist = Inf
 57 |     } else {
 58 |       plume_dist <- cell_monitor_dist[smoke_days %>%
 59 |                                         filter(plume_day == 1) %>%
 60 |                                         pull(grid_index),
 61 |                                       monitor_obs %>%
 62 |                                         pull(monitor_index)] %>%
 63 |         matrix(ncol = nrow(monitor_obs)) %>%
 64 |         matrixStats::colMins()
 65 |       smoke_dist <- cell_monitor_dist[smoke_days %>%
 66 |                                         filter(smoke_day == 1) %>%
 67 |                                         pull(grid_index),
 68 |                                       monitor_obs %>%
 69 |                                         pull(monitor_index)] %>%
 70 |         matrix(ncol = nrow(monitor_obs)) %>%
 71 |         matrixStats::colMins()
 72 |     }
 73 |     monitor_obs %>%
 74 |       mutate(date = date) %>%
 75 |       cbind(plume_dist = plume_dist/1e3) %>%
 76 |       cbind(smoke_dist = smoke_dist/1e3) %>%
 77 |       return
 78 |   })
 79 | 
 80 | saveRDS(monitor_smoke_dist, 
 81 |         file.path(path_data, "monitor_smoke_distance.rds"))
 82 | 
 83 | # monitor_smoke_dist <- readRDS(file.path(path_data, "monitor_smoke_distance.rds"))
 84 | 
 85 | reg_df <- monitor_smoke_dist %>% 
 86 |   left_join(monitors %>% st_drop_geometry() %>%
 87 |               select(id, state)) %>%
 88 |   mutate(year = lubridate::year(date),
 89 |          month = lubridate::month(date)) %>%
 90 |   mutate(plume_dist_factor = case_when(plume_dist == 0 ~ 0,
 91 |                                        plume_dist > 0 & plume_dist <= 100 ~ 1,
 92 |                                        plume_dist > 100 & plume_dist <= 250 ~ 2,
 93 |                                        plume_dist > 250 & plume_dist <= 500 ~ 3,
 94 |                                        plume_dist > 500 & plume_dist <= 750 ~ 4,
 95 |                                        plume_dist > 750 & plume_dist <= 1000 ~ 5,
 96 |                                        plume_dist > 1000 ~ 6),
 97 |          smoke_dist_factor = case_when(smoke_dist == 0 ~ 0,
 98 |                                        smoke_dist > 0 & smoke_dist <= 100 ~ 1,
 99 |                                        smoke_dist > 100 & smoke_dist <= 250 ~ 2,
100 |                                        smoke_dist > 250 & smoke_dist <= 500 ~ 3,
101 |                                        smoke_dist > 500 & smoke_dist <= 750 ~ 4,
102 |                                        smoke_dist > 750 & smoke_dist <= 1000 ~ 5,
103 |                                        smoke_dist > 1000 ~ 6)) %>% 
104 |   mutate(across(ends_with("dist_factor"), function(x){ recode_factor(x, 
105 |                                                                      `0` = "overhead",
106 |                                                                      `1` = "0 - 100",
107 |                                                                      `2` = "100 - 250",
108 |                                                                      `3` = "250 - 500",
109 |                                                                      `4` = "500 - 750",
110 |                                                                      `5` = "750 - 1000",
111 |                                                                      `6` = "1000+",
112 |                                                                      .ordered = TRUE)}))
113 | 
114 | # add in meteorology (temperature, dewpoint temperature, precipitation)
115 | epa_station_grid_cells <- epa_data$grid_id_10km %>% unique
116 | # This may require large amount of memory
117 | meteor <- c(grep("temperature|precipitation|wind", 
118 |                  list.files(file.path(path_data, "ERA5_variables", "Land"), full.names = T), 
119 |                  value = T),
120 |             grep("boundary_layer", 
121 |                  list.files(file.path(path_data, "ERA5_variables", "Global"), full.names = T), 
122 |                  value = T)) %>% 
123 |   paste0("/USA/10km_grid/") %>%
124 |   {paste0(., ifelse(grepl("precipitation", .), "UTC+0000", "UTC-0600"))} %>% 
125 |   list.files(full.names = T) %>% 
126 |   map(function(x) {
127 |     print(x)
128 |     type = basename(x) %>% 
129 |       gsub("daily_|_of_1-hourly", "", .)
130 |     print(type)
131 |     list.files(x, full.names = T) %>% 
132 |       purrr::map_dfr(function(y){
133 |         readRDS(y) %>% 
134 |           filter(id_grid %in% epa_station_grid_cells) %>%
135 |           rename_with(.fn = function(z){paste0(gsub("2m_|10m_", "", z), "_", type)}, 
136 |                       .cols = !c(date, id_grid))
137 |       })
138 |     }) %>% 
139 |   reduce(full_join)
140 | 
141 | reg_df %<>% left_join(meteor, by = c("grid_id_10km" = "id_grid", "date" = "date"))
142 | 
143 | saveRDS(reg_df,
144 |         file.path(path_data, "monitor_smoke_distance_w_met.rds"))
145 | 
146 | # reg_df <- readRDS(file.path(path_data, "monitor_smoke_distance_w_met.rds"))
147 | 
148 | 
149 | etable(feols(pm25 ~ i(plume_dist_factor, ref = "1000+") | 
150 |                id^month + state^year^month, 
151 |              data = reg_df),
152 |        feols(pm25 ~ i(smoke_dist_factor, ref = "1000+") | 
153 |                id^month + state^year^month, 
154 |              data = reg_df),
155 |        feols(pm25 ~ i(plume_dist_factor, ref = "1000+") + 
156 |                ns(temperature_mean, 5) + ns(dewpoint_temperature_mean, 5) + 
157 |                ns(total_precipitation_maximum, 5) + ns(boundary_layer_height_maximum, 5) +
158 |                ns(boundary_layer_height_mean, 5) + ns(boundary_layer_height_minimum, 5) +
159 |                temperature_mean:total_precipitation_maximum | 
160 |                id^month + state^year^month, 
161 |              data = reg_df),
162 |        feols(pm25 ~ i(smoke_dist_factor, ref = "1000+") + 
163 |                ns(temperature_mean, 5) + ns(dewpoint_temperature_mean, 5) + 
164 |                ns(total_precipitation_maximum, 5) + ns(boundary_layer_height_maximum, 5) +
165 |                ns(boundary_layer_height_mean, 5) + ns(boundary_layer_height_minimum, 5) +
166 |                temperature_mean:total_precipitation_maximum | 
167 |                id^month + state^year^month, 
168 |              data = reg_df), 
169 |        keep = "dist_factor",
170 |        subtitles = c("plumes", "plumes + hysplit", "plumes w meteorology", "plumes + hysplit w meteorology"),
171 |        tex = TRUE,
172 |        file = file.path(path_tables, "tableS01.tex"))
173 | 
174 | feols(pm25 ~ i(smoke_dist_factor, ref = "1000+") + 
175 |         ns(temperature_mean, 5) + ns(dewpoint_temperature_mean, 5) + 
176 |         ns(total_precipitation_maximum, 5) + ns(boundary_layer_height_maximum, 5) +
177 |         ns(boundary_layer_height_mean, 5) + ns(boundary_layer_height_minimum, 5) +
178 |         temperature_mean:total_precipitation_maximum | 
179 |         id^month + state^year^month, 
180 |       data = reg_df %>% 
181 |         filter(!(date %in% smoke_missing_dates)) %>% 
182 |         filter(is.finite(plume_dist)))
183 | 


--------------------------------------------------------------------------------
/scripts/main/06_02_tableS04.R:
--------------------------------------------------------------------------------
 1 | source("scripts/setup/00_01_load_packages.R")
 2 | source("scripts/setup/00_02_load_functions.R")
 3 | source("scripts/setup/00_03_load_settings.R")
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | # Written by: Marissa Childs
 7 | # Calculates model performance metrics.
 8 | # ------------------------------------------------------------------------------
 9 | monitors <- st_read(file.path(path_data, "epa_station_locations")) 
10 | states <- tigris::states() %>% st_transform(st_crs(monitors))
11 | 
12 | monitors$state <- states$STUSPS[st_intersects(monitors, states) %>% unlist]
13 | 
14 | smokePM_data <- readRDS(file.path(path_data, "4_clean", "smokePM_training.rds")) %>% 
15 |   select(id, date, smokePM, fold)
16 | 
17 | cv_preds <- list.files(file.path(path_output, "smokePM", "model"), 
18 |                        pattern = "pred_fold", 
19 |                        full.names = TRUE) %>% 
20 |   purrr::map_dfr(function(x){
21 |     readRDS(x) %>%
22 |       mutate(test_fold = gsub("^smokePM_pred_fold|_drop|-aod_anom_pred|-traj_points|\\.rds$", "", basename(x)), 
23 |              drop_var = gsub("^smokePM_pred_fold[0-4]_drop-?|\\.rds$", "", basename(x))) %>%
24 |       return()
25 |   })
26 | 
27 | model_metrics <- cv_preds %>%
28 |   mutate(smokePM_pred = pmax(0, smokePM_pred)) %>%
29 |   left_join(smokePM_data, by = c("id", "date", "fold")) %>% 
30 |   mutate(test = fold == test_fold) %>%
31 |   eval_metrics(models = drop_var, test_tune = test, 
32 |                obs = smokePM, pred = smokePM_pred, loc_id = id) %>%
33 |   pivot_longer(contains("rank") | contains("value")) %>% 
34 |   separate(name, into = c("metric", "rank_value"), sep = "_(?=rank|value)") %>% 
35 |   pivot_wider(values_from = value, names_from = rank_value) %>%
36 |   separate(metric, into = c("metric", "subset"), sep = "_", extra = "merge") 
37 | 
38 | # metrics on different sample for main model, table S10
39 | model_metrics %>% 
40 |   filter(drop_var == "") %>% 
41 |   filter(test == TRUE & metric != "nobs" & metric != "me" &
42 |            !grepl("bin", metric, ignore.case = TRUE)) %>% 
43 |   select(-drop_var, -test, -rank) %>% 
44 |   pivot_wider(names_from = metric, values_from = value)
45 | 
46 | # confirm two decimal places for RMSE on over 50 days
47 | model_metrics %>% 
48 |   filter(drop_var=="", test==T, metric=="rmse", subset=="day_over50") %>% 
49 |   pull(value)
50 | 


--------------------------------------------------------------------------------
/scripts/setup/00_01_load_packages.R:
--------------------------------------------------------------------------------
 1 | library(RCurl)
 2 | library(R.utils)
 3 | library(XML)
 4 | library(rvest)
 5 | library(rlang)
 6 | library(ff)
 7 | 
 8 | library(tigris)
 9 | library(tidycensus)
10 | library(rgee)
11 | 
12 | library(splitr)
13 | 
14 | library(tools)
15 | library(readr)
16 | library(readxl)
17 | library(retry)
18 | 
19 | library(stringr)
20 | library(lubridate)
21 | 
22 | library(parallel)
23 | library(foreach)
24 | library(doParallel)
25 | 
26 | library(FNN)
27 | library(sp)
28 | library(sf) # Linking to GEOS 3.9.1, GDAL 3.3.1, PROJ 8.1.0
29 | library(ncdf4)
30 | library(raster)
31 | library(rgdal)
32 | library(rgeos)
33 | library(SpatialPosition)
34 | library(exactextractr)
35 | 
36 | library(magrittr)
37 | library(tibble)
38 | library(plyr)
39 | library(dplyr)
40 | library(tidyr)
41 | library(purrr)
42 | 
43 | import::from(data.table, .except = c("hour", "month", "wday", "week", "year"))
44 | library(gtable)
45 | library(fixest)
46 | library(splines)
47 | library(matrixStats)
48 | 
49 | library(xgboost)
50 | library(rBayesianOptimization)
51 | 
52 | library(ggplot2)
53 | library(ggpubr)
54 | library(grid)
55 | library(gridExtra)
56 | library(cowplot)
57 | library(scales)
58 | library(RColorBrewer)
59 | library(layer)
60 | library(facetscales)
61 | library(DiagrammeR)
62 | library(DiagrammeRsvg)
63 | library(rsvg)
64 | 


--------------------------------------------------------------------------------
/scripts/setup/00_02_load_functions.R:
--------------------------------------------------------------------------------
  1 | # Timers
  2 | print_time <- function(start, unit = "auto", message = "Time elapsed:") {
  3 |   end <- Sys.time()
  4 |   d <- difftime(time1 = end, time2 = start, units = unit)
  5 |   t <- round(d, digits = 1)
  6 |   u <- units(d)
  7 |   
  8 |   print(paste("Start time:", start))
  9 |   print(paste("End time:", end))
 10 |   message <- paste(message, t, u)
 11 |   print(message)
 12 |   return(d)
 13 | }
 14 | 
 15 | get_start_time <- function(message = "Time started:") {
 16 |   t <- Sys.time()
 17 |   print(paste(message, t))
 18 |   return(t)
 19 | }
 20 | 
 21 | # Calculations
 22 | nonsmoke_medians <- function(data, 
 23 |                              main_var, 
 24 |                              smoke_day, 
 25 |                              spatial_unit, 
 26 |                              temporal_unit, 
 27 |                              temporal_trend){
 28 |   main_var <- enquo(main_var)
 29 |   smoke_day <- enquo(smoke_day)
 30 |   spatial_unit <- enquo(spatial_unit)
 31 |   temporal_unit <- enquo(temporal_unit)
 32 |   temporal_trend <- enquo(temporal_trend)
 33 |   
 34 |   new_name <- paste0(rlang::as_name(main_var), "_med_3yr")
 35 |   
 36 |   full_panel <- expand.grid(id = data %>% pull(!!spatial_unit) %>% unique, 
 37 |                             month = data %>% pull(!!temporal_unit) %>% unique, 
 38 |                             year = data %>% pull(!!temporal_trend) %>% unique) %>% 
 39 |     rename(!!spatial_unit := id, 
 40 |            !!temporal_unit := month, 
 41 |            !!temporal_trend := year) %>% 
 42 |     ungroup
 43 |   
 44 |   data %>% 
 45 |     filter(!is.na(!!main_var) & !!smoke_day == 0) %>%
 46 |     full_join(full_panel) %>%
 47 |     group_by(!!spatial_unit, !!temporal_unit, !!temporal_trend) %>%
 48 |     summarise(main_var = list(!!main_var),
 49 |               nobs = n(),
 50 |               .groups = "drop") %>%
 51 |     arrange(!!spatial_unit, !!temporal_unit, !!temporal_trend) %>%
 52 |     group_by(!!spatial_unit, !!temporal_unit) %>%
 53 |     mutate(main_var_lag = lag(main_var, n = 1, default = list(NA)),
 54 |            main_var_lead = lead(main_var, n = 1, default = list(NA)),
 55 |            nobs_lag = lag(nobs, n = 1, default = 0),
 56 |            nobs_lead = lead(nobs, n = 1, default = 0)) %>%
 57 |     ungroup %>%
 58 |     rowwise %>%
 59 |     mutate(main_var_3yr = list(c(main_var, main_var_lag, main_var_lead)),
 60 |            main_var_med_3yr = median(unlist(main_var_3yr), na.rm = T),
 61 |            nobs_3yr = nobs + nobs_lead + nobs_lag) %>%
 62 |     ungroup %>%
 63 |     transmute(!!spatial_unit, !!temporal_unit, !!temporal_trend,
 64 |               nobs_3yr,
 65 |               !!new_name := main_var_med_3yr)
 66 | }
 67 | 
 68 | eval_metrics <- function(data, models, test_tune, obs, pred, 
 69 |                          loc_id, bins = c(0, 5, 10, 20, 50, 100, 10000)){
 70 |   library(fixest)
 71 |   models <- enquo(models)
 72 |   test_tune <- enquo(test_tune)
 73 |   obs <- enquo(obs)
 74 |   pred <- enquo(pred)
 75 |   loc_id <- enquo(loc_id)
 76 |   
 77 |   data %<>% 
 78 |     rename(obs = !!obs, 
 79 |            pred = !!pred, 
 80 |            loc_id = !!loc_id) %>% 
 81 |     mutate(month = lubridate::month(date),
 82 |            year = lubridate::year(date),
 83 |            month_year = paste0(month, "_", year))
 84 |   day_eval <-  data %>%
 85 |     nest_by(!!models, !!test_tune) %>%
 86 |     # for each model and test vs train folds, calculate metrics
 87 |     mutate(wr2_day = r2(feols(obs ~ pred | loc_id + year,
 88 |                               data = data), "wr2"),
 89 |            r2_day = r2(feols(obs ~ pred,
 90 |                              data = data), "r2"),
 91 |            rmse_day = sqrt(mean((data$obs - data$pred)^2)),
 92 |            me_day = mean(data$pred - data$obs),
 93 |            nobs_day = (data %>% filter(!is.na(pred)) %>% nrow)) %>% 
 94 |     select(-data) 
 95 |   day_eval_sub50 <-  data %>%
 96 |     filter(obs < 50) %>% 
 97 |     nest_by(!!models, !!test_tune) %>%
 98 |     # for each model and test vs train folds, calculate metrics
 99 |     mutate(wr2_day_sub50 = r2(feols(obs ~ pred | loc_id + year,
100 |                                     data = data), "wr2"),
101 |            r2_day_sub50 = r2(feols(obs ~ pred,
102 |                                    data = data), "r2"),
103 |            rmse_day_sub50 = sqrt(mean((data$obs - data$pred)^2)),
104 |            me_day_sub50 = mean(data$pred - data$obs),
105 |            nobs_day_sub50 = (data %>% filter(!is.na(pred)) %>% nrow)) %>% 
106 |     select(-data) 
107 |   day_eval_over50 <-  data %>%
108 |     filter(obs >= 50) %>% 
109 |     nest_by(!!models, !!test_tune) %>%
110 |     # for each model and test vs train folds, calculate metrics
111 |     mutate(wr2_day_over50 = r2(feols(obs ~ pred | loc_id + year,
112 |                                      data = data), "wr2"),
113 |            r2_day_over50 = r2(feols(obs ~ pred,
114 |                                     data = data), "r2"),
115 |            rmse_day_over50 = sqrt(mean((data$obs - data$pred)^2)),
116 |            me_day_over50 = mean(data$pred - data$obs),
117 |            nobs_day_over50 = (data %>% filter(!is.na(pred)) %>% nrow)) %>% 
118 |     select(-data) 
119 |   month_eval <- data %>% 
120 |     group_by(loc_id, month_year, !!models, !!test_tune) %>% 
121 |     summarise(pred = mean(pred), 
122 |               obs = mean(obs),
123 |               year = unique(year),
124 |               .groups = "drop") %>%
125 |     nest_by(!!models, !!test_tune) %>%
126 |     # for each model and test vs train folds, calculate metrics
127 |     mutate(wr2_month = r2(feols(obs ~ pred | loc_id + year,
128 |                                 data = data), "wr2"),
129 |            r2_month = r2(feols(obs ~ pred,
130 |                                data = data), "r2"),
131 |            rmse_month = sqrt(mean((data$obs - data$pred)^2)),
132 |            me_month = mean(data$pred - data$obs),
133 |            nobs_month = (data %>% filter(!is.na(pred)) %>% nrow)) %>% 
134 |     select(-data)
135 |   year_eval <- data %>% 
136 |     group_by(loc_id, year, !!models, !!test_tune) %>% 
137 |     summarise(pred = mean(pred), 
138 |               obs = mean(obs),
139 |               .groups = "drop") %>%
140 |     nest_by(!!models, !!test_tune) %>%
141 |     # for each model and test vs train folds, calculate metrics
142 |     mutate(wr2_year = r2(feols(obs ~ pred | loc_id + year,
143 |                                data = data), "wr2"),
144 |            r2_year = r2(feols(obs ~ pred,
145 |                               data = data), "r2"),
146 |            rmse_year = sqrt(mean((data$obs - data$pred)^2)),
147 |            me_year = mean(data$pred - data$obs),
148 |            nobs_year = (data %>% filter(!is.na(pred)) %>% nrow)) %>% 
149 |     select(-data)
150 |   day_bin <- data %>% 
151 |     mutate(obs_bin = cut(obs, breaks = bins, 
152 |                          right = FALSE, include.lowest = TRUE, 
153 |                          ordered_result = TRUE),
154 |            pred_bin = cut(pred, breaks = bins, 
155 |                           right = FALSE, include.lowest = TRUE, 
156 |                           ordered_result = TRUE)) %>% 
157 |     mutate(across(ends_with("_bin"), as.numeric),
158 |            bin_dist = abs(obs_bin - pred_bin)) %>% 
159 |     group_by(!!models, !!test_tune) %>%
160 |     summarise(binDist_day = mean(bin_dist),
161 |               pctCorrectBin_day = mean(obs_bin == pred_bin),
162 |               wrongBinDist_day = weighted.mean(bin_dist, bin_dist > 0),
163 |               .groups = "drop")
164 |   
165 |   full_join(day_eval,
166 |             day_eval_sub50) %>% 
167 |     full_join(day_eval_over50) %>% 
168 |     full_join(month_eval) %>%
169 |     full_join(year_eval) %>%
170 |     full_join(day_bin) %>%
171 |     group_by(!!test_tune) %>%
172 |     # rank the rmse and r2 for each model fold that was left out
173 |     mutate(across(contains("r2"), list(rank =~ data.table::frankv(.x, order = -1))),
174 |            across(contains("rmse"), list(rank =~ data.table::frankv(.x, order = 1))),
175 |            across(contains("me"), list(rank =~ data.table::frankv(abs(.x), order = 1))),
176 |            across(contains("Dist"), list(rank =~ data.table::frankv(.x, order = 1))),
177 |            across(contains("pctCorrect"), list(rank =~ data.table::frankv(.x, order = -1)))) %>%
178 |     ungroup %>%
179 |     # name the other columns with value to distinguish them from the rank columns
180 |     dplyr::rename_with(function(x){paste0(x, "_value")}, !contains("rank") & !(!!test_tune) & !(!!models)) %>%
181 |     return
182 | }
183 | 
184 | # Plotting
185 | mid_rescaler <- function(mid = 0) {
186 |   function(x, to = c(0, 1), from = range(x, na.rm = TRUE)) {
187 |     scales::rescale_mid(x, to, from, mid)
188 |   }
189 | }
190 | 


--------------------------------------------------------------------------------
/scripts/setup/00_03_load_settings.R:
--------------------------------------------------------------------------------
 1 | #-------------------------------------------------------------------------------
 2 | # Provide your Google Earth Engine email
 3 | gee_email = "INSERT YOUR GEE EMAIL HERE, e.g. jdoe@stanford.edu"
 4 | try(ee_Initialize(email = gee_email))
 5 | 
 6 | # Set the number of cores to use in parallel computing
 7 | num_cores = Sys.getenv("SLURM_CPUS_PER_TASK")
 8 | if (nchar(num_cores) > 0) {
 9 |   num_cores = as.integer(num_cores) - 1
10 | } else {
11 |   num_cores = 1 # default sequential
12 | }
13 | 
14 | # Provide your US Census API Key
15 | key <- "INSERT YOUR US CENSUS BUREAU API KEY HERE"
16 | census_api_key(key)
17 | 
18 | #-------------------------------------------------------------------------------
19 | # Set to location of Dropbox and GitHub folders
20 | path_dropbox = "INSERT PATH TO DROPBOX FOLDER HERE"
21 | path_github = "INSERT PATH TO GITHUB REPO HERE"
22 | 
23 | # File paths based on root folders above
24 | path_data = file.path(path_dropbox, "data")
25 | path_output = file.path(path_dropbox, "output")
26 | path_final = file.path(path_dropbox, "final")
27 | path_tables = file.path(path_github, "tables", "raw")
28 | path_figures = file.path(path_github, "figures", "raw")
29 | path_setup = file.path(path_github, "scripts", "setup")
30 | path_main = file.path(path_github, "scripts", "main")
31 | path_supplementary = file.path(path_github, "scripts", "supplementary")
32 | 
33 | #-------------------------------------------------------------------------------
34 | nonContig_stateFIPS <- c("02","60","66","15","72","78","69")
35 | conus_stusps = setdiff(states()$STUSPS, c("AK", "AS", "GU", "HI", "MP", "PR", "VI"))
36 | 


--------------------------------------------------------------------------------
/tables/clean/tableS01_distance_to_plume_effects.tex:
--------------------------------------------------------------------------------
 1 | \begin{table}[H]
 2 | \caption{Estimated effect on monitor \pmt of being underneath or near a plume or smoke day. The third and fourth columns (labeled w/ met.) control flexibly for meteorological variables including temperature, dewpoint temperature, planetary boundary layer minimum, maximum and mean height. Plume variables are based solely on the presence of HMS plumes, while smoke variables also include days identified as smoke-days based on AOD missingness and HYSPLIT trajectories. All regressions include monitor-month and state-month-year fixed effects.}
 3 | \begin{tabular}{lcccc}
 4 | \tabularnewline\midrule\midrule
 5 | %Dependent Variable:&\multicolumn{4}{c}{pm25}\\
 6 |   & plumes & smoke days & plumes w/ met. & smoke days w/ met.\\
 7 | \midrule
 8 | plume dist: overhead&4.504$^{***}$ &    & 3.099$^{***}$ &   \\
 9 |   &(0.0510) &    & (0.0410) &   \\
10 | plume dist: 0-100&1.272$^{***}$ &    & 0.3164$^{***}$ &   \\
11 |   &(0.0203) &    & (0.0226) &   \\
12 | plume dist: 100-250&1.012$^{***}$ &    & 0.2041$^{***}$ &   \\
13 |   &(0.0150) &    & (0.0139) &   \\
14 | plume dist: 250-500&0.7020$^{***}$ &    & 0.1632$^{***}$ &   \\
15 |   &(0.0127) &    & (0.0104) &   \\
16 | plume dist: 500-750&0.3203$^{***}$ &    & 0.1121$^{***}$ &   \\
17 |   &(0.0117) &    & (0.0094) &   \\
18 | plume dist: 750-1000&0.0608$^{***}$ &    & 0.0619$^{***}$ &   \\
19 |   &(0.0105) &    & (0.0091) &   \\
20 | smoke dist: overhead&   & 4.934$^{***}$ &    & 3.399$^{***}$\\
21 |   &   & (0.0482) &    & (0.0385)\\
22 | smoke dist: 0-100&   & 1.947$^{***}$ &    & 0.8033$^{***}$\\
23 |   &   & (0.0202) &    & (0.0211)\\
24 | smoke dist: 100-250&   & 1.449$^{***}$ &    & 0.5029$^{***}$\\
25 |   &   & (0.0160) &    & (0.0141)\\
26 | smoke dist: 250-500&   & 0.9806$^{***}$ &    & 0.3568$^{***}$\\
27 |   &   & (0.0140) &    & (0.0113)\\
28 | smoke dist: 500-750&   & 0.4757$^{***}$ &    & 0.2037$^{***}$\\
29 |   &   & (0.0127) &    & (0.0103)\\
30 | smoke dist: 750-1000&   & 0.1570$^{***}$ &    & 0.1093$^{***}$\\
31 |   &   & (0.0113) &    & (0.0100)\\
32 | \midrule \emph{Fixed-effects}&   &   &   &  \\
33 | id$\times$month & Yes & Yes & Yes & Yes\\
34 | state$\times$year$\times$month & Yes & Yes & Yes & Yes\\
35 | \midrule
36 | Observations & 4,442,067&4,442,067&4,442,067&4,442,067\\
37 | Within R$^2$ & 0.021599&0.02626&0.125643&0.127239\\
38 | \midrule\midrule\multicolumn{5}{l}{\emph{One-way (id$\times$month) standard-errors in parentheses}}\\
39 | \multicolumn{5}{l}{\emph{Signif. Codes: ***: 0.01, **: 0.05, *: 0.1}}\\
40 | \end{tabular}
41 | \label{tab:plume_accuracy}
42 | \end{table}
43 | 


--------------------------------------------------------------------------------
/tables/clean/tableS02_anomAOD_model_inputs.tex:
--------------------------------------------------------------------------------
 1 | \begin{table}[H]
 2 |     \centering
 3 |     \caption{AOD anomaly model inputs. NED = National Elevation Database, NLCD = National Land Cover Database, PBL = planetary boundary layer.}
 4 |     \begin{tabular}{l|c|c}
 5 |         Feature & Source & Native resolution\\
 6 |         \hline
 7 |         \hline
 8 |         \shortstack[l]{Aerosol optical thickness anomalies on smoke-days\\ (current, 1-day, 2-day, and 3-day lagged)}& MERRA-2 & ~50km\\
 9 |         \hline 
10 |         \shortstack[l]{Elevation \\ (mean and standard deviation in grid cells)} & USGS NED & ~10m \\
11 |         \hline
12 |         \shortstack[l]{Percent of area in each Level 1 land cover class \\ (water, developed, barren, shrubland, herbaceous,\\cultivated, forest, wetlands)} & USGS NLCD & 30m \\
13 |         \hline 
14 |         Distance to nearest fire cluster & HMS fire points & - \\
15 |         \hline 
16 |         \shortstack[l]{Size of nearest fire cluster \\ (area and number of constituent fire points)} & HMS fire points & - \\
17 |         \hline 
18 |         \shortstack[l]{Meteorology \\ (daily mean, max, and min PBL,\\average sea level pressure)} & ERA5 global & ~30km \\
19 |         \hline
20 |         \shortstack[l]{Meteorology \\ (total precipitation, average 2m air temperature,\\ average eastward and northward wind speed,\\ average surface pressure, 2m dewpoint temperature)} & ERA5 land & ~11km \\
21 |         \hline 
22 |         Latitude, Longitude, Month & & - \\
23 |     \end{tabular}
24 |     \label{tab:aodinputs}
25 | \end{table}
26 | 


--------------------------------------------------------------------------------
/tables/clean/tableS03_smokePM_model_inputs.tex:
--------------------------------------------------------------------------------
 1 | \begin{table}[H]
 2 |     \centering
 3 |     \caption{Smoke \pmt Model inputs. NED = National Elevation Database, NLCD = National Land Cover Database}
 4 |     \begin{tabular}{l|c|c}
 5 |         Feature & Source & Native resolution\\
 6 |         \hline
 7 |         \hline
 8 |         \shortstack[l]{Aerosol optical thickness anomalies\\ (current, 1-day, 2-day, and 3-day lagged)}& MERRA-2 & ~50km\\
 9 |         \hline 
10 |         Percent of AOD observations missing & MODIS MAIAC & 1km \\
11 |         \hline
12 |         \shortstack[l]{Predicted aerosol optical depth anomalies \\ (min, max, mean, 25th, 50th, and 75th percentiles)} & \shortstack[l]{predicted, trained on \\ MODIS MAIAC} & - \\
13 |         \hline 
14 |         \shortstack[l]{Elevation \\ (mean and standard deviation in grid cells)} & USGS NED & ~10m \\
15 |         \hline
16 |         \shortstack[l]{Percent of area in each Level 1 land cover class \\ (water, developed, barren, shrubland, herbaceous,\\cultivated, forest, wetlands)} & USGS NLCD & 30m \\
17 |         \hline 
18 |         Distance to nearest fire cluster & HMS fire points & - \\
19 |         \hline 
20 |         \shortstack[l]{Size of nearest fire cluster \\ (area and number of constituent fire points)} & HMS fire points & -\\
21 |         \hline 
22 |         \shortstack[l]{Meteorology \\ (daily mean, max, and min PBL,\\average sea level pressure)} & ERA5 global & ~30km \\
23 |         \hline
24 |         \shortstack[l]{Meteorology \\ (total precipitation, average 2m air temperature,\\ average eastward and northward wind speed,\\ average surface pressure, 2m dewpoint temperature)} & ERA5 land & ~11km \\
25 |         \hline 
26 |         \shortstack[l]{HYSPLIT trajectory points in 50 km buffer \\ (by height quintiles: 0 - 1.1, 1.1-1.8, 1.8 - 2.7,\\2.7 - 4.3, 4.3+ km AGL)} & \shortstack[l]{HYSPLIT simulations\\from HMS HYSPLIT points} & - \\ 
27 |         \hline
28 |         Latitude, Longitude, Month & & - \\
29 |     \end{tabular}
30 |     \label{tab:inputs}
31 | \end{table}
32 | 


--------------------------------------------------------------------------------
/tables/clean/tableS04_model_performance.tex:
--------------------------------------------------------------------------------
 1 | \begin{table}[ht]
 2 | \caption{Out-of-sample smoke \pmt model performance on different subset or aggregations of the data (rows) and different performance metrics (columns). Within $R^2$ controls for monitor and year fixed effects. }
 3 | \centering
 4 | \begin{tabular}{lccc}
 5 |   \hline
 6 |    & within $R^2$ & $R^2$ & RMSE \\ 
 7 |   \hline
 8 |    smoke days & 0.65 & 0.67 & 9.57 \\ 
 9 |    day $<$50\mg & 0.47 & 0.50 & 5.07 \\ 
10 |    day $\geq$50\mg & 0.36 & 0.44 & 70.99 \\ 
11 |    month avg & 0.67 & 0.67 & 3.75 \\ 
12 |    year avg & 0.68 & 0.72 & 2.89 \\ 
13 |    \hline
14 | \end{tabular}
15 | \label{tab:model_performance}
16 | \end{table}
17 | 


--------------------------------------------------------------------------------
/tables/raw/tableS01.tex:
--------------------------------------------------------------------------------
 1 | 
 2 | \begin{tabular}{lcccc}
 3 | \tabularnewline\midrule\midrule
 4 | Dependent Variable: & \multicolumn{4}{c}{pm25}\\
 5 |  & plumes & plumes + hysplit & plumes w meteorology & plumes + hysplit w meteorology \\ 
 6 | Model: & (1) & (2) & (3) & (4)\\
 7 | \midrule \emph{Variables} &   &   &   &  \\
 8 | plume\_dist\_factor $=$ overhead & 4.504$^{***}$ &    & 3.099$^{***}$ &   \\
 9 |   & (0.0510) &    & (0.0410) &   \\
10 | plume\_dist\_factor $=$ 0-100 & 1.272$^{***}$ &    & 0.3164$^{***}$ &   \\
11 |   & (0.0203) &    & (0.0226) &   \\
12 | plume\_dist\_factor $=$ 100-250 & 1.012$^{***}$ &    & 0.2041$^{***}$ &   \\
13 |   & (0.0150) &    & (0.0140) &   \\
14 | plume\_dist\_factor $=$ 250-500 & 0.7020$^{***}$ &    & 0.1632$^{***}$ &   \\
15 |   & (0.0127) &    & (0.0104) &   \\
16 | plume\_dist\_factor $=$ 500-750 & 0.3203$^{***}$ &    & 0.1121$^{***}$ &   \\
17 |   & (0.0117) &    & (0.0094) &   \\
18 | plume\_dist\_factor $=$ 750-1000 & 0.0608$^{***}$ &    & 0.0619$^{***}$ &   \\
19 |   & (0.0105) &    & (0.0091) &   \\
20 | smoke\_dist\_factor $=$ overhead &    & 4.934$^{***}$ &    & 3.399$^{***}$\\
21 |   &    & (0.0482) &    & (0.0385)\\
22 | smoke\_dist\_factor $=$ 0-100 &    & 1.947$^{***}$ &    & 0.8033$^{***}$\\
23 |   &    & (0.0202) &    & (0.0211)\\
24 | smoke\_dist\_factor $=$ 100-250 &    & 1.449$^{***}$ &    & 0.5029$^{***}$\\
25 |   &    & (0.0160) &    & (0.0141)\\
26 | smoke\_dist\_factor $=$ 250-500 &    & 0.9806$^{***}$ &    & 0.3568$^{***}$\\
27 |   &    & (0.0140) &    & (0.0113)\\
28 | smoke\_dist\_factor $=$ 500-750 &    & 0.4757$^{***}$ &    & 0.2037$^{***}$\\
29 |   &    & (0.0127) &    & (0.0103)\\
30 | smoke\_dist\_factor $=$ 750-1000 &    & 0.1570$^{***}$ &    & 0.1093$^{***}$\\
31 |   &    & (0.0113) &    & (0.0100)\\
32 | \midrule \emph{Fixed-effects} &   &   &   &  \\
33 | id-month & Yes & Yes & Yes & Yes\\
34 | state-year-month & Yes & Yes & Yes & Yes\\
35 | \midrule \emph{Fit statistics} &   &   &   &  \\
36 | Observations & 4,442,067 & 4,442,067 & 4,442,067 & 4,442,067\\
37 | R$^2$ & 0.32277 & 0.32600 & 0.39479 & 0.39590\\
38 | Within R$^2$ & 0.02160 & 0.02626 & 0.12564 & 0.12724\\
39 | \midrule\midrule\multicolumn{5}{l}{\emph{Clustered (id-month) standard-errors in parentheses}}\\
40 | \multicolumn{5}{l}{\emph{Signif. Codes: ***: 0.01, **: 0.05, *: 0.1}}\\
41 | \end{tabular}
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------