├── .Rprofile
├── .gitignore
├── CHANGELOG.md
├── README.md
├── crosswalks
│   ├── ar_pwsid_lookup.csv
│   ├── county_fips.csv
│   ├── ri_pwsid_lookup.csv
│   └── state_fips_to_abbr.csv
├── docs
│   ├── contributing.md
│   ├── credits.md
│   ├── diagrams
│   │   ├── contributed_pws.drawio
│   │   ├── flow_diagram.drawio
│   │   ├── flow_diagram_v2.drawio
│   │   ├── match_diagrams.drawio
│   │   └── sl-march-2020.excalidraw
│   ├── frs
│   │   └── facility-registry-service-best-pick-processing-v-2.0.pdf
│   └── img
│       ├── contributed_pws.png
│       ├── data_flow_diagram.png
│       ├── data_flow_diagram_v2.png
│       ├── data_sources.png
│       ├── epic_logo.png
│       ├── mapping_diagram.png
│       ├── matches.png
│       ├── matching_diagram.png
│       ├── simplelab_logo.png
│       ├── spatial_assignment.png
│       ├── stacked_match_report.png
│       ├── temm-nation.png
│       ├── tiers_diagram.png
│       └── wadl_logo.jpg
├── etc
│   └── wsb_labeled_simplified.rds
├── layers
│   ├── epa_regions.csv
│   └── us_states.geojson
├── renv.lock
├── renv
│   ├── .gitignore
│   ├── activate.R
│   └── settings.dcf
├── requirements.txt
├── src
│   ├── analysis
│   │   ├── README.md
│   │   └── sandbox
│   │       ├── eda
│   │       │   ├── eda_february.Rmd
│   │       │   ├── explore_wsb_sdwis.py
│   │       │   ├── multipolygon_pwsids_in_labeled_data.Rmd
│   │       │   ├── multipolygon_pwsids_in_labeled_data.html
│   │       │   └── wholesalers.Rmd
│   │       ├── matching
│   │       │   ├── match_reports.py
│   │       │   └── stats.py
│   │       ├── model_explore
│   │       │   ├── .gitignore
│   │       │   ├── 02_random_forest.R
│   │       │   ├── 03_xgboost.R
│   │       │   ├── README.md
│   │       │   ├── archive
│   │       │   │   ├── 01_preprocess.R
│   │       │   │   └── 04_linear.R
│   │       │   ├── etc
│   │       │   │   ├── final_xgb.rds
│   │       │   │   └── xgb_res.rds
│   │       │   ├── model_march.Rmd
│   │       │   └── model_march.html
│   │       ├── report_review
│   │       │   └── report_changes.Rmd
│   │       └── sanity_checks
│   │           └── 01_convex_hull.R
│   ├── combine_tiers.py
│   ├── downloaders
│   │   ├── README.md
│   │   ├── download_contributed_pws.R
│   │   ├── download_echo.R
│   │   ├── download_frs.R
│   │   ├── download_helpers.py
│   │   ├── download_mhp.R
│   │   ├── download_sdwis.py
│   │   ├── download_tigris_ne.R
│   │   ├── download_ucmr.R
│   │   └── states
│   │       ├── download_ar_wsb.R
│   │       ├── download_az_wsb.R
│   │       ├── download_ct_wsb.R
│   │       ├── download_il_wsb.R
│   │       ├── download_ks_wsb.R
│   │       ├── download_mo_wsb.R
│   │       ├── download_nc_wsb.R
│   │       ├── download_nj_wsb.R
│   │       ├── download_nm_wsb.R
│   │       ├── download_ok_wsb.R
│   │       ├── download_pa_wsb.R
│   │       ├── download_ri_wsb.R
│   │       ├── download_state_helpers.R
│   │       ├── download_ut_wsb.R
│   │       └── download_wa_wsb.R
│   ├── functions
│   │   ├── f_clean_whitespace_nas.R
│   │   └── f_drop_imposters.R
│   ├── match
│   │   ├── 0-init.py
│   │   ├── 2-cleansing.py
│   │   ├── 3-matching.py
│   │   ├── 4-rank_boundary_matches.py
│   │   ├── 5-select_modeled_centroids.py
│   │   ├── helpers.py
│   │   ├── init_model.sql
│   │   ├── map_contributed.py
│   │   ├── map_echo.py
│   │   ├── map_frs.py
│   │   ├── map_labeled.py
│   │   ├── map_mhp.py
│   │   ├── map_sdwis.py
│   │   ├── map_tiger.py
│   │   ├── map_ucmr.py
│   │   ├── match_scorer.py
│   │   └── readme.md
│   ├── model
│   │   ├── 01_preprocess.R
│   │   ├── 02_linear.R
│   │   └── README.md
│   ├── run_pipeline.py
│   └── transformers
│       ├── README.md
│       ├── states
│       │   ├── transform_wsb_ar.R
│       │   ├── transform_wsb_az.R
│       │   ├── transform_wsb_ca.R
│       │   ├── transform_wsb_ct.R
│       │   ├── transform_wsb_il.R
│       │   ├── transform_wsb_ks.R
│       │   ├── transform_wsb_mo.R
│       │   ├── transform_wsb_nc.R
│       │   ├── transform_wsb_nj.R
│       │   ├── transform_wsb_nm.R
│       │   ├── transform_wsb_ok.R
│       │   ├── transform_wsb_pa.R
│       │   ├── transform_wsb_ri.R
│       │   ├── transform_wsb_tx.R
│       │   ├── transform_wsb_ut.R
│       │   └── transform_wsb_wa.R
│       ├── transform_contributed_pws.R
│       ├── transform_echo.R
│       ├── transform_frs.R
│       ├── transform_labeled.R
│       ├── transform_mhp.R
│       ├── transform_sdwis_geo_areas.py
│       ├── transform_sdwis_helpers.py
│       ├── transform_sdwis_service.py
│       ├── transform_sdwis_ws.py
│       ├── transform_tigris_ne.R
│       └── transform_ucmr.R
└── wsb.Rproj
/.Rprofile:
--------------------------------------------------------------------------------
1 | source("renv/activate.R")
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | .Rapp.history
4 |
5 | # Session Data files
6 | .RData
7 |
8 | # User-specific files
9 | .Ruserdata
10 |
11 | # Example code in package build process
12 | *-Ex.R
13 |
14 | # Output files from R CMD build
15 | /*.tar.gz
16 |
17 | # Output files from R CMD check
18 | /*.Rcheck/
19 |
20 | # RStudio files
21 | .Rproj.user/
22 |
23 | # produced vignettes
24 | vignettes/*.html
25 | vignettes/*.pdf
26 |
27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
28 | .httr-oauth
29 |
30 | # knitr and R markdown default cache directories
31 | *_cache/
32 | /cache/
33 |
34 | # Temporary files created by R markdown
35 | *.utf8.md
36 | *.knit.md
37 |
38 | # Environment Variables
39 | .Renviron
40 | .env
41 |
42 | # local paths to ignore: raw data and staging
43 | /data
44 | /staging
45 | /output
46 | /log
47 |
48 | # pesky OSX
49 | .DS_Store
50 |
51 | # virtual environment
52 | .venv
53 |
54 | # vs code
55 | .vscode
56 |
57 | # pycache
58 | __pycache__
59 |
60 | # package
61 | package-lock.json
62 | package.json
63 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Water Service Boundaries - Change Log
2 |
3 | # 3.0.0 (2022-10-31)
4 | * Added manually-contributed systems from the Internet of Water's [Github](https://github.com/cgs-earth/ref_pws/raw/main/02_output/contributed_pws.gpkg)
5 | * Refactored to use GeoPackage through most of the pipeline instead of GeoJSON
6 | * Added `geometry_source_detail` column, to document where the data provider got the geometries from
7 |
8 | # 2.4.0 (2022-09-27)
9 | * Added Arkansas labeled boundaries. The original data source did not include water system IDs, but a match on names was fairly comprehensive. We supplemented it with ~40 manually looked-up water system IDs for the remaining non-matches (see the sketch below). There are still 12 systems with shapefiles in the underlying data for which we could not match any water system ID.
10 |
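For illustration, a minimal pandas/geopandas sketch of how such a name crosswalk can be applied. The staging path and the `ar` frame (assumed to already carry the name-matched IDs) are hypothetical; the crosswalk itself ships as `crosswalks/ar_pwsid_lookup.csv` with columns `pws_name,pwsid`:

```python
import geopandas as gpd
import pandas as pd

ar = gpd.read_file("staging/ar_boundaries.gpkg")        # hypothetical path
lookup = pd.read_csv("crosswalks/ar_pwsid_lookup.csv")  # manual name -> pwsid

# attach the manually looked-up IDs, then fill only the rows that the
# name-based match left empty
ar = ar.merge(lookup, on="pws_name", how="left", suffixes=("", "_manual"))
ar["pwsid"] = ar["pwsid"].fillna(ar["pwsid_manual"])
ar = ar.drop(columns="pwsid_manual")
```
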
11 | # 2.3.0 (2022-09-02)
12 | * Added Rhode Island labeled boundaries. The original data source did not include PWS IDs, so these were supplemented through manual effort by the EPIC team.
13 |
14 | # 2.2.0 (2022-08-23)
15 | * With version 2.0, we changed logic to eliminate Tier 2b, meaning only one PWS could "own" any particular TIGER place. This caused many PWSs that were formerly Tier 2b to fall back to Tier 3. In some cases, these relied on low-quality county or state centroids from ECHO, resulting in a less accurate map. This release addresses that problem: for PWSs that (1) have a low-quality centroid, (2) have a matched TIGER boundary, but (3) were not selected as the "best" match for that boundary, we overwrite the centroid with a calculated centroid from the top-ranked matched boundary (see the sketch below).
16 | * Refactor to preserve all "ranked" boundary matches, not just the "best" match.
17 | * Saving the final "master" records back to the database
18 | * Added "tier" column to the database
19 |
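A minimal sketch of the centroid-overwrite rule described above; column names and quality labels are illustrative rather than the pipeline's exact schema (the real logic lives in `src/match/5-select_modeled_centroids.py`):

```python
import pandas as pd

def overwrite_low_quality_centroids(systems: pd.DataFrame,
                                    ranked: pd.DataFrame) -> pd.DataFrame:
    """Sketch: `systems` has one row per PWS; `ranked` holds ranked boundary
    matches per PWS plus a flag for whether the PWS "won" the boundary."""
    top = (ranked
           .sort_values(["pwsid", "match_rank"])  # rank 1 = top-ranked boundary
           .drop_duplicates("pwsid"))
    m = systems.merge(
        top[["pwsid", "bound_centroid_lat", "bound_centroid_lon", "won_boundary"]],
        on="pwsid", how="left")

    # (1) low-quality centroid, (2) a matched boundary exists,
    # (3) the PWS was not selected as the "best" match for that boundary
    mask = (m["centroid_quality"].isin(["county centroid", "state centroid"])
            & m["bound_centroid_lat"].notna()
            & ~m["won_boundary"].fillna(False).astype(bool))

    m.loc[mask, ["centroid_lat", "centroid_lon"]] = \
        m.loc[mask, ["bound_centroid_lat", "bound_centroid_lon"]].to_numpy()
    m.loc[mask, "centroid_quality"] = "calculated from matched boundary"
    return m
```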
20 |
21 | # 2.1.0 (2022-08-09)
22 | * Improved logic for how "impostors" are identified. Here is a summary of impacts (a sketch of the new rule follows the table):
23 |
24 | | Category | Echo | FRS | Reason |
25 | |----------|-------|-----|-----------|
26 | | Rejected only before | 114 | 336 | 20 ECHO and 1 FRS are now allowed because they're within 50 km of the primacy agency's border. 326 FRS are no longer in the system at all now that the ECHO records come through (the FRS mapping is unusual in that it skips records that already come through via ECHO, since FRS largely duplicates ECHO). 9 FRS were rejected for being tribal regions whose primacy_agency_code is not a state. 91 ECHO records had a NULL state. |
27 | | Rejected both times | 6 | 26 | Legit impostors, identified both times. |
28 | | Rejected only after | 292 | 0 | These were previously allowed because the lat/long was consistent with the _address's state_ but not the _primacy agency's_. Under the new logic, a point may lie outside the primacy agency's state, but no farther than 50 km away. |
29 |
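The actual implementation is in R (`src/functions/f_drop_imposters.R`); the sketch below expresses the new 50 km rule in geopandas, with illustrative column names:

```python
import geopandas as gpd

def flag_impostors(facilities: gpd.GeoDataFrame,
                   states: gpd.GeoDataFrame,
                   buffer_m: float = 50_000) -> gpd.GeoDataFrame:
    # project to a meter-based CRS so the buffer is 50 km (EPSG:5070, CONUS)
    states = states.to_crs(epsg=5070)
    facilities = facilities.to_crs(epsg=5070)

    # grow each state polygon by 50 km, then ask which buffered states
    # each facility point falls within
    buffered = states.assign(geometry=states.buffer(buffer_m))
    hits = gpd.sjoin(facilities, buffered[["state_abbr", "geometry"]],
                     predicate="within")

    # a point is legitimate if one of the states containing it is its
    # primacy agency; everything else is flagged
    ok = hits.loc[hits["primacy_agency_code"] == hits["state_abbr"]].index
    facilities["is_impostor"] = ~facilities.index.isin(ok)
    return facilities
```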
30 |
31 | # 2.0.0 (2022-07-01)
32 | * No longer dropping any PWS's (but some results have tier "none", indicating no geometry)
33 | * Added Utah and Illinois labeled boundaries
34 | * Eliminated Tier 2b by implementing ranking and selection of the best PWS per TIGER place (see the sketch below); roughly 3,000 became Tier 2a, and the remaining 7,000 became Tier 3
35 | * Renamed some columns:
36 | * geometry_lat -> centroid_lat
37 | * geometry_lon -> centroid_lon
38 | * geometry_quality -> centroid_quality
39 | * tiger_geoid -> matched_bound_geoid
40 | * tiger_name -> matched_bound_name
41 | * Cleaned up column names in the shapefile
42 | * Improved matching to MHPs, and prevented MHPs from matching to TIGER places
43 | * Pulled in population data for TIGER places, to help deduplicate matches
44 | * Misc bugfixes and performance improvements
45 |
46 | | Tier 1 | Tier 2a | Tier 2b | Tier 3 | None | Total |
47 | |---------|---------|----------|---------|--------|--------|
48 | | 16,896 | 11,526 | 0 | 17,526 | 3,476 | 49,424 |
49 |
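A sketch of the ranking-and-selection step that eliminated Tier 2b; the scoring column is illustrative (the pipeline ranks on several signals, including the TIGER place population mentioned above):

```python
import pandas as pd

def assign_tiers(matches: pd.DataFrame) -> pd.DataFrame:
    """Each TIGER place keeps only its best-scoring PWS (Tier 2a);
    every other matched PWS falls back to Tier 3."""
    winners = (matches
               .sort_values(["tiger_geoid", "match_score"],
                            ascending=[True, False])
               .drop_duplicates("tiger_geoid")  # one winner per place
               .index)
    matches["tier"] = "3"
    matches.loc[matches.index.isin(winners), "tier"] = "2a"
    return matches
```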
50 |
51 | # 1.0.0 (2022-05-02)
52 | Initial release
53 |
54 | | Tier 1 | Tier 2a | Tier 2b | Tier 3 | None | Total |
55 | |---------|---------|----------|---------|-------|---------|
56 | | 14,607 | 9,488 | 10,104 | 10,720 | 0 | 44,919* |
57 |
58 | *Note: 4,505 systems were dropped due to missing geometry or not falling within the 50 US states.
59 |
--------------------------------------------------------------------------------
/crosswalks/ar_pwsid_lookup.csv:
--------------------------------------------------------------------------------
1 | pws_name,pwsid
2 | SHADY ACRES MOBILE HOME PARK,
3 | AURELLE WATER SYSTEM,
4 | COMMUNITY WATER ASSOCIATION (SALESVILLE),AR0000036
5 | BRADLEY CO RURAL WATER ASSN,AR0000054
6 | FOUR MILE HILL WATER ASSOC,AR0000586
7 | SOUTHWEST ARKANSAS WATER SYST,AR0000889
8 | "PARAGOULD, LIGHT, WATER, & CABLE",AR0000222
9 | CHIDESTER WATERWORKS,AR0000403
10 | CLARK CO COUNTRY WATER FACILIT,AR0000741
11 | MCGEHEE WATERWORKS,AR0000170
12 | STAR CITY WATER WATERWORKS,AR0000318
13 | SHIRLEY WATERWORKS,
14 | PARON-OWENSVILLE WATER AUTH,
15 | LIBERTY - WOODSON / HENSLEY,AR0000471
16 | NAIL SWAIN WATER ASSOCIATION,AR0000856
17 | RAMBO WATER DISTRICT #1 INC,
18 | FURLOW WATER ASSOCIATION,AR0000645
19 | PARKIN RURAL WATER ASSOCIATION,AR0000662
20 | MENA WATER DEPT,AR0000438
21 | BASSETT WATERWORKS,AR0000377
22 | SANDRIDGE-BARDSTOWN WTR ASSOC,
23 | O'KEAN WATERWORKS,AR0000473
24 | BEAVERFORK VOLUNTEER FD WSD,AR0000844
25 | HOT SPRINGS VILLAGE WATER,AR0000208
26 | CENTRAL PUBLIC WATER AUTHORITY,
27 | OLD UNION WATER ASSOCIATION,AR0000559
28 | ODEN- PENCIL BLUFF WATER ASSN.,AR0000616
29 | VANDERVOORT WATERWORKS,
30 | OSAGE POINT MHP,
31 | CLARKSVILLE WATERWORKS,AR0000289
32 | BOIS D'ARC WATER SYSTEM,AR0000178
33 | WIEDERKEHR VILLAGE WATER DEPT,
34 | LAFE REGIONAL WATER DISTRIBUTION DISTRICT,AR0000483
35 | EAST PRAIRIE CNTY PUBLIC WATER AUTHORITY,AR0000458
36 | KINGWOOD MHP,
37 | FRANKLIN-SEBATIAN PWA,AR0001077
38 | BELLA VISTA P.O.A.,AR0000039
39 | RIDGEFIELD ESTATES,AR0000776
40 | JAMES FORK REGIONAL WATER,AR0000513
41 | SOUTH PIKE CO. WATER,AR0000978
42 | MID-ARKANSAS UTILITIES PWA,AR0000725
43 | MONTROSE / BOYDELL WATER SYSTEM,AR0000014
44 | OZAN CREEK RURAL WATER SYSTEM,AR0001078
45 | LAKEVIEW MIDWAY PUBLIC WATER AUTHORITY,AR0000027
46 | CONCORD WATER & SEWER PFB,AR0000147
47 | LONOKE WHITE PUBLIC WATER AUTH,AR0001076
48 | BAXTER-MARION REGIONAL WATER ASSOCIATION,AR0001178
49 | BEDFORD FALLS MHP,
50 | HIGHLAND PUBLIC WATER AUTHORITY,AR0000672
--------------------------------------------------------------------------------
/crosswalks/ri_pwsid_lookup.csv:
--------------------------------------------------------------------------------
1 | ID,H20_DISTRI,NAME,POP_SERVED,PWSID,pws_name,Notes
2 | 1,Block Island Water Department,BLOCK ISLAND,n/a,RI1858430,BLOCK ISLAND WATER COMPANY,
3 | 3,Cumberland Water District,CUMBERLAND,"28,586 (1995)",RI1647530,"CUMBERLAND, TOWN OF",
4 | 4,East Providence Water District,EAST PROVIDENCE,"50,857 (1992)",RI1615610,EAST PROVIDENCE-CITY OF,
5 | 5,East Smithfield Water District,NORTH PROVIDENCE,7450 (1992),RI1592024,PROVIDENCE-CITY OF,Became part of Providence in 2017
6 | 6,Greenville Water District,SMITHFIELD,8100 (1998),RI1858410,GREENVILLE WATER DISTRICT,
7 | 7,Jamestown Water District,JAMESTOWN,5339 (2000),RI1858419,JAMESTOWN WATER DEPARTMENT,
8 | 8,Johnston Water Department,JOHNSTON,4965 (1999),RI1592024,PROVIDENCE-CITY OF,Became part of providence in 2021
9 | 9,Kent County Water Authority,EAST GREENWICH,"63,706 (1993)",RI1559511,KENT COUNTY WATER AUTHORITY,*Their spreadsheet says RI1592021
10 | 10,Kingston Water District,SOUTH KINGSTOWN,3800 (1999),RI1858421,KINGSTON WATER DISTRICT,
11 | 11,Narragansett Water -- North End System,NARRAGANSETT,"12,389 (1997)",RI1858429,NARRAGANSETT WATER DEPT-NORTH END,
12 | 12,Narragansett Water -- South End System,NARRAGANSETT,"12,389 (1997)",RI1858428,NARRAGANSETT WATER SYSTEM-POINT JUDITH,
13 | 13,Newport Water District,MIDDLETOWN,"43,825 (1992)",RI1592010,NEWPORT-CITY OF,
14 | 14,North Kingstown Water District,NORTH KINGSTOWN,"26,821 (2000)",RI1559517,NORTH KINGSTOWN TOWN OF,
15 | 15,North Tiverton Water District,TIVERTON,8587 (1990),RI1592018,NORTH TIVERTON FIRE DISTRICT,
16 | 16,Pascoag Water and Fire District,BURRILLVILLE,3900 (2000),RI1592020,"PASCOAG UTILITY DISTRICT, WATER DIVISION",
17 | 17,Pawtucket Water Supply Board,CENTRAL FALLS,"109,042 (2000)",RI1592021,PAWTUCKET-CITY OF,
18 | 18,Portsmouth Water and Fire District,PORTSMOUTH,"15,797 (1994)",RI1592022,PORTSMOUTH WATER & FIRE DISTRICT,
19 | 19,Providence Water Supply Board,PROVIDENCE,"267,157 (1992)",RI1592024,PROVIDENCE-CITY OF,
20 | 20,RIEDC Water District,NORTH KINGSTOWN,5000 (1999),RI1559517,NORTH KINGSTOWN TOWN OF,*Might be covered by RI1559517 or RI1559511
21 | 21,Richmond Water District,RICHMOND,n/a,RI1000040,"RICHMOND, TOWN OF","*Probably RI1000040, but could be RI2980480, RI1647529, RI2980447 or any combination"
22 | 22,Smithfield Water Supply Board,SMITHFIELD,8900 (1998),RI1615616,SMITHFIELD WATER SUPPLY BOARD,
23 | 23,South Kingstown WD(Middlebridge W.System,SOUTH KINGSTOWN,3888 (1992),RI1000015,SOUTH KINGSTOWN-MIDDLEBRIDGE,
24 | 24,South Kingstown WD(South Shore W. System,SOUTH KINGSTOWN,3888 (1992),RI1615623,SOUTH KINGSTOWN-SOUTH SHORE,
25 | 25,Stone Bridge Fire District,TIVERTON,2125 (1993),RI1615619,STONE BRIDGE FIRE DISTRICT,
26 | 26,Tiverton Water District,TIVERTON,n/a,RI1900042,"TIVERTON WATER AUTHORITY, TOWN HALL","*Might be any combination of RI2980001, RI2051311, RI1900042, RI2980003 "
27 | 27,University of Rhode Island Water Facilit,SOUTH KINGSTOWN (URI),5000 (1999),RI1858422,UNIVERSITY OF RHODE ISLAND,
28 | 28,Warwick Water Department,WARWICK,"75,107 (1990)",RI1615627,WARWICK-CITY OF,
29 | 29,Woonsocket Water Division,WOONSOCKET & N. SMITHFIELD,n/a,RI1559518,WOONSOCKET WATER DIVISION,*Spreadsheet says RI1559512
30 | 30,Harrisville Water and Fire District,BURRILLVILLE,2637 (2000),RI1858411,HARRISVILLE FIRE DISTRICT,
31 | 31,Lincoln Water Commission,LINCOLN,"18,301 (1992)",RI1858423,LINCOLN WATER COMMISSION,
32 | 32,Westerly Water Supply System,WESTERLY,"26,842 (1993)",RI1559512,WESTERLY WATER DEPARTMENT,
33 | 33,North Smithfield Water Department,NORTH SMITHFIELD,n/a,RI1615614,SLATERSVILLE PUBLIC SUPPLY,Slatersville according to system website
34 | 34,United Water Rhode Island,SOUTH KINGSTOWN/NARRAGANSETT,"16,700 (1992)",RI1615624,VEOLIA WATER WAKEFIELD RHODE ISLAND INC,
35 | 55,Bristol County Water District,BARRINGTON,"48,853(1993)",RI1647515,BRISTOL COUNTY WATER AUTHORITY,
--------------------------------------------------------------------------------
/crosswalks/state_fips_to_abbr.csv:
--------------------------------------------------------------------------------
1 | state,code
2 | AK,02
3 | AL,01
4 | AR,05
5 | AS,60
6 | AZ,04
7 | CA,06
8 | CO,08
9 | CT,09
10 | DC,11
11 | DE,10
12 | FL,12
13 | GA,13
14 | GU,66
15 | HI,15
16 | IA,19
17 | ID,16
18 | IL,17
19 | IN,18
20 | KS,20
21 | KY,21
22 | LA,22
23 | MA,25
24 | MD,24
25 | ME,23
26 | MI,26
27 | MN,27
28 | MO,29
29 | MS,28
30 | MT,30
31 | NC,37
32 | ND,38
33 | NE,31
34 | NH,33
35 | NJ,34
36 | NM,35
37 | NV,32
38 | NY,36
39 | OH,39
40 | OK,40
41 | OR,41
42 | PA,42
43 | PR,72
44 | RI,44
45 | SC,45
46 | SD,46
47 | TN,47
48 | TX,48
49 | UT,49
50 | VA,51
51 | VI,78
52 | VT,50
53 | WA,53
54 | WI,55
55 | WV,54
56 | WY,56
57 |
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | # Contributor Guidelines
2 |
3 | This document is a primer on contributing to this repository. We strive to uphold a high standard of open-source collaboration, so please read it before contributing. Also read GitHub's [code of conduct](https://github.com/github/docs/blob/a980e8037dfca61dea25796f542ea5c0fad93ee9/CODE_OF_CONDUCT.md) for best practices on interacting with others through the repository.
4 |
5 | ## Types of Contributions
6 |
7 | ### Issue
8 |
9 | Issues enable task-tracking. If you identify something that is incorrect or doesn't run, you can create a clearly labeled issue. If you want to work on a contribution or refactor via a pull request, first create an issue that outlines the problem and obtain a green light from the project maintainer(s).
10 |
11 | ### Pull Request
12 |
13 | Pull requests allow users to suggest changes to the repository. Pull requests are only reviewed and incorporated if they were first agreed upon in an issue by the project maintainer(s). Furthermore, each PR should link to the issue(s) it closes, for clarity.
14 |
15 | ## How to Contribute
16 |
17 | ### Issue
18 |
19 | #### Creating an Issue
20 |
21 | Create a **new issue** if you find a problem you want to fix, identify an error in the code, or propose a refactor. Review existing issues first to make sure you do not double-post. Then follow these steps to post an issue:
22 |
23 | - Clearly label the new issue; e.g., for an issue about the AZ transformer, label it accordingly: `transformer/az {descriptive message}`.
24 | - Use "labels" (e.g., "transformer", "downloader", "analysis", "bugfix", etc.) to clearly mark what part of the pipeline your issue addresses.
25 | - Include a clear description of the issue.
26 |
27 | For issues concerning larger refactors or feature development, indicate that the issue needs review as a proposal, and be sure to include your proposed plan.
28 |
29 | Issues should not be used to make feature requests.
30 |
31 | #### Solving an Issue
32 |
33 | If you review existing issues and find one you would like to solve, assign yourself the issue and propose the solution through a pull request linked to that issue.
34 |
35 | ### Pull Requests
36 |
37 | If you are solving an existing issue or working on an issue you created, always code on a branch off `develop`. Once you have staged and committed your code to the branch, you can create a pull request. You may also create a pull request from a forked repository.
38 |
39 | Please follow the pull request instructions from Creative Commons, [here](https://opensource.creativecommons.org/contributing-code/pr-guidelines/).
40 |
41 | ### Forking
42 |
43 | If you wish to manage your own version of this repository, you can fork the repository. Learn more about forking [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/about-forks).
44 |
--------------------------------------------------------------------------------
/docs/credits.md:
--------------------------------------------------------------------------------
1 | # Credits
2 |
3 | The repository and the code herein were organized, directed, and developed by SimpleLab, Inc. Because the repository is released under the MIT License, the code and data herein may be reused and repurposed.
4 |
5 |
6 |
7 | [SimpleLab website](https://www.gosimplelab.com)
8 |
9 | ## Collaboration
10 |
11 | Water Data Lab contributed technical code and methods development for the initial development of this repository and the TEMM methodology.
12 |
13 |
14 |
15 | [WaDL website](https://www.waterdatalab.com/)
16 |
17 | Environmental Policy Innovation Center (EPIC) financed and supported engagement with the initial development of this repository from February-April, 2022 as part of their efforts with the Justice40 Initiative.
18 |
19 |
20 |
21 |
22 | [EPIC website](https://www.policyinnovation.org/)
23 |
24 | Internet of Water (IoW) provided technical advising and feedback on the approach, and is collaborating as part of the broader effort to expand the use and improvement of water service boundaries.
25 |
26 |
27 |
28 |
29 | [IoW](https://internetofwater.org/)
30 |
31 | For more information about this project, please contact Jess Goddard at \.
32 |
--------------------------------------------------------------------------------
/docs/diagrams/contributed_pws.drawio:
--------------------------------------------------------------------------------
1 | 5Vrbkto4EP2WfaBq98FTvmMehyG33ewm2dlkkqct2RagGttyZDFAvj4tWzaWbcBMMOwFqsBuyS2p+/RRt2Bk3cWbVwyly99piKORqYebkTUbmaZh6C58Ccm2kHiOWQgWjISy005wT75hKdSldEVCnCkdOaURJ6kqDGiS4IArMsQYXavd5jRSR03RArcE9wGK2tIHEvKlXIU53slfY7JYliMb7qRoiVHZWa4kW6KQrmsi68XIumOU8uIq3tzhSBivtMvDm+1D9PbRffXrh+wr+jj97a8/PmmFspenPFItgeGEP1u19wGZL8w7bbX6qnkP8d/Lb7O0VP2EopW0l1wr35YGxEl4K/wAd0GEsowEI2u65HEEAgMuGV0lIRaD6HCHN4R/li3i+ouQ3zjybrapdZtt5U0xJA5bjjyyXqNyAqAX0xhztoXn1js3O9J1y5qHSxnDEeLkSR0TSbQtKnXVCO8pgdmYehkZrtQj48K0JqqKjK5YgOVTdb80FFX42qeII7bAvKUILmrL3olyt58AAbMDAm4ExppmKUoULLhfVwLs04BGlI2sW2HFhf8zzB+G1suvX3KXQqQmXJujmETboutrHD1hTgJUa89yvhCthplu6g3FoKIloSxGUa3tCTGC4DsiC8RXTLDLwX4BSvd1WUtgiEZbL8CoR5hzzDRYfkCSRftJytIlSqRKs5BxvOEaghklhTgA6GJWayMQJIkcSS+XmrdwBsrmoL8cKcFFKwA5p7/aMGvKQnVilS5Yi/9IQJ3QmXFGH7EmQ0Hp56PgcZEHrdbwo2l7hQtNeyIvnNKbudYQB5RB0NBE40sSPCY4k9MjCeGktE+zb82XB/vVpqP0m0cU8aZxQpKlEdqW3SMCDab+E4lTyjgSlp7uAAtXC/H9JgnJEwlXKIq2MFzCGfFXHMjL1N8/3MPn/RKleE4iwJQMAoipIg4KDS16zMQTcBnSYBXnHp6ul4Tje3CSkK9hW1UJ0y8I861fCmC86K5YO2i05o54CzlMsCYvXiIuc+fWWtz8Vc3uCTMw7Vm4VGppcl25udeo1tA7uNbT99OqwmOnkpZhtVjrzbsHUBUsMg0jBrMy9T9xSjPCqRxP8dqaxBHKsdSw/sv8NaAtm3xvTDqM6XYYs7Lw+a1pn74HtINLg36fgALDPJpHYpkuigX4i0/RDMwqmKtsSPysaNkTa4XYZz1n8MxJzxBHeXcgIshBega+YC81sCX9zyruF+CB/S66lQ0xCUPx+BT2LPIN+bkqsemkYnPPvepMR85M6FpxWmyOuWo15CUL1pFbEWMP0B6IqL2oNVTQ2h2YtTswaw8GWbcF2Xvg/gi/RT4oXGf+P5YAbKcHAYwvSwDjcxHAjK4T2K7DIn/YpTb5bT2gi0cCGvv5zn1VArhjGHGRPsxJgoSxYtiz/6c04J5CA1ZPGnAHA67XAdyLF7BgUrb9XL+pPSVud4/ld2cpfGGNeYXZI0sqKshneP1CpfS4kV7azy2lrSOKhi6l9aHQqJ+AxhBly1yBoUBzh8Yv9bZrQVPvC809eclloGlNVES5zS24LzQtt5lANRQNDc09BVOx1el5vv56GzIK9SzD7XTp7FVu6GAvtLuqWc/0rUGrWbORgFldCdhFy1mzqwA7D3FUZFFF/Y8Rh35t4ui7p5lXJQ574qjxbjyXOFopl3VZ4mgXWrdhSESJn6fKKaMBBjSKql6HVB8+VxnWApS1WaSdK/dNaPvn1F20pIbGYLTSyD6sjuy461zHGopVrLbrWi4RJ8HpXovIH+VkiTKqjq5+tAL2XNVS5a9xNUt1mmrAQqKdur1LcaJJPhL7ZCaQZ+oR2ubfl98y5/O5GQRdW2bo+q7Te8s8AJaDdHnZPdHqOpM4O3pPsoWEa0e60IlWQ/dubNdwLE9+jocyVTu5u5ipxgdNZXVb4JjlxoOBqh3mH6vdSr+9QBCfK+89AIS9vKt17EiTDvPb7s3EHMoD7b8c1Dxwid/XrukB7/oOMNu82jehy1re+RdldCfRV/X3J1tJVLQ2f3ldacpg9DU5TvSn1YlqzbcrGm3lwMkzjxyAws17zAisUviycSh6Y+c/W9cORnXj2PlTftdUOTpeWx5MhM93KmVd9b9HzlhBpddQ0Le0NEwV3cb4wqWlMyyahzvBOIiy4+DRrwoe1emu9UzwQFjr9ZepYsmxbgxn7E3sseV6dnOaAyPLbqd5P4is/+55Wl/UGldGbeOnnUljj+2L26Yiq6no2dCE292fgYvuu79UWy++Aw==
--------------------------------------------------------------------------------
/docs/diagrams/flow_diagram_v2.drawio:
--------------------------------------------------------------------------------
1 | 7V1bc6M4Fv41qZp5aApduD0m7stsVc9OajI7mdmXLRlkm24MbpCTeH79CnMxSMKWEyBObLqqg4WQzPnOXUf4Ck2WT19Sslr8mgQ0uoJm8HSFPl5BCDC0+Z+8ZVO0eKZVNMzTMCg77Rruwn9o2WiWreswoFmrI0uSiIWrdqOfxDH1WauNpGny2O42S6L2rCsyp1LDnU8iufU+DNiiaHUtc9f+Cw3ni2pmYJZXlqTqXDZkCxIkj40m9OkKTdIkYcXZ8mlCo5x4FV2K+z53XK2/WEpjpnND5jAn/uX2z6X3Hf5IYIym998+uG4xzAOJ1uUTl9+WbSoSpMk6Dmg+inmFbh4XIaN3K+LnVx856LxtwZYR/wT4aUCyxbZv/mGWxOwzWYZRDv4kWachTfnY/6aP5cVJEiXpdhZkbw/enrE0+V4TG+Y9wyiqesZJTOtOjdst07WcG35FJktJqQeaMvrUaCrJ9IUmS8rSDe9SXsV2yaMlz6ISwccdAwCr7LJogu9hA5UEJSXfzevRd9DwkxKdI5ACSIGUHfGpb0J+Ms9P7hhh9MMqTR7CHDBoRmRKo+1ZQBip+vP561sktDmNWBvSNq1LABSYkCicx/yjzynPcUY3OcVDLkrX5YVlGAT5NEoeanNZF3P0gG4luiW4AEIDeBLAjgJfbA2FrXNYCrn6WOWn/iYKOaFSdFgWpwVJv07rBuJ/n28J/dua8WFohXChdoElAMspP5vNoO+rJC6wp7bVEybQbIMCsSxyrqmQOHMwSFSKURC338ljLmMJZ/Hq2jStLnI60myTMbp8Q2LXC5Rt5cltpSxcCigrJds7ksg+jOR17HPCku1zFJrS/Ikac+MK8jnNCY2zdcZPbiNOu/yExMH2/w1bhPE890kWhOUtYZBfnlHC1inlZzSeczGjadGruG1ZOknTdRgF/MLPIzFIRGfsZLUysFCLbTC2JbaptcQofIPhYaVM4+A6dzJz8YtIloV+GyJN6aJBywmVidQggqWgQdWW0oiw8KHtuqoIU85wm4Qxa4iuBwTRFWibcRfOp+VdTVdTGAgdGoiRdE6ZNNAWp/qxXwCdylc6L+g8z7B6Ak8eamj48PnB1/Z/EHy25B0YaGjorE5jO92ZtqIhtya5sSwMDm/9sc5j4Zt2oFg3V7f/kZI4myXpkqbZ1j6LnleW+vy7s1Y3sU/D5BZfQ2qenrirJthikx+TSU8uHBRCpEZc25AHe1RrrPLiTiNECizqBlgVIrlwivpykPDJhUhYFbXKGYk4IGnAqRfIUliGThxDMi885HONmvCpRU1YIzH4vmwwFhJDEm11bTB2RfWpZ4M5Lcmm0W2Vd8j0v3A1zw72YsReDbx1dq41RgPxhQ01/eqjGQN1TDQsZ6hcvw4LvUoTn2aZhn3uMMf7VxaEFKbrU3UKc+pauRD1o8EFz1thnvEeJu1dgVsaOeV3JqgCAi+IgW1BVJE5bhhldWefXx5GNTigaszH+FC4uNe8A4CrJznw+pUwf3FVZTGrpd7u2OoSbekpDssx2s4fkFUHGNX5szx9VT5ysEUBD7cclTL3bAeRnoItW1Qlrx5sVcH2vmDrNk2etuqWsK3cV4GUuSU84dogO6+oyvaEqMqTYRxVsGygQPFdG2UbCL7oM51n2xaN+zBBlTiPPUJMZZ9dTCVyBfCe66iJwdlQbIHUX3hQvnC6k+m18r6vdD1NH0Kf7pR9/nQR2WwvPoZscSVXA5k//VE4iODnYqF7mXt4qg6w7NBYwFZ1Q5el7JxXcNvq1Hma1lK2or5oMLMDuksg+lqV+Zg8xlFCgv2LMkGz1ytHCQGdkfV25LcSKLTXZR1FlAAUKhsPxVWehjNzRgkfYCoLCUdN+njwAkkTEluWkXHxODvPDmPBIettFUUcqK9kOVbPM6hn52msZr8vvpA8/ufyhehdDebxC/OMwxcaS+qFJ7UskrFvJpszZEE/gi2kqrqE10ryeBql436ynOaWFJosbDjDFyBbIufK5ntcJFV5cLU4FqshFxQVKHZssBkTSGCqYpV+I+CCEepSpssa2FGhrbDeqkiZDFVuCKb25vMsW7nLL3dfrfX//vvPj/iDht+eh0Srjget96OSadXd7CBTdxpJcJgUyrB2olprUMNJ0WGi8FHCVdbFTw0GJtmq2K47C59yDpMCRiv/J3NdcagCyX06bB/E2kzqGgDKVXmoruJpMaZU29MbChoh/pmgUPl7ruGcAC46of5Z4eLVdQ2vCctxdayl2TwmzN5DvROJvRsCUifGDPjMFTdgWoYtIA0AqmcYqUJKo0Ll/GC13zamlRN4AfUQEG8NWB2f5aXpztPHVhcNbXTFooRXQFbeqjm5lrA9jfj1OJ7R8Hwq/9MwbUVg0JDkJi/x7kgEqb8ITa4Y+fPc4LBMw3FVkVoXIK6Bh0PElYNmwzDODBJlJGBBAyKLS45ZHraMDTINbLt9gKN8EI195kNneYDVrupQZb6AqtRYfKVCf/mFI4q/g8RfL7dPenTtt7iJllB3piwQsH2XTmf63Ik70OrW39jAzQPJbx6xsII3LcNqHvZAeCANr/h94FHXWrYk4tThOaIK6l3AA6pXOr4RfMYowdlD2BOJPaAjmJm91Ncvv27fN3C0gcZ4MczpYym+K6snMEH1GsKx0NTYMHwGaIpv7OkHTM2qqN6w1Chc+nx794YqJIYyn1jY7IlNw20dkvXEqgWTsaynRi3T3cf7f12Q5cZQ3Mer4RmpsIW2YbcOPBS4GuVNlPnbfMXZowuQebrwKh9Rw7wOXsNiicZNzgMpt7YPlo+zX8uFtE7K6XC5GXKwtzue52V0DLOpOR07puW6HHfoYQAEKzWwT+LAw1CfcmReya92Igt5RovcDpaErco2Cr5E0d0u0RrKmXA0QvH3AUhVCusJJuPkEdLQju8KIQC8twWQxsvMs8Xq/flrRyOLDWGlvQujprumLk4bDVyNIJpP9i1L4gvA0EZipYyG8KoRPuSo9Iewztag7OGCLgQQSQJ8SvAqn9KWQMrd/bvyY5KyRTJPYhJ92rUK5Nz1+ZokqxLXb5SxTfnTWGTNkjbq9ClkfzXO/26cf3wqx91+2LQwEyIR5Q6OIg7Yw9BF3k9G8aWrFcJ+ECzq3P5CByWQGor4PyyMQrY5g19bUmQG9DMAYgJAUak/1E8tKZGFGqvpRVZkz+O/PDFiCW6n6ifGkMrR7KVQXkkZMOb2nsn2kFlPewPDIabsxr4TE2/0/T1qHI7LUOlVQe9X9scQa6x3ufa6ZcGW627l4Qa2KYrazH24amce3xq0mmhogzvcL5yoYZRd+Nv7Ow3D3r/X3acOrFK2hgkb6VwHyu9SGLRuk3/c/SJqAdnud2XRp/8D
--------------------------------------------------------------------------------
/docs/frs/facility-registry-service-best-pick-processing-v-2.0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/frs/facility-registry-service-best-pick-processing-v-2.0.pdf
--------------------------------------------------------------------------------
/docs/img/contributed_pws.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/contributed_pws.png
--------------------------------------------------------------------------------
/docs/img/data_flow_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/data_flow_diagram.png
--------------------------------------------------------------------------------
/docs/img/data_flow_diagram_v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/data_flow_diagram_v2.png
--------------------------------------------------------------------------------
/docs/img/data_sources.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/data_sources.png
--------------------------------------------------------------------------------
/docs/img/epic_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/epic_logo.png
--------------------------------------------------------------------------------
/docs/img/mapping_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/mapping_diagram.png
--------------------------------------------------------------------------------
/docs/img/matches.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/matches.png
--------------------------------------------------------------------------------
/docs/img/matching_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/matching_diagram.png
--------------------------------------------------------------------------------
/docs/img/simplelab_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/simplelab_logo.png
--------------------------------------------------------------------------------
/docs/img/spatial_assignment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/spatial_assignment.png
--------------------------------------------------------------------------------
/docs/img/stacked_match_report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/stacked_match_report.png
--------------------------------------------------------------------------------
/docs/img/temm-nation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/temm-nation.png
--------------------------------------------------------------------------------
/docs/img/tiers_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/tiers_diagram.png
--------------------------------------------------------------------------------
/docs/img/wadl_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/wadl_logo.jpg
--------------------------------------------------------------------------------
/etc/wsb_labeled_simplified.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/etc/wsb_labeled_simplified.rds
--------------------------------------------------------------------------------
/layers/epa_regions.csv:
--------------------------------------------------------------------------------
1 | region,state
2 | 1,CT
3 | 1,ME
4 | 1,MA
5 | 1,NH
6 | 1,RI
7 | 1,VT
8 | 2,NJ
9 | 2,NY
10 | 2,PR
11 | 2,VI
12 | 3,DE
13 | 3,DC
14 | 3,MD
15 | 3,PA
16 | 3,VA
17 | 3,WV
18 | 4,AL
19 | 4,FL
20 | 4,GA
21 | 4,KY
22 | 4,MS
23 | 4,NC
24 | 4,SC
25 | 4,TN
26 | 5,IL
27 | 5,IN
28 | 5,MI
29 | 5,MN
30 | 5,OH
31 | 5,WI
32 | 6,AR
33 | 6,LA
34 | 6,NM
35 | 6,OK
36 | 6,TX
37 | 7,IA
38 | 7,KS
39 | 7,MO
40 | 7,NE
41 | 8,CO
42 | 8,MT
43 | 8,ND
44 | 8,SD
45 | 8,UT
46 | 8,WY
47 | 9,AZ
48 | 9,CA
49 | 9,HI
50 | 9,NV
51 | 9,AS
52 | 9,MP
53 | 9,FM
54 | 9,GU
55 | 9,MH
56 | 9,PW
57 | 10,AK
58 | 10,ID
59 | 10,OR
60 | 10,WA
61 |
--------------------------------------------------------------------------------
/renv/.gitignore:
--------------------------------------------------------------------------------
1 | library/
2 | local/
3 | cellar/
4 | lock/
5 | python/
6 | staging/
7 |
--------------------------------------------------------------------------------
/renv/settings.dcf:
--------------------------------------------------------------------------------
1 | bioconductor.version:
2 | external.libraries:
3 | ignored.packages:
4 | package.dependency.fields: Imports, Depends, LinkingTo
5 | r.version:
6 | snapshot.type: implicit
7 | use.cache: TRUE
8 | vcs.ignore.cellar: TRUE
9 | vcs.ignore.library: TRUE
10 | vcs.ignore.local: TRUE
11 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.22.2
2 | pandas==1.4.1
3 | python-dotenv==0.19.2
4 | openpyxl==3.0.9
5 | sqlalchemy==1.4.31
6 | psycopg2==2.9.3
7 | geoalchemy2==0.6.3
8 | tabulate==0.8.9
9 |
10 | # Optional
11 | ipykernel==6.9.0
12 |
--------------------------------------------------------------------------------
/src/analysis/README.md:
--------------------------------------------------------------------------------
1 | # Sandbox
2 |
3 | The sandbox houses EDA, sanity checks, feature engineering experiments, and other ad hoc analysis that should remain separate from the pipeline in `/src`.
--------------------------------------------------------------------------------
/src/analysis/sandbox/eda/explore_wsb_sdwis.py:
--------------------------------------------------------------------------------
1 | """
2 | This code explores the relationship between the state WSB data and the SDWIS data.
3 |
4 | Updated 3/21/22
5 |
6 | Make a dataframe that:
7 | - compares percentages of pwsid matching between WSB and SDWIS data
8 | - displays pwsid duplicate counts for states with staged WSB data
9 | """
10 |
11 | #%%
12 |
13 | import geopandas as gpd
14 | import pandas as pd
15 | import os
16 | from dotenv import load_dotenv
17 | import re
18 |
19 |
20 | # File path and data import
21 | load_dotenv()
22 |
23 | staging_path = os.environ["WSB_STAGING_PATH"]
24 |
25 | # Helper: Divides and returns a percent
26 |
27 | def get_pc(num, denom):
28 | return round((num/denom)*100, 1)
29 |
30 | #%% get list of paths/filenames for staged state wsb data
31 | staging_file_list = [file for file in os.listdir(staging_path) if re.search(r"wsb_labeled_\w\w\.gpkg", file)]
32 | num_states = len(staging_file_list)
33 |
34 | #%% read in sdwis data
35 | sdwis = pd.read_csv(os.path.join(staging_path, 'sdwis_water_system.csv'))
36 |
37 | # filter for active community water systems (reduces row count by ~90%)
38 | sdwis = sdwis[(sdwis['pws_activity_code'] == 'A') &
39 | (sdwis['pws_type_code'] == 'CWS')]
40 |
41 | #%% compare wsb staging data with sdwis
42 | nested_list = []
43 |
44 | for i, staging_file in enumerate(staging_file_list):
45 | print(f'\rComparing WSB and SDWIS data for state {i+1}/{num_states}...', end='')
46 |
47 | # read in staged state wsb data
48 | # select state from sdwis data
49 | state_wsb = gpd.read_file(os.path.join(staging_path, staging_file))
50 | state = staging_file[:2].upper()
51 | state_sdwis = sdwis[sdwis['primacy_agency_code'] == state]
52 |
53 | # df id columns
54 | id_wsb = state_wsb['pwsid']
55 | id_sdwis = state_sdwis['pwsid']
56 |
57 | # df lengths
58 | len_wsb = len(state_wsb)
59 | len_sdwis = len(state_sdwis)
60 |
61 | # wsb id % matching to sdwis id
62 | wsb_matching_to_sdwis = len(state_wsb[state_wsb['pwsid'].isin(id_sdwis)])
63 |
64 | # sdwis id % matching to wsb id
65 | sdwis_matching_to_wsb = len(state_sdwis[state_sdwis['pwsid'].isin(id_wsb)])
66 |
67 | nested_list.append([state,
68 | get_pc(wsb_matching_to_sdwis, len_wsb),
69 | get_pc(sdwis_matching_to_wsb, len_sdwis),
70 | get_pc(len_wsb, len_sdwis),
71 | len(id_wsb) - len(set(id_wsb)),
72 | len(id_sdwis) - len(set(id_sdwis))])
73 |
74 | print('done.')
75 |
76 | wsb_sdwis_matches = pd.DataFrame(nested_list,
77 | columns=['state',
78 | '% WSB IDs \nin SDWIS',
79 | '% SDWIS IDs \nin WSB',
80 | 'WSB % size \nof SDWIS',
81 | 'WSB dup IDs', 'SDWIS dup IDs'])
82 |
83 | #%% print table
84 |
85 | print(wsb_sdwis_matches.to_markdown(tablefmt='pretty'))
86 |
87 | # %%
88 |
--------------------------------------------------------------------------------
/src/analysis/sandbox/eda/multipolygon_pwsids_in_labeled_data.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "multipolygon pwsid in labeled geometries"
3 | output: html_document
4 | ---
5 |
6 | _Rich Pauloo_
7 | _Last updated `r Sys.time()`_
8 |
9 | ```{r setup, include=FALSE}
10 | knitr::opts_chunk$set(echo = TRUE,
11 | out.width = "100%",
12 | message = FALSE,
13 | error = FALSE,
14 | warning = FALSE)
15 | ```
16 |
17 | There are duplicate pwsid in labeled data, and these polygons are adjacent. Thus they should be joined in transformers.
18 |
19 | For instance:
20 |
21 | ```{r}
22 | library(tidyverse)
23 | library(sf)
24 | library(fs)
25 | library(mapview)
26 |
27 | # mapview option for render
28 | mapviewOptions(fgb = FALSE)
29 |
30 | # data input location for modeling is the post-transformer staging path
31 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
32 |
33 | # read labeled data and ignore NA pwsid
34 | wsb_labeled <- st_read(path(staging_path, "wsb_labeled.geojson")) %>%
35 | filter(!is.na(pwsid))
36 |
37 | # multipolygon count (mc) data frame
38 | mc <- st_drop_geometry(wsb_labeled) %>%
39 | count(pwsid, sort = TRUE) %>%
40 | filter(n > 1)
41 |
42 | mc
43 |
44 | # multipolygon ids (mid) and data (md), remove 3 NA pwsid
45 | mid <- mc$pwsid
46 | md <- filter(wsb_labeled, pwsid %in% mid) %>%
47 | filter(!is.na(pwsid))
48 |
49 | # plot dupes for visual inspection
50 | mapview(md, zcol = "pwsid")
51 | ```
52 |
53 |
54 | Visual inspection of duplicate pwsid indicates they are few in number (`r nrow(md)` rows and `r length(unique(md$pwsid))` unique values) and typically spatially adjacent, though in some cases they can be separated by considerable distance. This issue is addressed in the pre-modeling transformer, currently in `src/analysis/sandbox/model_explore/01_preprocess.R`. Eventually, this preprocess program will be moved to `src/predict`, and the code that accomplishes the cleaning (pending consideration) may be moved to a transformer helper function. The cleaning currently implemented (sketched in code below):
55 |
56 | 1. unions duplicate pwsid geometries
57 | 2. groups by pwsid and sums area
58 | 3. recalculates radius from these areas (no convex hull, which would inflate radii for multipolygon systems with considerable space between polygons)
59 | 4. recalculates centroids x and y (as before, these are suspect and less meaningful for non-adjacent multipolygon systems)
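
(The cleaning step itself is implemented in R; purely as an illustration of steps 1 through 4, here is an equivalent geopandas sketch with hypothetical file paths and an assumed equal-area radius, r = sqrt(A/pi).)

```python
import numpy as np
import geopandas as gpd

# Steps 1-4 above: dissolve duplicate pwsid rows into one multipolygon,
# sum areas, then recompute an equal-area radius and centroid per system.
wsb = gpd.read_file("staging/wsb_labeled.geojson")             # hypothetical path
wsb = wsb[wsb["pwsid"].notna()].to_crs(epsg=5070)              # equal-area CRS

dissolved = wsb.dissolve(by="pwsid")                           # 1. union geometries
dissolved["area_m2"] = dissolved.area                          # 2. summed area
dissolved["radius_m"] = np.sqrt(dissolved["area_m2"] / np.pi)  # 3. r = sqrt(A / pi)
dissolved["centroid_x"] = dissolved.centroid.x                 # 4. new centroids
dissolved["centroid_y"] = dissolved.centroid.y
```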
60 |
--------------------------------------------------------------------------------
/src/analysis/sandbox/eda/wholesalers.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Wholesaler EDA"
3 | output:
4 | html_document:
5 | highlight: zenburn
6 | code_folding: hide
7 | ---
8 |
9 | ```{r setup, include=FALSE}
10 | knitr::opts_chunk$set(warning = FALSE, message = FALSE, out.width = "100%")
11 | ```
12 |
13 | **Wholesalers**: to keep or not to keep?
14 |
15 | Labeled data seems to indicate that these should be kept, as there are clear urban areas covered by wholesalers without internal water system boundaries.
16 |
17 | ```{r}
18 | library(tidyverse)
19 | library(sf)
20 | library(fs)
21 | library(mapview)
22 |
23 | mapviewOptions(fgb = FALSE)
24 |
25 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
26 |
27 | # cols to keep from sdwis data
28 | cols_keep <- c("pwsid", "is_wholesaler_ind",
29 | "primacy_type", "primary_source_code")
30 |
31 | # read sdwis data and only keep the specified columns
32 | sdwis <- path(staging_path, "sdwis_water_system.csv") %>%
33 | read_csv(col_select = all_of(cols_keep))
34 |
35 | # clean labeled wsb
36 | wsb_labeled_clean <- st_read(path(staging_path, "wsb_labeled_clean.gpkg"))
37 |
38 | # plot
39 | wsb_labeled_clean %>%
40 |   left_join(sdwis, by = "pwsid") %>%
41 | mapview(zcol = "is_wholesaler_ind")
42 | ```
43 |
--------------------------------------------------------------------------------
/src/analysis/sandbox/matching/stats.py:
--------------------------------------------------------------------------------
1 | # Let's use the labeled data to check some hypotheses.
2 |
3 | #%%
4 |
5 | import os
6 | import pandas as pd
7 | import geopandas as gpd
8 | import sqlalchemy as sa
9 | from dotenv import load_dotenv
10 |
11 | load_dotenv()
12 |
13 | DATA_PATH = os.environ["WSB_STAGING_PATH"] + "/../outputs"
14 | EPSG = os.environ["WSB_EPSG"]
15 |
16 | # Connect to local PostGIS instance
17 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
18 |
19 | PROJ = os.environ["WSB_EPSG_AW"]
20 |
21 | #%%
22 | # Load up the data sources
23 |
24 | supermodel = gpd.GeoDataFrame.from_postgis(
25 | "SELECT * FROM pws_contributors WHERE source_system NOT IN ('ucmr');",
26 | conn, geom_col="geometry")
27 |
28 | candidates = supermodel[supermodel["source_system"].isin(["tiger", "mhp"])].set_index("contributor_id")
29 | labeled = supermodel[supermodel["source_system"] == "labeled"]
30 |
31 | matches = pd.read_sql("SELECT * FROM matches;", conn)
32 |
33 |
34 | candidates = candidates.to_crs(PROJ)
35 | labeled = labeled.to_crs(PROJ)
36 |
37 |
38 | # Q: Which match type leads to the best results?
39 | # Q: Are MHP matches good?
40 | # Q: Are MHP points better than ECHO points?
41 | # Q: Which centroid_quality values result in good vs. bad spatial matches? Perhaps there are some we could exclude.
42 |
43 | #%%
44 |
45 | # Q: Which match type leads to the best results?
46 |
47 | # I need to get the labeled polygon in one series and the TIGER polygons + match types in another series
48 | # Then join them on PWSID and find the distance between polygons
49 | # Then score the match rules: If distance is 0 it gets a point, otherwise not
50 | # Assign a percentage correctness
51 |
52 | s1 = gpd.GeoSeries(
53 | labeled[["pwsid", "geometry"]]
54 | .loc[labeled["master_key"].isin(matches["master_key"])]
55 | .set_index("pwsid")
56 | ["geometry"])
57 |
58 | # TIGER and MHP candidates (note that this index will not be unique)
59 | candidate_matches = gpd.GeoDataFrame(matches
60 | .join(candidates[["source_system", "geometry"]], on="candidate_contributor_id")
61 | .rename(columns={"master_key": "pwsid"})
62 | .set_index("pwsid")
63 | [["geometry", "match_rule", "source_system"]])
64 |
65 | # Filter to only the PWS's that appear in both series
66 | # 7,423 match
67 |
68 | s1 = s1.loc[s1.index.isin(candidate_matches.index)]
69 | candidate_matches = candidate_matches.loc[candidate_matches.index.isin(s1.index)]
70 |
71 |
72 | # This gives a couple warnings, but they're OK
73 | # "Indexes are different" - this is because tiger_matches has duplicated indices (multiple matches to the same PWS)
74 | # "Geometry is in a geographic CRS" - Projected CRS's will give more accurate distance results, but it's fine for our purposes.
75 | distances = s1.distance(candidate_matches, align=True)
76 |
77 | # Not sure what causes NA; keep only the non-NA distances
78 | distances = distances[distances.notna()]
79 | distances.name = "distance"
80 |
81 | # re-join to the match table
82 | candidate_matches = candidate_matches.join(distances, on="pwsid", how="inner")
83 |
84 | # Assign a score
85 | PROXIMITY_BUFFER = 1000
86 | candidate_matches["score"] = candidate_matches["distance"] < PROXIMITY_BUFFER
87 |
88 |
89 | #%%
90 | # How did our match rules (and combos of rules) perform for TIGER?
91 | (candidate_matches
92 | .loc[candidate_matches["source_system"] == "tiger"]
93 | .groupby(["match_rule", "source_system"])
94 | .agg(
95 | points = ("score", "sum"),
96 | total = ("score", "size")
97 | ) #type:ignore
98 | .eval("score = points / total")
99 | .sort_values("score", ascending=False))
100 |
101 | # This suggests that our MHP matching is pretty bad.
102 | # However, this only includes MHP's that matched to labeled bounds. And labeled bounds are likely municipalities / other big water systems, not MHP's.
103 | # So perhaps we're filtering to only the bad matches?
104 |
105 |
106 | #%%
107 | candidate_matches
108 |
109 | #%%
110 | distances
--------------------------------------------------------------------------------
/src/analysis/sandbox/model_explore/.gitignore:
--------------------------------------------------------------------------------
1 | /model_march_files
--------------------------------------------------------------------------------
/src/analysis/sandbox/model_explore/02_random_forest.R:
--------------------------------------------------------------------------------
1 | # fit a random forest -----------------------------------------------------
2 |
3 | library(tidyverse)
4 | library(tidymodels)
5 | library(sf)
6 | library(fs)
7 | library(vip)
8 |
9 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
10 |
11 | # read full dataset
12 | d <- read_csv(path(staging_path, "model_input_clean.csv"))
13 |
14 | # unlabeled data (du) and labeled data (dl)
15 | du <- d %>% filter(is.na(radius))
16 | dl <- d %>% filter(!is.na(radius))
17 |
18 | # split labeled data (dl) into train and test with stratified random sampling
19 | # in each of the radius quartiles to account for the lognormal distribution
20 | # of the response variable (radius) and avoid overfitting to small radius obs
21 | set.seed(55)
22 | dl_split <- initial_split(dl, prop = 0.8, strata = radius)
23 | train <- training(dl_split)
24 | test <- testing(dl_split)
25 |
26 | # model and workflow
27 | rf_mod <-
28 | rand_forest(trees = 1000) %>%
29 | set_engine("ranger", importance = "impurity") %>%
30 | set_mode("regression")
31 |
32 | rf_wflow <-
33 | workflow() %>%
34 | add_formula(
35 | radius ~
36 | population_served_count +
37 | # importantly, the RF can have correlated predictors, so we add
38 | # service connections, and don't need to account for interactions
39 | service_connections_count +
40 | # use the cleaned owner type code from preprocess.R, which converts
41 | # 2 "N" owner type codes to "M" so that models can evaluate
42 | owner_type_code_clean +
43 | is_wholesaler_ind +
44 | satc
45 | ) %>%
46 | add_model(rf_mod)
47 |
48 | # fit the random forest model
49 | rf_fit <- fit(rf_wflow, train)
50 |
51 | # show variable importance
52 | rf_fit %>%
53 | extract_fit_parsnip() %>%
54 | vip(geom = "point")
55 |
56 | # predict on test set
57 | rf_test_res <- test %>%
58 | # select(radius) %>%
59 | bind_cols(predict(rf_fit, test))
60 |
61 | # plot residuals
62 | rf_test_res %>%
63 | ggplot(aes(log10(radius), log10(.pred), color = owner_type_code)) +
64 | geom_point(alpha = 0.4) +
65 | geom_abline(lty = 2, color = "red") +
66 | labs(y = "Predicted radius (log10)", x = "Radius (log10)") +
67 | # scale and size the x- and y-axis uniformly
68 | coord_obs_pred()
69 |
70 | # RMSE, R^2, MAE (yardstick needs plain columns, so log-transform first)
71 | rf_metrics <- metric_set(rmse, rsq, mae)
72 | rf_metrics(mutate(rf_test_res, log_radius = log10(radius), log_pred = log10(.pred)), truth = log_radius, estimate = log_pred)
73 |
--------------------------------------------------------------------------------
/src/analysis/sandbox/model_explore/03_xgboost.R:
--------------------------------------------------------------------------------
1 | # fit an xgboost model ---------------------------------------------------
2 |
3 | library(tidyverse)
4 | library(tidymodels)
5 | library(sf)
6 | library(fs)
7 | library(vip)
8 | library(here)
9 |
10 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
11 |
12 | # read full dataset
13 | d <- read_csv(path(staging_path, "model_input_clean.csv"))
14 |
15 | # unlabeled data (du) and labeled data (dl)
16 | du <- d %>% filter(is.na(radius))
17 | dl <- d %>% filter(!is.na(radius))
18 |
19 | # split labeled data (dl) into train and test with stratified random sampling
20 | # in each of the radius quartiles to account for the lognormal distribution
21 | # of the response variable (radius) and avoid overfitting to small radius obs
22 | set.seed(55)
23 | dl_split <- initial_split(dl, prop = 0.8, strata = radius)
24 | train <- training(dl_split)
25 | test <- testing(dl_split)
26 |
27 | # model and workflow
28 | xgb_mod <-
29 | boost_tree(
30 | trees = 1000,
31 | tree_depth = tune(),
32 | min_n = tune(),
33 | # loss_reduction = tune(),
34 | # sample_size = tune(),
35 | # mtry = tune(),
36 | learn_rate = tune()
37 | ) %>%
38 | set_engine("xgboost") %>%
39 | set_mode("regression")
40 |
41 | # hyperparameter space
42 | xgb_grid <- grid_latin_hypercube(
43 | tree_depth(),
44 | min_n(),
45 | # loss_reduction(),
46 | # sample_size = sample_prop(),
47 | # finalize(mtry(), train),
48 | learn_rate(),
49 | size = 30
50 | )
51 |
52 | xgb_wflow <-
53 | workflow() %>%
54 | add_formula(
55 | radius ~
56 | population_served_count +
57 |       # importantly, boosted trees (like the RF) can have correlated predictors, so we add
58 | # service connections, and don't need to account for interactions
59 | service_connections_count +
60 | # use the cleaned owner type code from preprocess.R, which converts
61 | # 2 "N" owner type codes to "M" so that models can evaluate
62 | owner_type_code_clean +
63 | is_wholesaler_ind +
64 | satc
65 | ) %>%
66 | add_model(xgb_mod)
67 |
68 | # CV
69 | set.seed(123)
70 | xgb_folds <- vfold_cv(train, strata = radius)
71 |
72 | # tune the model
73 | doParallel::registerDoParallel()
74 |
75 | set.seed(234)
76 | xgb_res <- tune_grid(
77 | xgb_wflow,
78 | resamples = xgb_folds,
79 | grid = xgb_grid,
80 | control = control_grid(save_pred = TRUE)
81 | )
82 |
83 | # save for use in report
84 | write_rds(xgb_res, here("src/analysis/sandbox/model_explore/etc/xgb_res.rds"))
85 |
86 | # visualize model performance across tuning grid
87 | xgb_res %>%
88 | collect_metrics() %>%
89 | filter(.metric == "rsq") %>%
90 | select(mean, min_n:learn_rate) %>%
91 | pivot_longer(min_n:learn_rate,
92 | values_to = "value",
93 | names_to = "parameter"
94 | ) %>%
95 | ggplot(aes(value, mean, color = parameter)) +
96 | geom_point(alpha = 0.8, show.legend = FALSE) +
97 | facet_wrap(~parameter, scales = "free_x") +
98 | labs(x = NULL, y = "rsq")
99 |
100 | show_best(xgb_res, "rsq")
101 |
102 | # select best model
103 | final_xgb <- finalize_workflow(
104 | xgb_wflow, select_best(xgb_res, "rsq")
105 | )
106 |
107 | final_xgb
108 |
109 | # save for later use in report
110 | write_rds(final_xgb, here("src/analysis/sandbox/model_explore/etc/final_xgb.rds"))
111 |
112 | # fit the final xgboost model on training data
113 | xgb_fit <- fit(final_xgb, train)
114 |
115 | # show variable importance
116 | xgb_fit %>%
117 | extract_fit_parsnip() %>%
118 | vip(geom = "point")
119 |
120 | # predict on test set
121 | xgb_test_res <- test %>%
122 | select(radius) %>%
123 | bind_cols(predict(xgb_fit, test))
124 |
125 | # plot residuals
126 | xgb_test_res %>%
127 | ggplot(aes(log10(radius), log10(.pred))) +
128 | geom_point(alpha = 0.4) +
129 | geom_abline(lty = 2, color = "red") +
130 | labs(y = "Predicted radius (log10)", x = "Radius (log10)") +
131 | # scale and size the x- and y-axis uniformly
132 | coord_obs_pred()
133 |
134 | # RMSE, R^2, MAE (yardstick needs plain columns, so log-transform first)
135 | xgb_metrics <- metric_set(rmse, rsq, mae)
136 | xgb_metrics(mutate(xgb_test_res, log_radius = log10(radius), log_pred = log10(.pred)), truth = log_radius, estimate = log_pred)
137 |
--------------------------------------------------------------------------------
/src/analysis/sandbox/model_explore/README.md:
--------------------------------------------------------------------------------
1 | # "model_explore" Little Sandbox
2 |
3 | This little sandbox houses model exploration scripts used to prototype the final code in `src/model`, as well as the March 2022 report summarizing construction of the TEMM data layer.
4 |
5 | ## Table of contents
6 |
7 | * `model_march.Rmd` summarizes construction of the TEMM data layer and uses flat files in `/etc` to render.
8 | * `02_random_forest.R` fits the random forest model.
9 | * `03_xgboost.R` fits the xgboost model.
10 |
11 | * `/archive` has two scripts:
12 | - `01_preprocess.R` -> migrated to and superseded by `src/model/01_preprocess.R`
13 | - `04_linear.R` -> migrated to and superseded by `src/model/02_linear.R`
14 |
--------------------------------------------------------------------------------
/src/analysis/sandbox/model_explore/archive/01_preprocess.R:
--------------------------------------------------------------------------------
1 | # preprocess data for model -----------------------------------------------
2 |
3 | library(tidyverse)
4 | library(sf)
5 | library(fs)
6 |
7 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
8 |
9 | # this is the critical service connection count below which (inclusive) we
10 | # assume that the value is nonsensical, and impute it based on population.
11 | # We also assume that population counts less than n_max are unreasonable,
12 | # and only work with populations >= 15 (at least one per service connection)
13 | n_max <- 15
14 | cat("Preparing to mean impute service connection count",
15 |     "for all values below", n_max, ".\n")
16 |
17 | # j stands for joined data, read and rm rownumber column, then drop
18 | # observations without a centroid or with nonsensical service connections
19 | j <- read_csv(path(staging_path, "matched_output.csv")) %>%
20 | filter(!is.na(centroid_lat) | !is.na(centroid_lon)) %>%
21 | # filter out systems with < n_max population count - 243 (0.5%),
22 | filter(population_served_count > n_max)
23 | cat("Read", nrow(j), "matched outputs with >=",
24 | n_max, "connection & population count.\n")
25 |
26 |
27 | # mean impute service connections == 0 with linear model ------------------
28 |
29 | # A 2022-03-08 meeting with IoW/BC/EPIC recommended filtering out
30 | # wholesalers and water systems with a zero population count. However,
31 | # many water systems have service connection counts (e.g., between 0 and 10),
32 | # but very high population (e.g., in the hundreds to thousands), wholesalers
33 | # in labeled data are primarily found in WA and TX, and wholesalers
34 | # typically occupy urban areas and do not contain smaller pwsids. Thus, we
35 | # retain all observations and mean impute suspect (between 0 and N) service
36 | # connections.
37 |
38 | # we learned in the Feb 2022 EDA (sandbox/eda/eda_february.Rmd) that
39 | # population served and service connection count had outliers that were
40 | # likely incorrect. Here we highlight "bad" high leverage points
41 | # j %>%
42 | # mutate(
43 | # grp = ifelse(
44 | # population_served_count %in% 0:n_max, "bad", "good"
45 | # )
46 | # ) %>%
47 | # ggplot(aes(service_connections_count, population_served_count)) +
48 | # geom_point(aes(color = grp), alpha = 0.5) +
49 | # geom_smooth(method = "lm")
50 |
51 | # linear model for imputing service connections from population served
52 | # Only train on population served >= n_max (community water systems)
53 | jm <- j %>% filter(service_connections_count >= n_max,
54 | population_served_count >= n_max)
55 |
56 | # simple linear model for imputing service connection count and b1 slope
57 | m <- lm(service_connections_count ~ population_served_count, data = jm)
58 | b1 <- coefficients(m)["population_served_count"]
59 |
60 | # predict, & change the y-intercept to 0 to avoid negative connections
61 | j <- j %>%
62 | mutate(
63 | service_connections_count = ifelse(
64 | service_connections_count < n_max,
65 | ceiling(population_served_count * b1),
66 | service_connections_count)
67 | )
68 | cat("Mean imputed service connection count.\n")
69 |
70 |
71 | # read labeled data with recalculated area, centroid for multipolygon pwsids --
72 |
73 | # read wsb_labeled_clean
74 | wsb_labeled_clean <- st_read(path(staging_path, "wsb_labeled_clean.gpkg"))
75 |
76 | # rm geometry and other unnecessary (for model) cols from clean wsb labels
77 | vars_keep <- c("pwsid", "radius")
78 |
79 | wsb_labeled_clean_df <- wsb_labeled_clean %>%
80 | select(all_of(vars_keep)) %>%
81 | st_drop_geometry()
82 |
83 |
84 | # join clean wsb labeled data to matched output and write -----------------
85 |
86 | # add other data, including SDWIS
87 |
88 | # cols to keep from sdwis data
89 | cols_keep <- c("pwsid", "is_wholesaler_ind",
90 | "primacy_type", "primary_source_code")
91 |
92 | # read sdwis data and only keep the specified columns
93 | sdwis <- path(staging_path, "sdwis_water_system.csv") %>%
94 | read_csv(col_select = all_of(cols_keep))
95 |
96 | # ensure non-duplicate pwsid in SDWIS pre-join
97 | cat("Detected", length(unique(sdwis$pwsid)), "unique pwsids", "and",
98 | nrow(sdwis), "rows in SDWIS. Numbers must equal for safe join.\n")
99 |
100 | # join to matched output, and lose 378/13435 (2.8% of labeled data) that
101 | # are not in combined_output.csv
102 | d <- j %>%
103 | left_join(wsb_labeled_clean_df, by = "pwsid") %>%
104 |   left_join(sdwis, by = "pwsid")
105 | cat("Joined matched output, labeled data, and sdwis data.\n")
106 |
107 | # sanity row count equivalence pre and post join (this is FALSE when, for
108 | # instance, duplicate pwsid are present)
109 | cat("Row count equivalence pre and post-join is", nrow(d) == nrow(j), "\n")
110 |
111 |
112 | # apply cleaning informed by EDA ------------------------------------------
113 |
114 | d <- d %>%
115 | mutate(
116 | # when radius == 0, make it NA
117 | radius = ifelse(radius == 0, NA, radius),
118 | # split type codes in the "python list" into chr vectors
119 | satc = strsplit(service_area_type_code, ", "),
120 | # map over the list to remove brackets ([]) and quotes (')
121 | satc = map(satc, ~str_remove_all(.x, "\\[|\\]|'")),
122 | # sort the resulting chr vector
123 | satc = map(satc, ~sort(.x)),
124 | # collapse the sorted chr vector
125 | satc = map_chr(satc, ~paste(.x, collapse = "")),
126 | # convert the sorted chr vector to factor with reasonable level count
127 | satc = fct_lump_prop(satc, 0.02),
128 | satc = as.character(satc),
129 | satc = ifelse(is.na(satc), "Other", satc),
130 | # convert T/F is_wholesaler_ind to character for dummy var prep
131 | is_wholesaler_ind = ifelse(is_wholesaler_ind == TRUE,
132 | "wholesaler", "not wholesaler"),
133 |       # recode Native American owner types (only 2 present) to public/private (M)
134 | owner_type_code = ifelse(owner_type_code == "N", "M", owner_type_code)
135 | )
136 | cat("Cleaned data according to EDA-generated insights.\n")
137 |
138 | # write for modeling
139 | write_csv(d, path(staging_path, "matched_output_clean.csv"))
140 | cat("Wrote clean preprocessed data for modeling to staging path.\n")
141 |
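
The zero-intercept imputation above is easy to sanity-check outside the pipeline. A rough Python translation with made-up numbers (column names mirror the R script; this is a sketch, not pipeline code):

import numpy as np
import pandas as pd

n_max = 15
df = pd.DataFrame({
    "population_served_count":   [100, 5000, 12000],
    "service_connections_count": [2, 1800, 4000],  # first row is suspect
})

# fit connections ~ population on trustworthy rows and keep the slope (b1)
ok = df["service_connections_count"] >= n_max
b1 = np.polyfit(df.loc[ok, "population_served_count"],
                df.loc[ok, "service_connections_count"], 1)[0]

# impute suspect rows as ceiling(population * b1), i.e. intercept forced to 0
bad = df["service_connections_count"] < n_max
df.loc[bad, "service_connections_count"] = np.ceil(
    df.loc[bad, "population_served_count"] * b1)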
--------------------------------------------------------------------------------
/src/analysis/sandbox/model_explore/archive/04_linear.R:
--------------------------------------------------------------------------------
1 | # linear model ------------------------------------------------------------
2 |
3 | library(tidyverse)
4 | library(tidymodels)
5 | library(sf)
6 | library(fs)
7 |
8 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
9 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
10 |
11 | # read dataset and log transform the response - only for linear model
12 | d <- read_csv(path(staging_path, "matched_output_clean.csv")) %>%
13 | mutate(radius = log10(radius),
14 | # multiply correlated predictors
15 | density = population_served_count * service_connections_count)
16 |
17 | # unlabeled data (du) and labeled data (dl)
18 | du <- d %>% filter(is.na(radius))
19 | dl <- d %>% filter(!is.na(radius))
20 |
21 | # split labeled data (dl) into train and test with stratified random sampling
22 | # in each of the radius quartiles to account for the lognormal distribution
23 | # of the response variable (radius) and avoid overfitting to small radius obs
24 | set.seed(55)
25 | dl_split <- initial_split(dl, prop = 0.8, strata = radius)
26 | train <- training(dl_split)
27 | test <- testing(dl_split)
28 |
29 | # lm recipe
30 | lm_recipe <-
31 | # specify the model - interaction terms come later
32 | recipe(
33 | radius ~
34 | service_connections_count +
35 | owner_type_code +
36 | satc +
37 | is_wholesaler_ind,
38 | data = train
39 | ) %>%
40 | # convert predictors to log10
41 | step_log(service_connections_count, base = 10) %>%
42 | # encode categorical variables
43 | step_dummy(all_nominal_predictors()) %>%
44 | # specify interaction effects
45 | step_interact(~service_connections_count:starts_with("owner_type_code")) %>%
46 | step_interact(~service_connections_count:starts_with("satc")) %>%
47 | step_interact(~service_connections_count:starts_with("is_wholesaler_ind"))
48 |
49 | # specify model and engine for the linear model
50 | lm_mod <- linear_reg() %>% set_engine("lm")
51 |
52 | # lm workflow
53 | lm_wflow <-
54 | workflow() %>%
55 | add_model(lm_mod) %>%
56 | add_recipe(lm_recipe)
57 |
58 | # fit the linear model on the training set
59 | lm_fit <- fit(lm_wflow, train)
60 |
61 | # predict on the test set and bind mean predictions and CIs
62 | lm_test_res <- test %>%
63 | select(radius) %>%
64 | bind_cols(predict(lm_fit, test)) %>%
65 | bind_cols(predict(lm_fit, test, type = "conf_int"))
66 |
67 | # plot predicted vs. observed radius
68 | lm_test_res %>%
69 | ggplot(aes(radius, .pred)) +
70 | geom_point(alpha = 0.4) +
71 | geom_abline(lty = 2, color = "red") +
72 | labs(y = "Predicted radius (log10)", x = "Radius (log10)") +
73 | # scale and size the x- and y-axis uniformly
74 | coord_obs_pred()
75 |
76 | # RMSE
77 | lm_metrics <- metric_set(rmse, rsq, mae)
78 | lm_metrics(lm_test_res, truth = radius, estimate = .pred)
79 |
80 |
81 | # apply modeled radii to centroids for all data and write -----------------
82 |
83 | # read matched output for centroid lat/lng
84 | matched_output_clean <- path(staging_path, "matched_output_clean.csv") %>%
85 | read_csv(col_select = c("pwsid", "centroid_lat", "centroid_lon")) %>%
86 | st_as_sf(coords = c("centroid_lon", "centroid_lat"), crs = epsg)
87 |
88 | # predict on all data with the trained model, apply the spatial buffer, and write
89 | t3m <- d %>%
90 | select(pwsid, radius) %>%
91 | bind_cols(predict(lm_fit, d)) %>%
92 |   bind_cols(predict(lm_fit, d, type = "conf_int", level = 0.90)) %>%
93 | # exponentiate results back to median (unbiased), and 5/95 CIs
94 | mutate(across(where(is.numeric), ~10^(.x))) %>%
95 | # add matched output lat/lng centroids and make spatial
96 | left_join(matched_output_clean, by = "pwsid") %>%
97 | st_as_sf() %>%
98 | # convert to projected metric CRS for accurate, efficient buffer.
99 |   # The project CRS (EPSG:4326) is inappropriate for buffering because its units are degrees.
100 | st_transform(3310)
101 |
102 | # create buffers for median, CI lower, and CI upper (5/95) predictions
103 | # (in meters) and then transform back into the project CRS
104 | t3m_med <- st_buffer(t3m, t3m$.pred ) %>% st_transform(epsg)
105 | t3m_cil <- st_buffer(t3m, t3m$.pred_lower) %>% st_transform(epsg)
106 | t3m_ciu <- st_buffer(t3m, t3m$.pred_upper) %>% st_transform(epsg)
107 |
108 | # paths to write modeled data
109 | path_t3m_med <- path(staging_path, "tier3_median.gpkg")
110 | path_t3m_cil <- path(staging_path, "tier3_ci_lower_05.gpkg")
111 | path_t3m_ciu <- path(staging_path, "tier3_ci_upper_95.gpkg")
112 |
113 | # write and delete layer if it already exists
114 | st_write(t3m_med, path_t3m_med, delete_dsn = TRUE)
115 | st_write(t3m_cil, path_t3m_cil, delete_dsn = TRUE)
116 | st_write(t3m_ciu, path_t3m_ciu, delete_dsn = TRUE)
117 |
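
The back-transform-and-buffer step translates directly to geopandas. A minimal sketch with toy centroids and log10-meter predictions (EPSG:3310 as in the script; not pipeline code):

import geopandas as gpd
from shapely.geometry import Point

pts = gpd.GeoDataFrame(
    {"pred_log10": [2.0, 3.0]},  # predicted log10(radius in meters)
    geometry=[Point(-120.0, 38.0), Point(-121.0, 39.0)],
    crs="EPSG:4326")

proj = pts.to_crs(3310)                                   # metric CRS for buffering
proj["geometry"] = proj.buffer(10 ** proj["pred_log10"])  # 100 m and 1 km circles
circles = proj.to_crs(4326)                               # back to the project CRS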
--------------------------------------------------------------------------------
/src/analysis/sandbox/model_explore/etc/final_xgb.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/src/analysis/sandbox/model_explore/etc/final_xgb.rds
--------------------------------------------------------------------------------
/src/analysis/sandbox/model_explore/etc/xgb_res.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/src/analysis/sandbox/model_explore/etc/xgb_res.rds
--------------------------------------------------------------------------------
/src/analysis/sandbox/sanity_checks/01_convex_hull.R:
--------------------------------------------------------------------------------
1 | # sanity check convex hull error
2 |
3 | library(tidyverse)
4 | library(here)
5 | library(fs)
6 | library(sf)
--------------------------------------------------------------------------------
/src/combine_tiers.py:
--------------------------------------------------------------------------------
1 | #%%
2 |
3 | import os
4 | import pandas as pd
5 | import geopandas as gpd
6 | import sqlalchemy as sa
7 | import match.helpers as helpers
8 | from dotenv import load_dotenv
9 | from shapely.geometry import Polygon
10 |
11 | load_dotenv()
12 |
13 | STAGING_PATH = os.environ["WSB_STAGING_PATH"]
14 | OUTPUT_PATH = os.environ["WSB_OUTPUT_PATH"]
15 | EPSG = os.environ["WSB_EPSG"]
16 |
17 | # Connect to local PostGIS instance
18 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
19 |
20 | #%%
21 | # load geometries for each tier -------------------------------------------
22 |
23 | print("Loading geometries for Tiers 1-3...")
24 |
25 | # Tier 1: LABELED (and CONTRIBUTED) boundaries
26 | t1 = gpd.GeoDataFrame.from_postgis("""
27 | SELECT pwsid, centroid_lat, centroid_lon, centroid_quality, geometry, geometry_source_detail
28 | FROM pws_contributors
29 | WHERE
30 | source_system IN ('labeled', 'contributed') AND
31 | NOT st_isempty(geometry)
32 | ORDER BY source_system, pwsid;""",
33 | conn, geom_col="geometry")
34 |
35 | # If there are duplicates, it's likely because we have a contributed AND a labeled
36 | # boundary. The ORDER BY puts 'contributed' first, so keep="first" retains it.
37 | before_count = len(t1)
38 | t1 = t1.drop_duplicates(subset="pwsid", keep="first")
39 |
40 | if len(t1) < before_count:
41 | print(f"Prioritized {before_count - len(t1)} contributed records over labeled in T1.")
42 |
43 | print("Retrieved Tier 1: Labeled boundaries.")
44 |
45 | # Tier 2: MATCHED boundaries (only the best)
46 | t2 = gpd.GeoDataFrame.from_postgis("""
47 | SELECT
48 | m.master_key AS pwsid,
49 | t.source_system_id AS matched_bound_geoid,
50 | t.name AS matched_bound_name,
51 | t.centroid_lat,
52 | t.centroid_lon,
53 | t.centroid_quality,
54 | t.geometry,
55 | t.geometry_source_detail
56 | FROM matches_ranked m
57 | JOIN pws_contributors t ON m.candidate_contributor_id = t.contributor_id
58 | WHERE
59 | m.best_match AND
60 | t.source_system = 'tiger'""",
61 | conn, geom_col="geometry")
62 |
63 | print("Retrieved Tier 2: Matched boundaries.")
64 |
65 | # Tier 3: MODELED boundaries - use median result geometry but bring in CIs
66 | t3 = (gpd
67 | .read_file(os.path.join(STAGING_PATH, "tier3_median.gpkg"))
68 | [[
69 | "pwsid", ".pred_lower", ".pred", ".pred_upper",
70 | "centroid_lat", "centroid_lon", "centroid_quality",
71 | "geometry", "geometry_source_detail"
72 | ]]
73 | .rename(columns={
74 | ".pred_lower": "pred_05",
75 | ".pred": "pred_50",
76 | ".pred_upper": "pred_95"
77 | })) #type:ignore
78 |
79 | print("Retrieved Tier 3: Modeled boundaries.")
80 |
81 | #%%
82 |
83 | # Assign tier labels
84 | t1["tier"] = 1
85 | t2["tier"] = 2
86 | t3["tier"] = 3
87 |
88 | #%%
89 | # Pull in base attributes from SDWIS ----------------------------------
90 |
91 | # read SDWIS base attributes
92 | print("Reading SDWIS for base attributes...")
93 |
94 | base = pd.read_sql("""
95 | SELECT *
96 | FROM pws_contributors
97 | WHERE source_system = 'sdwis';""", conn)
98 |
99 | base = base.drop(columns=[
100 | "tier", "centroid_lat", "centroid_lon", "centroid_quality",
101 | "geometry", "geometry_source_detail"])
102 |
103 | # Overwrite the contributor_id
104 | base["contributor_id"] = "master." + base["pwsid"]
105 | base["source_system"] = "master"
106 |
107 | #%%
108 | # combine tiers -----------------------------------------------------------
109 |
110 | # Combine geometries from Tiers 1-3
111 | # Where we have duplicates, prefer Tier 1 > 2 > 3
112 | combined = gpd.GeoDataFrame(pd
113 | .concat([t1, t2, t3])
114 | .sort_values(by="tier") #type:ignore
115 | .drop_duplicates(subset="pwsid", keep="first")
116 | [["pwsid", "tier", "centroid_lat", "centroid_lon", "centroid_quality",
117 | "geometry", "geometry_source_detail", "pred_05", "pred_50", "pred_95"]])
118 |
119 | # Join again to get matched boundary info
120 | # we do this to get boundary info for ALL tiers
121 | combined = combined.merge(
122 | t2[["pwsid", "matched_bound_geoid", "matched_bound_name"]], on="pwsid", how="left")
123 |
124 | # Fix data types
125 | combined["matched_bound_geoid"] = combined["matched_bound_geoid"].astype(pd.Int64Dtype())
126 |
127 | # Join to base
128 | temm = gpd.GeoDataFrame(
129 | base.merge(combined, on="pwsid", how="left"),
130 | crs=f"epsg:{EPSG}")
131 |
132 | # Allow NA when we have no geometry
133 | temm["tier"] = temm["tier"].astype(pd.Int64Dtype())
134 |
135 | # Replace empty geometries
136 | temm.loc[temm["geometry"].is_empty | temm["geometry"].isna(), "geometry"] = Polygon([]) #type:ignore
137 |
138 | # Verify - We should have the same number of rows in base and in temm
139 | assert len(temm) == len(base)
140 |
141 | print("Combined a spatial layer using best available tiered data.\n")
142 |
143 | #%%
144 |
145 | # Save to the database
146 | helpers.load_to_postgis("master",
147 | temm.drop(columns=["matched_bound_geoid", "matched_bound_name", "pred_05", "pred_50", "pred_95"]))
148 |
149 | #%%
150 | # Export
151 |
152 | # The file outputs have a subset of columns
153 | columns = [
154 | "pwsid", "name", "primacy_agency_code", "state", "city_served",
155 | "county", "population_served_count", "service_connections_count",
156 | "service_area_type_code", "owner_type_code",
157 | "is_wholesaler_ind", "primacy_type",
158 | "primary_source_code", "tier",
159 | "centroid_lat", "centroid_lon", "centroid_quality",
160 | "geometry", "geometry_source_detail", "pred_05", "pred_50", "pred_95"]
161 |
162 | # Backwards compatibility
163 | output = (temm[columns]
164 | .rename(columns={
165 | "name": "pws_name",
166 | "state": "state_code",
167 | "county": "county_served"
168 | }))
169 |
170 | #%%
171 | # paths to write
172 | path_geopkg = os.path.join(OUTPUT_PATH, "temm.gpkg")
173 | output.to_file(path_geopkg, driver="GPKG")
174 |
175 | print("Wrote data to geopackage.\n")
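
The tier-priority logic above reduces to a sort plus a drop_duplicates; a toy example with made-up rows:

import pandas as pd

rows = pd.DataFrame({
    "pwsid": ["A", "A", "B"],
    "tier":  [3, 1, 2],
})
best = rows.sort_values("tier").drop_duplicates(subset="pwsid", keep="first")
# pwsid A keeps its Tier 1 row; B keeps its only (Tier 2) row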
--------------------------------------------------------------------------------
/src/downloaders/download_contributed_pws.R:
--------------------------------------------------------------------------------
1 | # Download contributed public water system boundaries ---------------------
2 | library(fs)
3 |
4 | # path to save raw data
5 | data_path <- Sys.getenv("WSB_DATA_PATH")
6 |
7 | # Allow for longer timeout for download file
8 | options(timeout = 10000)
9 |
10 | # Data Source: GitHub repo managed by CGS/IOW, where final, accepted
11 | # contributor-submitted public water systems are added to SL's base map layer
12 |
13 | contributed_pws_url <- paste0("https://github.com/cgs-earth/ref_pws/raw/main/02_output/",
14 | "contributed_pws.gpkg")
15 |
16 | # create dir to store the downloaded file
17 | dir_create(path(data_path, "contributed_pws"))
18 |
19 | # local path to download files
20 | file_contributed_pws <- path(data_path, "contributed_pws/contributed_pws.gpkg")
21 |
22 | download.file(contributed_pws_url, file_contributed_pws, mode="wb")
23 |
24 | cat("Downloaded contributed PWS data.\n")
25 |
--------------------------------------------------------------------------------
/src/downloaders/download_echo.R:
--------------------------------------------------------------------------------
1 | # download ECHO admin data -----------------------------------------------
2 |
3 | library(glue)
4 | library(fs)
5 |
6 | # path to save raw data
7 | data_path <- Sys.getenv("WSB_DATA_PATH")
8 |
9 | # Allow for longer timeout for download file
10 | options(timeout = 10000)
11 |
12 | # Data Source: EPA's ECHO Exporter ZIP (the SDWA zip may also be useful)
13 | echo_url <- paste0("https://echo.epa.gov/files/echodownloads/",
14 | "echo_exporter.zip")
15 | # create dir to store file, download, and un-zip
16 | dir_create(path(data_path, "echo"))
17 |
18 | # local path to download files
19 | file_echo <- path(data_path, "echo/echo_exporter.zip")
20 |
21 | download.file(echo_url, file_echo)
22 |
23 | unzip(file_echo, exdir = path(data_path, "echo"))
24 |
25 | cat("Downloaded and unzipped ECHO data.\n")
26 |
--------------------------------------------------------------------------------
/src/downloaders/download_frs.R:
--------------------------------------------------------------------------------
1 | # download FRS centroids --------------------------------------------------
2 |
3 | library(fs)
4 |
5 | # path to save raw data
6 | data_path <- Sys.getenv("WSB_DATA_PATH")
7 |
8 | # Allow for longer timeout for download file
9 | options(timeout = 10000)
10 |
11 | # Data Source: EPA's Facility Registry Service (FRS)
12 | frs_url <- paste0("https://edg.epa.gov/data/public/OEI/FRS/",
13 | "FRS_Interests_Download.zip")
14 |
15 | # create dir to store file, download, and un-zip
16 | dir_create(path(data_path, "frs"))
17 | download.file(frs_url, path(data_path, "frs/frs.zip"))
18 | unzip(path(data_path, "frs/frs.zip"), exdir = path(data_path, "frs"))
19 | cat("Downloaded and unzipped FRS data.\n")
20 |
--------------------------------------------------------------------------------
/src/downloaders/download_helpers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Tue Feb 1 11:04:42 2022
5 |
6 | @author: nb, jjg
7 | """
8 |
9 | import os
10 | import pandas as pd
11 | import glob
12 |
13 |
14 |
15 | def create_dir(path, dir):
16 |
17 | """
18 | A function that creates a directory for downloading data.
19 |
20 | Inputs:
21 | -path: file path relative to root project path
22 | -dir: name of directory
23 |
24 | Output: a folder for data downloads.
25 | """
26 |
27 | dir_path = os.path.join(path, dir)
28 |
29 | if os.path.exists(dir_path):
30 | print(f'Directory {dir} exists.')
31 | else:
32 | os.mkdir(dir_path)
33 | print(f'Created directory {dir}.')
34 |
35 | return dir_path
36 |
37 |
38 |
39 | def get_row_count(directory, file):
40 |
41 | """
42 |     A function that counts the rows in a given csv.
43 |
44 | Inputs:
45 | -directory: directory file path relative to root path
46 | -file: name of file
47 |
48 | Output: row count of input file
49 |
50 | """
51 | path = os.path.join(directory, file)
52 | with open(path) as f:
53 | row_count = sum(1 for line in f)
54 | return row_count
55 |
56 |
57 | def write_aria_download_txt(download_txt_name, path, base_filename, table_filter=None,
58 | step_size=10000, count_cur=0, count_end=200):
59 | """
60 | Write aria download text file for base_filename in path. Default file step size
61 | (size of each partial download) is 10000 rows. Downloads up to count_end (default 200) files.
62 | Tables with more than 2MM rows require adjustments to step size and count_end.
63 |
64 | Inputs:
65 | -download_txt_name: name of download text file name supplied in download_with_aria()
66 | -path: folder directory for a given download (e.g. data/sdwis)
67 |     -base_filename: filename within download folder (e.g. WATER_SYSTEM)
68 | -table_filter: optional filter to SDWIS tables, e.g. filter by state code
69 | -step_size: step size of each partial download; default is 10000 rows
70 | -count_end: last step in download (default is 200 files)
71 |
72 | Note: EPA Download is inclusive. If URL includes 'ROWS/0:2',
73 | it downloads three rows (indices 0, 1, 2).
74 | """
75 |
76 | if table_filter:
77 | base_url = f'https://data.epa.gov/efservice/{base_filename}/{table_filter}/ROWS'
78 | else:
79 | base_url = f'https://data.epa.gov/efservice/{base_filename}/ROWS'
80 |
81 | urls_txt_path = os.path.join(path, download_txt_name)
82 |
83 | with open(urls_txt_path, 'w') as f:
84 | while count_cur < count_end:
85 |
86 | row_start = count_cur * step_size
87 | row_end = row_start + step_size - 1
88 | rows = f'{str(row_start)}:{str(row_end)}'
89 |
90 | url = f"{base_url}/{rows}/csv"
91 | f.write(url + '\n')
92 |
93 | filename = f'{base_filename}_{count_cur}.csv'
94 | f.write(f' out={filename}' + '\n')
95 |
96 | count_cur += 1
97 |
98 | return urls_txt_path
99 |
100 |
101 | def download_with_aria(data_path, filename, table_filter=None, count_end=200):
102 |
103 | """
104 | Create text file based on filename and base url to direct downloader.
105 | Download url files using aria download text file for base_filename in path.
106 |
107 | Inputs:
108 | -data_path: directory file path relative to root path where downloads happen
109 |     -filename: name of the table/file to download
110 | -table_filter: optional filter to SDWIS tables, e.g. filter by state code
111 |
112 | Outputs: a folder of csv files in increments of step_size rows.
113 |
114 | Note: setting --auto-file-renaming=false prevents data from being appended to existing
115 | downloads; a new download requires manual deletion of the previous downloads.
116 | """
117 |
118 | # Create subdirectory
119 | dir_path = create_dir(data_path,filename)
120 |
121 | # Make text file of chunked aria urls and filenames
122 | aria_download_filename = f'aria_download_{filename}.txt'
123 |
124 |     urls_txt_path = write_aria_download_txt(aria_download_filename, dir_path, filename, table_filter, count_end=count_end)
125 |
126 | # Download with aria
127 | os.system(f'aria2c --input-file={urls_txt_path} --dir={dir_path} --auto-file-renaming=false')
128 |
129 |
130 | def stitch_files(filename, data_path):
131 |
132 | """
133 | Create single csv file based on a folder of downloaded csvs.
134 |
135 | Inputs:
136 | -data_path: directory file path relative to root path where downloads happen
137 | -filename: name of file
138 |
139 | Outputs: a single csv file in the root project directory for use in transformers.
140 | """
141 |
142 | extension = 'csv'
143 | csv_file_path = os.path.join(data_path, filename)
144 | os.chdir(csv_file_path)
145 |
146 | all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
147 |
148 | #combine all files in the list
149 | combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
150 |
151 | #export to csv
152 | combined_csv.to_csv(f"../{filename}.csv", index=False, encoding='utf-8-sig')
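
For reference, with the default step_size of 10000 the aria2 input file that write_aria_download_txt produces for WATER_SYSTEM begins like this (the out= lines carry one leading space, matching the aria2 input-file format):

https://data.epa.gov/efservice/WATER_SYSTEM/ROWS/0:9999/csv
 out=WATER_SYSTEM_0.csv
https://data.epa.gov/efservice/WATER_SYSTEM/ROWS/10000:19999/csv
 out=WATER_SYSTEM_1.csv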
153 |
--------------------------------------------------------------------------------
/src/downloaders/download_mhp.R:
--------------------------------------------------------------------------------
1 | # Download mobile home parks point data -----------------------------------
2 |
3 | library(fs)
4 |
5 | # path to save raw data
6 | data_path <- Sys.getenv("WSB_DATA_PATH")
7 |
8 | # Allow for longer timeout for download file
9 | options(timeout = 10000)
10 |
11 | # Data Source: MHP ArcGIS geojson water system boundary
12 | mhp_url <- paste0("https://opendata.arcgis.com/datasets/",
13 | "4cdbccc5c538452aa91ceee277c460f9_0.geojson")
14 |
15 | # create dir to store file and download
16 | dir_create(path(data_path, "mhp"))
17 | download.file(mhp_url, path(data_path, "mhp/mhp.geojson"))
18 | cat("Downloaded mobile home park point data.\n")
19 |
--------------------------------------------------------------------------------
/src/downloaders/download_sdwis.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Tue Feb 1 11:06:58 2022
5 |
6 | @author: nb, jjg
7 | """
8 |
9 |
10 | # Libraries
11 | import os, sys
12 | sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
13 | from downloaders.download_helpers import create_dir, get_row_count
14 | from downloaders.download_helpers import download_with_aria, stitch_files
15 | from dotenv import load_dotenv
16 |
17 | load_dotenv()
18 |
19 | #%%
20 | # Create file directory
21 | data_path = os.environ["WSB_DATA_PATH"]
22 | directory = 'sdwis'
23 |
24 | # Set output directory
25 | create_dir(data_path, directory)
26 |
27 | sdwis_data_path = os.path.join(data_path, "sdwis")
28 |
29 | #%% Download smaller files to sdwis directory
30 |
31 | # SERVICE_AREA, GEOGRAPHIC_AREA
32 |
33 | filenames = ['SERVICE_AREA', 'GEOGRAPHIC_AREA']
34 |
35 |
36 | for filename in filenames:
37 |     if os.path.exists(os.path.join(sdwis_data_path, filename + ".csv")):
38 |         print(f"{filename}.csv exists, skipping download.")
39 |
40 |
41 | else:
42 | print(f'Downloading {filename}')
43 |
44 | base_url = f'https://data.epa.gov/efservice/{filename}/ROWS/0:100000000/csv'
45 |
46 | os.system(f'aria2c --out={filename}.csv --dir={sdwis_data_path} {base_url} --auto-file-renaming=false')
47 |
48 | # Print row count
49 | row_count = get_row_count(sdwis_data_path, f'{filename}.csv')
50 | print(f'Row count of {filename}.csv: {row_count}')
51 |
52 |
53 | #%% Download larger files
54 | # While the smaller files above download without timing out, SDWIS has a 10K query limit
55 | # on tables; the following approach could be used for the above tables as well, but is
56 | # currently limited to the larger of the 4 files to avoid timeouts
57 |
58 | # Current working assumption is that there are no more than 2MM rows for
59 | # water_system and water_system_facility; this could theoretically change over time
60 | # and the analyst would need to adjust the default value
61 |
62 |
63 | #%% Download WATER_SYSTEM
64 | filename = 'WATER_SYSTEM'
65 |
66 |
67 | if os.path.exists(os.path.join(sdwis_data_path, filename + "/")):
68 |     print(f"{filename} folder exists, skipping download.")
69 |
70 |
71 | else:
72 | download_with_aria(sdwis_data_path,filename, count_end=200)
73 |
74 | # Stitch and count rows
75 | if not os.path.exists(os.path.join(sdwis_data_path, f'{filename}.csv')):
76 | stitch_files(filename, sdwis_data_path)
77 | directory = os.path.join(sdwis_data_path, f'{filename}/')
78 | row_count = get_row_count(sdwis_data_path, f'{filename}.csv')
79 | print(f'Row count of {filename}.csv: {row_count}')
80 |
81 | else:
82 | print(f'{filename}.csv already exists and will not re-stitch.')
83 |
--------------------------------------------------------------------------------
/src/downloaders/download_tigris_ne.R:
--------------------------------------------------------------------------------
1 | # download TIGRIS places and Natural Earth (ne) coastline -----------------
2 |
3 | library(fs)
4 | library(sf)
5 | library(tigris)
6 | library(rmapshaper)
7 | library(readr)
8 | library(tidyverse)
9 | library(tidycensus)
10 |
11 |
12 | # path to save raw data
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | census_api_key <- Sys.getenv("CENSUS_API_KEY")
15 |
16 | # Tell tidycensus our key (don't forget to activate it first!)
17 | census_api_key(census_api_key)
18 |
19 | # download large files without timeout error
20 | options(timeout = 100000, tigris_use_cache = TRUE)
21 | states_list <- c(state.abb, "DC")
22 |
23 | # create dirs
24 | dir_create(path(data_path, "tigris"))
25 | dir_create(path(data_path, "ne/ocean"))
26 |
27 | # download all TIGRIS places, simplify polygons, save
28 | places <- tigris::places(states_list)
29 |
30 | places <- places %>%
31 | rmapshaper::ms_simplify(
32 | keep_shapes = TRUE,
33 | # https://github.com/ateucher/rmapshaper/issues/83
34 | # and https://github.com/ateucher/rmapshaper#using-the-system-mapshaper or
35 | # https://docs.npmjs.com/resolving-eacces-permissions-errors-when-installing-packages-globally
36 | sys = TRUE)
37 |
38 | write_rds(places, path(data_path, "tigris/tigris_places.rds"))
39 | cat("Downloaded and wrote TIGRIS places.\n")
40 |
41 | # download and write population data for TIGRIS places
42 | pop <- get_decennial(
43 | geography = "place", # census-designated places
44 | state = states_list,
45 | year = 2020,
46 | variables = "P1_001N", # selects population data for 2020
47 | geometry = FALSE,
48 | cb = FALSE
49 | ) %>%
50 | select(
51 | geoid = GEOID,
52 | name = NAME,
53 | population = value
54 | ) %>%
55 | write_csv(., path(data_path, "tigris/tigris_pop.csv"))
56 |
57 | # download and unzip Natural Earth oceans polygons, used to
58 | # remove water bodies from TIGRIS places in the transformer
59 | url_ne <- paste0("https://www.naturalearthdata.com/",
60 | "http//www.naturalearthdata.com/",
61 | "download/10m/physical/ne_10m_ocean.zip")
62 | download.file(url_ne,
63 | destfile = path(data_path, "ne/ocean/ocean.zip"))
64 |
65 | unzip(zipfile = path(data_path, "ne/ocean/ocean.zip"),
66 | exdir = path(data_path, "ne/ocean/ne-ocean-10m"))
67 | cat("Downloaded and wrote Natural Earth Oceans.\n")
68 |
--------------------------------------------------------------------------------
/src/downloaders/download_ucmr.R:
--------------------------------------------------------------------------------
1 | # download UCMR occurrence data -----------------------------------------------
2 |
3 | library(glue)
4 | library(fs)
5 |
6 | # Allow for longer timeout for download file
7 | options(timeout = 10000)
8 |
9 | # path to save raw data
10 | data_path <- Sys.getenv("WSB_DATA_PATH")
11 |
12 | # Data Source: UCMR Program, which records zip codes served
13 | ucmr3_url <- paste0("https://www.epa.gov/sites/default/files/2017-02/",
14 | "ucmr-3-occurrence-data.zip")
15 |
16 | ucmr4_url <- paste0("https://www.epa.gov/sites/default/files/2020-04/",
17 | "ucmr_4_occurrence_data.zip?VersionId=",
18 | "m3C_dKBtBPyz35yDVL_1uZLjGjHiZtwf")
19 |
20 | # create dir to store downloaded files
21 | dir_create(path(data_path, "ucmr"))
22 |
23 | # local paths to download files
24 | file_ucmr3 <- path(data_path, "ucmr/ucmr3.zip")
25 | file_ucmr4 <- path(data_path, "ucmr/ucmr4.zip")
26 |
27 | # download and unzip
28 | download.file(ucmr3_url, file_ucmr3)
29 | download.file(ucmr4_url, file_ucmr4, mode="wb")
30 |
31 | unzip(file_ucmr3, exdir = path(data_path, "ucmr"))
32 | unzip(file_ucmr4, exdir = path(data_path, "ucmr"))
33 |
34 | cat("Downloaded and unzipped UCMR3 and UCMR4 data.\n")
35 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_ar_wsb.R:
--------------------------------------------------------------------------------
1 | # Download AR water system data -------------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: Arkansas ArcGIS shapefile water system boundary
6 | url <- paste0("https://geostor-vectors.s3.amazonaws.com/Utilities/SHP/",
7 | "PUBLIC_WATER_SYSTEMS.zip")
8 |
9 | download_wsb(url, "ar")
10 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_az_wsb.R:
--------------------------------------------------------------------------------
1 | # Download AZ water system data -------------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: Arizona ArcGIS geojson water system boundary
6 | url <- paste0("https://opendata.arcgis.com/datasets/",
7 | "9992e59e46bb466584f9694f897f350a_0.geojson")
8 |
9 | download_wsb(url, "az")
10 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_ct_wsb.R:
--------------------------------------------------------------------------------
1 | # Download CT water system data -------------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: Connecticut ArcGIS shapefile water system boundary
6 | url <- paste0("https://portal.ct.gov/-/media/Departments-and-Agencies/",
7 | "DPH/dph/drinking_water/GIS/",
8 | "Buffered_Community_PWS_Service_Areas.zip")
9 |
10 | download_wsb(url, "ct")
11 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_il_wsb.R:
--------------------------------------------------------------------------------
1 | # Download IL water system data -------------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: IL Geospatial Data Clearinghouse
6 | url <- paste0("https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISWS/Hydrology/zips/",
7 | "Illinois_Municipal_Water_Use_2012.zip")
8 |
9 | download_wsb(url, "il")
10 |
11 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_ks_wsb.R:
--------------------------------------------------------------------------------
1 | # Download KS water system data -------------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: Kansas ArcGIS shapefile water system boundary
6 | url <- paste0("https://data.kansasgis.org/catalog/",
7 | "administrative_boundaries/shp/pws/PWS_bnd_2021_0430.zip")
8 |
9 | download_wsb(url, "ks")
10 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_mo_wsb.R:
--------------------------------------------------------------------------------
1 | # Download MO water service boundaries ------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: Missouri ArcGIS geojson water system boundary
6 | url <- paste0("https://opendata.arcgis.com/datasets/",
7 | "c3bee75a86e04856b28d7f1ce2a24e6f_0.geojson")
8 |
9 | download_wsb(url, "mo")
10 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_nc_wsb.R:
--------------------------------------------------------------------------------
1 | # Download NC water service boundaries ------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: North Carolina ArcGIS geojson water system boundary
6 | url <- paste0("https://opendata.arcgis.com/datasets/",
7 | "58548b90bdfd4148829103ac7f4db9ce_4.geojson")
8 |
9 | download_wsb(url, "nc")
10 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_nj_wsb.R:
--------------------------------------------------------------------------------
1 | # Download NJ water system data -------------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: New Jersey ArcGIS geojson water system boundary
6 | url <- paste0("https://opendata.arcgis.com/datasets/",
7 | "00e7ff046ddb4302abe7b49b2ddee07e_13.geojson")
8 |
9 | download_wsb(url, "nj")
10 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_nm_wsb.R:
--------------------------------------------------------------------------------
1 | # Download NM water system data -------------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: New Mexico ArcGIS geojson water system boundary
6 | url <- paste0("https://catalog.newmexicowaterdata.org/dataset/",
7 | "5d069bbb-1bfe-4c83-bbf7-3582a42fce6e/resource/",
8 | "ccb9f5ce-aed4-4896-a2f1-aba39953e7bb/download/pws_nm.geojson")
9 |
10 | download_wsb(url, "nm")
11 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_ok_wsb.R:
--------------------------------------------------------------------------------
1 | # Download OK water system data -------------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: Oklahoma ArcGIS geojson water system boundary
6 | url <- paste0("https://opendata.arcgis.com/datasets/",
7 | "d015bc14d3b84b8985ff3a4fd55c0844_0.geojson")
8 |
9 | download_wsb(url, "ok")
10 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_pa_wsb.R:
--------------------------------------------------------------------------------
1 | # Download PA water system data -------------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: Pennsylvania ArcGIS geojson water system boundary
6 | url <- "https://www.pasda.psu.edu/json/PublicWaterSupply2022_01.geojson"
7 |
8 | download_wsb(url, "pa")
9 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_ri_wsb.R:
--------------------------------------------------------------------------------
1 | # Download RI water system data -------------------------------------------
2 | library(geojsonsf)
3 | library(fs)
4 | library(sf)
5 | library(urltools)
6 |
7 | # Path to save raw data
8 | data_path <- Sys.getenv("WSB_DATA_PATH")
9 |
10 | # Data Source: Rhode Island ArcGIS geojson water system boundary (ESRI REST endpoint)
11 | url <- paste0("https://risegis.ri.gov/hosting/rest/services/RIDEM/RI_DrinkingWater_ServiceAreas/",
12 | "FeatureServer/1/query?where=1%3D1&outFields=*&f=geojson")
13 |
14 | # Use geojson reader for ESRI Rest end points to get data
15 | ri <- geojson_sf(url)
16 |
17 | # Create outputted file directory
18 | dir_path <- paste0(data_path, "/boundary/ri")
19 | dir_create(dir_path)
20 |
21 | # Write RI geojson
22 | path_out <- paste0(dir_path, "/ri.geojson")
23 | if(file_exists(path_out)) file_delete(path_out)
24 |
25 | st_write(ri, path_out)
26 |
27 |
28 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_state_helpers.R:
--------------------------------------------------------------------------------
1 | # State WSB downloader helper functions -----------------------------------
2 |
3 | # suppress warning: package ‘fs’ was built under R version ...
4 | suppressWarnings(suppressMessages(library(fs)))
5 |
6 | # path to save raw data
7 | data_path <- Sys.getenv("WSB_DATA_PATH")
8 |
9 | # function to download url
10 | # file_ext argument is optional; if not provided, generated from url ending
11 | download_wsb <- function(url, state, file_ext) {
12 |
13 | cat("Starting download for", toupper(state), "boundary data...\n\n")
14 |
15 | # create outputted file directory
16 | dir_path <- path(data_path, paste0("boundary/", state))
17 | dir_create(dir_path)
18 |
19 | # get file extension to create outputted file name and path
20 | if (missing(file_ext)) {
21 | file_ext <- sub(".*\\.", "", url)
22 | }
23 | file_name <- paste0(state, ".", file_ext)
24 | file_path <- path(dir_path, file_name)
25 |
26 | # download url
27 | download.file(url, file_path)
28 |
29 | # unzip if the downloaded file has a zip extension
30 | if (file_ext == "zip") {
31 | unzip_wsb(file_path, dir_path, state)
32 | } else {
33 | cat("Downloaded", toupper(state), "boundary data.\n\n\n")
34 | }
35 | }
36 |
37 | # function to unzip file
38 | unzip_wsb <- function(file_path, dir_path, state) {
39 | # unzip file
40 | unzip(zipfile = file_path, exdir = dir_path)
41 | cat("Downloaded and unzipped", toupper(state), "boundary data.\n\n\n")
42 | }
43 |
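
The file_ext fallback is a single greedy substitution that keeps everything after the last dot. An equivalent one-liner in Python, with a made-up URL:

import re

url = "https://example.com/data/PUBLIC_WATER_SYSTEMS.zip"  # made-up URL
file_ext = re.sub(r".*\.", "", url)  # mirrors sub(".*\\.", "", url) in R
assert file_ext == "zip"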
--------------------------------------------------------------------------------
/src/downloaders/states/download_ut_wsb.R:
--------------------------------------------------------------------------------
1 | # Download UT water system data -------------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: Utah ArcGIS geojson water system boundary
6 | url <- paste0("https://services.arcgis.com/ZzrwjTRez6FJiOq4/arcgis/rest/",
7 | "services/CulinaryWaterServiceAreas/FeatureServer/0/",
8 | "query?outFields=*&where=1%3D1&f=geojson")
9 |
10 | download_wsb(url, "ut", "geojson")
11 |
--------------------------------------------------------------------------------
/src/downloaders/states/download_wa_wsb.R:
--------------------------------------------------------------------------------
1 | # Download WA water system data -------------------------------------------
2 |
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 |
5 | # Data Source: Washington ArcGIS geojson water system boundary
6 | url <- paste0("https://opendata.arcgis.com/datasets/",
7 | "b09475f47a5a46ca90fe6a168fb22e6d_0.geojson")
8 |
9 | download_wsb(url, "wa")
10 |
--------------------------------------------------------------------------------
/src/functions/f_clean_whitespace_nas.R:
--------------------------------------------------------------------------------
1 | # trim whitespace and replace common NA values with actual NAs
2 | f_clean_whitespace_nas <- function(df){
3 |
4 | # if df is spatial, detach geometry before cleaning cols
5 |   if (inherits(df, "sf")) {
6 | geom = df$geometry
7 | df = st_drop_geometry(df)
8 | }
9 |
10 | # apply whitespace and NA cleaning
11 | df = dplyr::mutate_all(df, stringr::str_trim, "both") |>
12 | # all whitespace becomes "", so the next pattern handles all cases
13 | dplyr::mutate_all(dplyr::na_if, "") |>
14 | dplyr::mutate_all(dplyr::na_if, "NULL") |>
15 | dplyr::mutate_all(dplyr::na_if, "NA") |>
16 | dplyr::mutate_all(dplyr::na_if, "N/A")
17 |
18 | # reattach geometry if the object is spatial
19 | if(exists("geom")) {
20 | df = st_as_sf(bind_cols(df, geometry = geom))
21 | }
22 |
23 | return(df)
24 | }
25 |
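
A rough pandas equivalent of the same trim-then-NA idea, assuming all-character columns (toy data, not pipeline code):

import pandas as pd

df = pd.DataFrame({"a": [" x ", "NULL", "N/A"], "b": ["  ", "NA", " y"]})

cleaned = (df
    .apply(lambda s: s.str.strip())  # trim both sides; all-whitespace becomes ""
    .replace({"": None, "NULL": None, "NA": None, "N/A": None}))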
--------------------------------------------------------------------------------
/src/functions/f_drop_imposters.R:
--------------------------------------------------------------------------------
1 | # drops imposters, which are geometries that report being in one state
2 | # but that actually are located in another. This function filters to
3 | # valid rows where input geom falls within state geoms (non-imposters),
4 | # and sinks a log file of invalid geoms (imposters) to review.
5 | # See GH Issue #45: https://github.com/SimpleLab-Inc/wsb/issues/45
6 |
7 | f_drop_imposters <- function(d, path_log){
8 |
9 | # error if the supplied input is not sf
10 | if(!"sf" %in% class(d)){
11 | stop("Input object is not of type `sf`.", call. = FALSE)
12 | }
13 |
14 |   # error if the supplied object does not have a `state` column
15 | if(!"state" %in% colnames(d)){
16 | stop("Column `state` missing from input object.", call. = FALSE)
17 | }
18 |
19 | # if the log path doesn't exist, create it
20 | if(!dir_exists(here::here("log"))) dir_create(here::here("log"))
21 |
22 | # reported state name
23 | d = mutate(d, state_reported = state)
24 |
25 | # create state name to abbreviation key with built-in R objects
26 | key = tibble(name = state.name, state_intersection = state.abb)
27 |
28 | # pull usa state geometries, project to input data CRS
29 | usa = USAboundaries::us_states(resolution = "high") %>%
30 | st_transform(st_crs(d)$epsg) %>%
31 | select(state_intersection = state_abbr, geometry) %>%
32 | suppressMessages()
33 |
34 | # spatial join input data to usa state polygons.
35 | # filter to valid and invalid geometries for returned objects
36 | cat("Joining input object geometries to USA state polygons...")
37 | d_joined = st_join(d, usa)
38 | cat("done.\n\n")
39 |
40 | # valid geometries: reported state == intersected state
41 | d_valid = d_joined %>%
42 | filter(
43 | state_reported == state_intersection |
44 | # also return when an input geometry doesn't intersect the USA geom
45 | is.na(state_intersection)
46 | ) %>%
47 | select(-state_reported, -state_intersection)
48 |
49 | # imposters: reported state != intersected state
50 | d_imposter = d_joined %>%
51 | filter(
52 | state_reported != state_intersection |
53 | # also return when a state isn't reported
54 | is.na(state_reported)
55 | ) %>%
56 | select(state_reported, state_intersection, everything()) %>%
57 | st_drop_geometry()
58 |
59 | # print stats on valid/invalid geometries
60 | nrow_d = nrow(d_joined) %>% formatC(big.mark = ",")
61 | nrow_dv = nrow(d_valid) %>% formatC(big.mark = ",")
62 | nrow_imp = nrow(d_imposter) %>% formatC(big.mark = ",")
63 | p_valid = round(((nrow(d_valid)/nrow(d_joined))*100), 2)
64 |
65 | cat(nrow_dv, "/", nrow_d, "rows are valid", "(",
66 | p_valid, "% of input data).\n\n")
67 |
68 | # sink invalid pwsids (even with dupes, e.g. FRS) to a log file
69 | write_csv(d_imposter, path_log)
70 | cat("Wrote", nrow_imp, "imposters for review to", path_log, "\n")
71 |
72 | # return valid geometries as an object
73 |   cat("Returning valid geometries.\n\n")
74 |   return(d_valid)
75 | }
76 |
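
A miniature geopandas version of the reported-vs-intersected check, with toy square "states" (made-up geometries, not pipeline code):

import geopandas as gpd
from shapely.geometry import Point, box

pts = gpd.GeoDataFrame({"state_reported": ["CA", "CA"]},
                       geometry=[Point(1, 1), Point(9, 9)], crs=3310)
states = gpd.GeoDataFrame({"state_intersection": ["CA", "NV"]},
                          geometry=[box(0, 0, 5, 5), box(5, 5, 10, 10)], crs=3310)

joined = gpd.sjoin(pts, states, how="left", predicate="intersects")
valid = joined[joined["state_reported"] == joined["state_intersection"]]
# the second point reports CA but falls in NV, so it is an imposter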
--------------------------------------------------------------------------------
/src/match/0-init.py:
--------------------------------------------------------------------------------
1 | """
2 | This script simply sets up the database.
3 | """
4 | #%%
5 |
6 | import os
7 | from dotenv import load_dotenv
8 | import sqlalchemy as sa
9 |
10 | load_dotenv()
11 |
12 | # Connect to local PostGIS instance
13 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
14 |
15 | #%%
16 |
17 | # Read in the SQL and execute against the database
18 | this_folder = os.path.dirname(__file__)
19 |
20 | with open(this_folder + "/init_model.sql") as file:
21 | sql = file.read()
22 |
23 | conn.execute(sql)
--------------------------------------------------------------------------------
/src/match/2-cleansing.py:
--------------------------------------------------------------------------------
1 | #%%
2 | import os
3 | import pandas as pd
4 | import geopandas as gpd
5 | from dotenv import load_dotenv
6 | import sqlalchemy as sa
7 |
8 | load_dotenv()
9 |
10 | pd.options.display.max_columns = None
11 |
12 | EPSG = os.environ["WSB_EPSG"]
13 | PROJ = os.environ["WSB_EPSG_AW"]
14 |
15 | # Connect to local PostGIS instance
16 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
17 |
18 |
19 | def _run_cleanse_rule(conn, rule_name: str, sql: str):
20 | result = conn.execute(sql)
21 | print(f"Ran cleanse rule '{rule_name}': {result.rowcount} rows affected")
22 |
23 | #%%
24 | # First apply a bunch of SQL cleanses
25 |
26 | PO_BOX_REGEX = r'^P[\. ]?O\M\.? *BOX +\d+$'
27 |
28 | # Upper-case columns
29 | for col in [
30 | "name", "address_line_1", "address_line_2", "city", "state",
31 | "county", "city_served", "centroid_quality"
32 | ]:
33 | _run_cleanse_rule(conn,
34 | f"Upper-case {col}",
35 | f"""
36 | UPDATE pws_contributors
37 | SET {col} = UPPER({col})
38 | WHERE
39 | {col} ~ '[a-z]';
40 | """)
41 |
42 | _run_cleanse_rule(conn,
43 | "NULL out nonexistent zip code '99999'",
44 | f"""
45 | UPDATE pws_contributors
46 | SET zip = NULL
47 | WHERE
48 | zip = '99999';
49 | """)
50 |
51 | _run_cleanse_rule(conn,
52 | "Remove PO BOX from address_line_1",
53 | f"""
54 | UPDATE pws_contributors
55 | SET
56 | address_quality = 'PO BOX',
57 | address_line_1 = NULL
58 | WHERE
59 | address_line_1 ~ '{PO_BOX_REGEX}';
60 | """)
61 |
62 | _run_cleanse_rule(conn,
63 | "Remove PO BOX from address_line_2",
64 | f"""
65 | UPDATE pws_contributors
66 | SET
67 | address_quality = 'PO BOX',
68 | address_line_2 = NULL
69 | WHERE
70 | address_line_2 ~ '{PO_BOX_REGEX}';
71 | """)
72 |
73 | _run_cleanse_rule(conn,
74 | "If there's an address in line 2 but not line 1, move it",
75 | f"""
76 | UPDATE pws_contributors
77 | SET
78 | address_line_1 = address_line_2,
79 | address_line_2 = NULL
80 | WHERE
81 | (address_line_1 IS NULL OR address_line_1 = '') AND
82 | address_line_2 IS NOT NULL;
83 | """)
84 |
85 | _run_cleanse_rule(conn,
86 | "Standardize geometry quality",
87 | f"""
88 | UPDATE pws_contributors
89 | SET centroid_quality = 'ZIP CODE CENTROID'
90 | WHERE
91 | centroid_quality = 'ZIP CODE-CENTROID';
92 | """)
93 |
94 | #%%
95 | #####################
96 | # Handle Impostors
97 | #####################
98 |
99 | print("Checking for impostors...")
100 |
101 | # Pull data from the DB
102 | df = gpd.GeoDataFrame.from_postgis("""
103 | SELECT
104 | contributor_id,
105 | source_system,
106 | state,
107 | primacy_agency_code,
108 | geometry
109 | FROM pws_contributors
110 | WHERE
111 | source_system IN ('echo', 'frs') AND
112 | geometry IS NOT NULL AND
113 | NOT st_isempty(geometry)
114 | """, conn, geom_col="geometry"
115 | ).set_index("contributor_id")
116 |
117 | # Convert to projected
118 | df = df.to_crs(PROJ)
119 |
120 | # How many entries where primacy_agency_code differs from primacy_agency? 738
121 | # How many entries where primacy_agency_code is numeric? 379
122 | # Entries where state is numeric? 0
123 | # Entries where state is null? 0
124 |
125 | # In cases where primacy_agency_code is numeric, sub in the state
126 | mask = df["primacy_agency_code"].str.contains(r"\d\d", regex=True)
127 | df.loc[mask, "primacy_agency_code"] = df.loc[mask]["state"]
128 |
129 | #%%
130 |
131 | # Read in state boundaries and convert to projected CRS
132 | states = (gpd
133 | .read_file("../layers/us_states.geojson")
134 | [["stusps", "geometry"]]
135 | .rename(columns={"stusps": "state"})
136 | .set_index("state")
137 | .to_crs(PROJ))
138 |
139 | #%%
140 |
141 | # Series 1 is pwsid + geometry
142 | s1 = df["geometry"]
143 |
144 | # Series 2 is generic state bounds joined to each pwsid on primacy_agency_code
145 | s2 = (df
146 | .drop(columns="geometry")
147 | .join(states, on="primacy_agency_code")
148 | ["geometry"])
149 |
150 | # Calculate the distance between the supplied boundary and the expected state
151 | distances = s1.distance(s2, align=True)
152 |
153 | # Any that are >50 kilometers are impostors
154 | impostors = (df
155 | .loc[distances[(distances > 50_000)].index]
156 | .to_crs("epsg:" + EPSG)
157 | .reset_index())
158 |
159 | print(f"Found {len(impostors)} impostors.")
160 |
161 | #%%
162 | # Save to the database
163 | impostors.to_postgis("impostors", conn, if_exists="replace")
164 |
165 | #%%
166 | # Remove the address, lat/lon, and geometry when it's an "impostor"
167 | conn.execute("""
168 | UPDATE pws_contributors
169 | SET
170 | address_line_1 = NULL,
171 | address_line_2 = NULL,
172 | city = NULL,
173 | state = NULL,
174 | zip = NULL,
175 | geometry = 'GEOMETRYCOLLECTION EMPTY',
176 | centroid_lat = NULL,
177 | centroid_lon = NULL
178 | WHERE
179 | contributor_id IN (SELECT contributor_id FROM impostors);
180 |     """)
181 |
182 | print("Null'd out impostor addresses and lat/lon.")
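
The >50 km rule is a plain projected-distance test; a toy sketch with made-up geometries in a metric CRS:

import geopandas as gpd
from shapely.geometry import Point

facility = gpd.GeoSeries([Point(0, 0)], crs=3310)
claimed_state = gpd.GeoSeries([Point(100_000, 0).buffer(10_000)], crs=3310)

# distance in meters; anything beyond 50 km from the claimed state is an impostor
is_impostor = facility.distance(claimed_state, align=False) > 50_000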
--------------------------------------------------------------------------------
/src/match/4-rank_boundary_matches.py:
--------------------------------------------------------------------------------
1 | #%%
2 |
3 | import os
4 | import numpy as np
5 | import pandas as pd
6 | import sqlalchemy as sa
7 | from dotenv import load_dotenv
8 |
9 | from match.match_scorer import MatchScorer
10 |
11 | load_dotenv()
12 |
13 | STAGING_PATH = os.environ["WSB_STAGING_PATH"]
14 | EPSG = os.environ["WSB_EPSG"]
15 | PROJ = os.environ["WSB_EPSG_AW"]
16 |
17 | # Connect to local PostGIS instance
18 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
19 |
20 | #%%
21 | matches = pd.read_sql("""
22 | SELECT
23 | m.master_key,
24 | m.candidate_contributor_id,
25 | m.match_rule,
26 | s.name AS sdwis_name,
27 | s.population_served_count AS sdwis_pop,
28 | c.name AS tiger_name,
29 | c.population_served_count AS tiger_pop
30 | FROM matches m
31 | JOIN pws_contributors c ON m.candidate_contributor_id = c.contributor_id AND c.source_system = 'tiger'
32 | JOIN pws_contributors s ON s.master_key = m.master_key AND s.source_system = 'sdwis';
33 | """, conn)
34 |
35 | print("Read matches from database.")
36 |
37 |
38 | #%% ##########################
39 | # Generate some TIGER match stats
40 | ##############################
41 |
42 | # How often do we match to multiple tigers?
43 | pws_to_tiger_match_counts = (matches
44 | .groupby("master_key")
45 | .size())
46 |
47 | pws_to_tiger_match_counts.name = "pws_to_tiger_match_count"
48 |
49 | # Let's also do it the other direction
50 | tiger_to_pws_match_counts = (matches
51 | .groupby("candidate_contributor_id")
52 | .size())
53 |
54 | tiger_to_pws_match_counts.name = "tiger_to_pws_match_count"
55 |
56 | # 1850 situations with > 1 match
57 | print(f"{(pws_to_tiger_match_counts > 1).sum()} PWSs matched to multiple TIGERs")
58 |
59 | # 3631 TIGERs matched to multiple PWSs
60 | print(f"{(tiger_to_pws_match_counts > 1).sum()} TIGERs matched to multiple PWSs")
61 |
62 | #%% #########################
63 | # Figure out our strongest match rules
64 | #############################
65 | scorer = MatchScorer()
66 | scored_matches = scorer.score_tiger_matches(matches)
67 |
68 | #%%
69 | """
70 | Use the "scored" data to determine which rules (and combos of rules)
71 | are most effective.
72 | """
73 |
74 | # Assign a "rank" to each match rule and combo of match rules
75 | match_rule_ranks = (matches
76 | .join(scored_matches, on=["master_key", "candidate_contributor_id"])
77 | .groupby(["match_rule"])
78 | .agg(
79 | points = ("score", "sum"),
80 | total = ("score", "size")
81 | )) #type:ignore
82 |
83 | match_rule_ranks["score"] = match_rule_ranks["points"] / match_rule_ranks["total"]
84 | match_rule_ranks = match_rule_ranks.sort_values("score", ascending=False)
85 | match_rule_ranks["match_rule_rank"] = np.arange(len(match_rule_ranks))
86 |
87 | print("Identified best match rules based on labeled data.")
88 |
89 | #%% ###########################
90 | # Rank all PWS<->TIGER matches
91 | ###############################
92 |
93 | # Assign the match rule ranks back to the matches
94 | matches_ranked = matches.join(
95 | match_rule_ranks[["match_rule_rank"]], on="match_rule", how="left")
96 |
97 | # Flag any that have name matches
98 | matches_ranked["name_match"] = matches.apply(lambda x: x["tiger_name"] in x["sdwis_name"], axis=1)
99 |
100 | # Flag the best population within each TIGER match set
101 | # (Note this should be done AFTER removing the best PWS->TIGER, if we're doing that)
102 | matches_ranked["pop_diff"] = abs(matches["tiger_pop"] - matches["sdwis_pop"])
103 |
104 | # To get PWS<->TIGER to be 1:1, we'll rank on different metrics
105 | # and then select the top one. We need to do this twice:
106 | # Once to make PWS->Tiger N:1 and then to make Tiger->PWS 1:1
107 |
108 | #%%
109 | # Through experimentation, this seemed to be the best ranking:
110 | # name_match, match_rule_rank, pop_diff
111 | # and selecting within the candidate_contributor groups first,
112 | # master_key groups second.
113 |
114 | # Assign numeric ranks to every match
115 | matches_ranked = (matches_ranked
116 | .sort_values(
117 | ["name_match", "match_rule_rank", "pop_diff"],
118 | ascending=[False, True, True])
119 | # Re-number and bring that index into the df
120 | # This gives us a simple column to rank on
121 | .reset_index(drop=True)
122 | .reset_index(drop=False)
123 | .rename(columns={"index": "overall_rank"}))
124 |
125 | # This is technically unnecessary, because it's equivalent to sorting on overall_rank...
126 | # but maybe it makes things a little clearer?
127 | matches_ranked["master_group_ranking"] = \
128 | (matches_ranked
129 | .groupby("master_key")
130 | ["overall_rank"]
131 | .rank("dense")
132 | .astype("int"))
133 |
134 | #%%
135 | # Identify the 1-1 matches using the overall_rank
136 | best_matches = (matches_ranked
137 | .sort_values(["overall_rank"])
138 | .drop_duplicates(subset="candidate_contributor_id", keep="first")
139 | .drop_duplicates(subset="master_key", keep="first")).index
140 |
141 | matches_ranked["best_match"] = matches_ranked.index.isin(best_matches)
142 |
143 | #%%
144 |
145 | print("Scoring 1:1 matches...")
146 |
147 | # Score it. how'd we do?
148 | scored_best_matches = scorer.score_tiger_matches(
149 | matches_ranked
150 | .loc[matches_ranked["best_match"]]
151 | [["master_key", "candidate_contributor_id"]])
152 |
153 | # ~ 96%
154 | score = scored_best_matches["score"].sum() * 100 / len(scored_best_matches)
155 |
156 | print(f"Boundary match score: {score:.2f}")
157 |
158 | #%%
159 | matches_ranked.to_sql("matches_ranked", conn, if_exists="replace", index=False)
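
The double drop_duplicates trick that enforces the 1:1 pairing is worth seeing in miniature (made-up rows, not pipeline code):

import pandas as pd

m = pd.DataFrame({
    "master_key":   ["A", "A", "B"],
    "candidate":    ["t1", "t2", "t1"],
    "overall_rank": [0, 1, 2],
})
best = (m.sort_values("overall_rank")
         .drop_duplicates(subset="candidate", keep="first")    # each TIGER used once
         .drop_duplicates(subset="master_key", keep="first"))  # each PWS matched once
# A pairs with t1; B goes unmatched because a better-ranked match claimed t1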
160 |
--------------------------------------------------------------------------------
/src/match/5-select_modeled_centroids.py:
--------------------------------------------------------------------------------
1 | """
2 | This script takes centroids from ECHO, FRS, UCMR, and MHP
3 | and tries to select the best one to feed into the model
4 | for each PWSID.
5 | """
6 |
7 | #%%
8 |
9 | import os
10 | import numpy as np
11 | import pandas as pd
12 | import geopandas as gpd
13 | import sqlalchemy as sa
14 | from shapely.geometry import Polygon
15 | from dotenv import load_dotenv
16 |
17 | import match.helpers as helpers
18 |
19 | load_dotenv()
20 |
21 | STAGING_PATH = os.environ["WSB_STAGING_PATH"]
22 | EPSG = os.environ["WSB_EPSG"]
23 | PROJ = os.environ["WSB_EPSG_AW"]
24 |
25 | # Connect to local PostGIS instance
26 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
27 |
28 |
29 | #%%
30 | # Load up the data sources
31 |
32 | print("Pulling in data from database...", end="")
33 |
34 | sdwis = gpd.GeoDataFrame.from_postgis("""
35 | SELECT *
36 | FROM pws_contributors
37 | WHERE source_system = 'sdwis';""",
38 | conn, geom_col="geometry")
39 |
40 | stack = pd.read_sql("""
41 |
42 | -- ECHO, FRS, and UCMR are all already labeled with PWSIDs
43 | SELECT
44 | c.contributor_id, c.source_system, c.master_key,
45 | c.centroid_lat, c.centroid_lon, c.centroid_quality,
46 | 1 as master_group_ranking
47 | FROM pws_contributors c
48 | WHERE source_system IN ('echo', 'frs', 'ucmr')
49 |
50 | UNION ALL
51 |
52 | -- Since we don't know PWSID's for MHP and TIGER, we need
53 | -- to join to matches to sub in their matched MK's
54 |
55 | -- Join MHP to matches
56 | SELECT
57 | c.contributor_id, c.source_system, m.master_key,
58 | c.centroid_lat, c.centroid_lon, c.centroid_quality,
59 | 1 as master_group_ranking
60 | FROM pws_contributors c
61 | JOIN matches m ON m.candidate_contributor_id = c.contributor_id
62 | WHERE source_system = 'mhp'
63 |
64 | UNION ALL
65 |
66 | -- Join Tiger to matches
67 | SELECT
68 | c.contributor_id, c.source_system, m.master_key,
69 | c.centroid_lat, c.centroid_lon, c.centroid_quality,
70 | -- This helps us decide the best tiger match
71 | m.master_group_ranking
72 | FROM pws_contributors c
73 | JOIN matches_ranked m ON m.candidate_contributor_id = c.contributor_id
74 | WHERE source_system = 'tiger'
75 |
76 | ORDER BY master_key;""",
77 | conn)
78 |
79 | print("done.")
80 |
81 | # Add sourcing notes to the geometries
82 | stack["centroid_quality"] = stack["source_system"].str.upper() + ": " + stack["centroid_quality"]
83 |
84 |
85 | #%% ###########################
86 | # Find the best centroid from the candidate contributors
87 | ###############################
88 |
89 | # Ranking:
90 | # Best MHP >
91 | # Echo (if not state or county centroid) >
92 | # FRS >
93 | # UCMR >
94 | # Boundary >
95 | # Echo (if state or county centroid)
96 |
97 | # We want the best centroid from all contributors. Assign a ranking:
98 | # MHP = 1
99 | # Echo = 2 if not state/county centroid
100 | # FRS = 3
101 | # UCMR = 4
102 | # Boundary = 5
103 | # Echo = 6 if state/county centroid
104 |
105 | stack["system_rank"] = stack["source_system"].map({
106 | "mhp": 1,
107 | "echo": 2,
108 | "frs": 3,
109 | "ucmr": 4,
110 | "tiger": 5
111 | })
112 |
113 | # Change Echo to 6 if state/county centroid
114 | mask = (
115 | (stack["source_system"] == "echo") &
116 | (stack["centroid_quality"].isin(["ECHO: STATE CENTROID", "ECHO: COUNTY CENTROID"])))
117 |
118 | stack.loc[mask, "system_rank"] = 6
119 |
120 | #%%
121 | # In case there are multiple matches from the same system,
122 | # we need tiebreakers.
123 | # Go by:
124 | # 1) System Ranking
125 | # 2) master_group_ranking
126 | # 3) contributor_id (tiebreaker - to ensure consistency)
127 |
128 | # Note that only MHP and Tiger could potentially have multiple matches
129 |
130 | # Keep only the first entry in each subset
131 | best_centroid = (stack
132 | .sort_values([
133 | "master_key",
134 | "system_rank",
135 | "master_group_ranking",
136 | "contributor_id"])
137 | .drop_duplicates(subset="master_key", keep="first")
138 | .set_index("master_key"))
139 |
140 |
141 | #%% ##########################
142 | # Generate the final table
143 | ##############################
144 |
145 | # Start with SDWIS as the base, but drop/override a few columns
146 | output = (sdwis
147 | .drop(columns=["centroid_lat", "centroid_lon", "centroid_quality"])
148 | .assign(
149 | contributor_id = "modeled." + sdwis["pwsid"],
150 | source_system = "modeled",
151 | source_system_id = sdwis["pwsid"],
152 | master_key = sdwis["pwsid"],
153 | tier = 3,
154 | geometry_source_detail = "Modeled"
155 | ))
156 |
157 |
158 | # Supplement with best centroid
159 | output = (output
160 | .merge(best_centroid[[
161 | "centroid_lat",
162 | "centroid_lon",
163 | "centroid_quality",
164 | ]], on="master_key", how="left"))
165 |
166 | # Verify: We should still have exactly the same number of pwsid's as we started with
167 | if not (len(output) == len(sdwis)):
168 | raise Exception("Output was filtered or denormalized")
169 |
170 | print("Joined several data sources into final output.")
171 |
172 | #%%
173 | output = gpd.GeoDataFrame(output)
174 | output["geometry"] = Polygon([])
175 | output = output.set_crs(epsg=EPSG, allow_override=True)
176 |
177 | #%% ########################
178 | # Save back to the DB
179 | ############################
180 |
181 | helpers.load_to_postgis("modeled", output)
--------------------------------------------------------------------------------
/src/match/helpers.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Optional
3 |
4 | import sqlalchemy as sa
5 | import pandas as pd
6 | from dotenv import load_dotenv
7 |
8 | load_dotenv()
9 |
10 | DATA_PATH = os.environ["WSB_STAGING_PATH"]
11 |
12 |
13 | def load_to_postgis(source_system: str, df: pd.DataFrame):
14 |
15 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
16 | TARGET_TABLE = "pws_contributors"
17 |
18 | print(f"Removing existing {source_system} data from database...", end="")
19 | conn.execute(f"DELETE FROM {TARGET_TABLE} WHERE source_system = '{source_system}';")  # source_system is an internal constant, not user input
20 | print("done")
21 |
22 | print(f"Loading {source_system} to database...", end="")
23 | df.to_postgis(TARGET_TABLE, conn, if_exists="append")
24 | print("done.")
25 |
26 |
27 | def get_pwsids_of_interest():
28 |
29 | sdwis = pd.read_csv(
30 | DATA_PATH + "/sdwis_water_system.csv",
31 | usecols=["pwsid", "pws_activity_code", "pws_type_code"],
32 | dtype="string")
33 |
34 | # Filter to only active community water systems
35 | # Starts as 400k, drops to ~50k after this filter
36 | # Keep only "A" for active
37 | return sdwis.loc[
38 | (sdwis["pws_activity_code"].isin(["A"])) &
39 | (sdwis["pws_type_code"] == "CWS")
40 | ]["pwsid"]
41 |
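42 | # Illustrative usage (a sketch; mirrors how the map_*.py scripts call these helpers;
43 | # `source_df` is a hypothetical source frame with a pwsid column):
44 | #
45 | # pwsids = get_pwsids_of_interest()
46 | # source_df = source_df.loc[source_df["pwsid"].isin(pwsids)]
47 | # load_to_postgis("echo", source_df)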
--------------------------------------------------------------------------------
/src/match/init_model.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE IF EXISTS pws_contributors;
2 |
3 | CREATE TABLE pws_contributors (
4 | contributor_id TEXT NOT NULL PRIMARY KEY,
5 | source_system TEXT NOT NULL,
6 | source_system_id TEXT NOT NULL,
7 | master_key TEXT NOT NULL,
8 | tier INT,
9 | pwsid TEXT,
10 | name TEXT,
11 | address_line_1 TEXT,
12 | address_line_2 TEXT,
13 | city TEXT,
14 | state CHAR(2),
15 | zip CHAR(5),
16 | county TEXT,
17 | address_quality TEXT,
18 | city_served TEXT,
19 | primacy_agency_code TEXT,
20 | primacy_type TEXT,
21 | population_served_count INT,
22 | service_connections_count INT,
23 | owner_type_code CHAR(1),
24 | service_area_type_code TEXT,
25 | is_wholesaler_ind BOOLEAN,
26 | primary_source_code TEXT,
27 | centroid_lat DECIMAL(10, 8),
28 | centroid_lon DECIMAL(11, 8),
29 | centroid_quality TEXT,
30 | geometry_source_detail TEXT,
31 | geometry GEOMETRY(GEOMETRY, 4326)
32 | );
33 |
34 | CREATE INDEX ix__pws_contributors__source_system ON pws_contributors (source_system);
35 | CREATE INDEX ix__pws_contributors__source_system_id ON pws_contributors (source_system_id);
36 | CREATE INDEX ix__pws_contributors__master_key ON pws_contributors (master_key);
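37 |
38 | -- Conventions (per the src/match/map_*.py scripts):
39 | -- contributor_id embeds the source system, e.g. 'sdwis.<pwsid>' or 'tiger.<geoid>';
40 | -- master_key is the pwsid where known, or a placeholder like 'UNK-mhp.<id>' /
41 | -- 'UNK-tiger.<geoid>' for sources (MHP, TIGER) not yet matched to a pwsid.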
--------------------------------------------------------------------------------
/src/match/map_contributed.py:
--------------------------------------------------------------------------------
1 | #%%
2 |
3 | import os
4 | import geopandas as gpd
5 | import match.helpers as helpers
6 | from dotenv import load_dotenv
7 |
8 | load_dotenv()
9 |
10 | DATA_PATH = os.environ["WSB_STAGING_PATH"]
11 |
12 | #%%
13 |
14 | contrib = gpd.read_file(os.path.join(DATA_PATH, "contributed_pws.gpkg"))
15 |
16 | #%%
17 |
18 | # Remove GeometryCollections -- they cause problems later.
19 | # (Polygons and MultiPolygons are OK)
20 |
21 | before = len(contrib)
22 | contrib = contrib[~(contrib.geom_type == "GeometryCollection")]
23 |
24 | if len(contrib) < before:
25 | print(f"Removed {before - len(contrib)} GeometryCollection type geometries.")
26 |
27 | #%%
28 | # Check assumptions
29 | assert contrib["pwsid"].is_unique
30 |
31 | #%%
32 |
33 | df = gpd.GeoDataFrame().assign(
34 | source_system_id = contrib["pwsid"],
35 | source_system = "contributed",
36 | contributor_id = "contributed." + contrib["pwsid"],
37 | master_key = contrib["pwsid"],
38 | pwsid = contrib["pwsid"],
39 | state = contrib["state"],
40 | name = contrib["pws_name"],
41 | geometry = contrib["geometry"],
42 | centroid_lat = contrib["centroid_lat"],
43 | centroid_lon = contrib["centroid_long"],
44 | centroid_quality = "CALCULATED FROM GEOMETRY",
45 | geometry_source_detail = contrib["geometry_source_detail"]
46 | )
47 |
48 | #%%
49 |
50 | helpers.load_to_postgis("contributed", df)
--------------------------------------------------------------------------------
/src/match/map_echo.py:
--------------------------------------------------------------------------------
1 | #%%
2 |
3 | import os
4 | import pandas as pd
5 | import geopandas as gpd
6 | import match.helpers as helpers
7 | from dotenv import load_dotenv
8 |
9 | load_dotenv()
10 |
11 | DATA_PATH = os.environ["WSB_STAGING_PATH"]
12 | EPSG = os.environ["WSB_EPSG"]
13 |
14 | #%%
15 |
16 | usecols=[
17 | "pwsid", "fac_lat", "fac_long", "fac_name",
18 | "fac_street", "fac_city", "fac_state", "fac_zip", "fac_county",
19 | "fac_collection_method", "fac_reference_point", "fac_accuracy_meters",
20 | "fac_indian_cntry_flg", "fac_percent_minority", "fac_pop_den", "ejscreen_flag_us"]
21 |
22 | echo_df = pd.read_csv(
23 | os.path.join(DATA_PATH, "echo.csv"),
24 | usecols=usecols, dtype="str")
25 |
26 | #%%
27 |
28 | pwsids = helpers.get_pwsids_of_interest()
29 |
30 | # Filter to only those in our SDWIS list and with lat/long
31 | # 47,951 SDWIS match to ECHO, 1494 don't match
32 | echo_df = echo_df.loc[
33 | echo_df["pwsid"].isin(pwsids) &
34 | echo_df["fac_lat"].notna()].copy()
35 |
36 | # If fac_state is NA, copy from pwsid
37 | mask = echo_df["fac_state"].isna()
38 | echo_df.loc[mask, "fac_state"] = echo_df.loc[mask, "pwsid"].str[0:2]
39 |
40 | # Convert to geopandas
41 | echo: gpd.GeoDataFrame = gpd.GeoDataFrame(
42 | echo_df,
43 | geometry=gpd.points_from_xy(echo_df["fac_long"], echo_df["fac_lat"]),
44 | crs="EPSG:4326")
45 |
46 | # Cleanse out "UNK"
47 | echo = echo.replace({"UNK": pd.NA})
48 |
49 | echo.head()
50 |
51 | #%%
52 |
53 | df = gpd.GeoDataFrame().assign(
54 | source_system_id = echo["pwsid"],
55 | source_system = "echo",
56 | contributor_id = "echo." + echo["pwsid"],
57 | master_key = echo["pwsid"],
58 | pwsid = echo["pwsid"],
59 | state = echo["fac_state"],
60 | name = echo["fac_name"],
61 | address_line_1 = echo["fac_street"],
62 | city = echo["fac_city"],
63 | county = echo["fac_county"],
64 | zip = echo["fac_zip"],
65 | primacy_agency_code = echo["pwsid"].str[0:2],
66 | centroid_lat = echo["fac_lat"],
67 | centroid_lon = echo["fac_long"],
68 | geometry = echo["geometry"],
69 | centroid_quality = echo["fac_collection_method"],
70 | )
71 |
72 | #%%
73 |
74 | helpers.load_to_postgis("echo", df)
--------------------------------------------------------------------------------
/src/match/map_frs.py:
--------------------------------------------------------------------------------
1 | #%%
2 |
3 | import os
4 | import pandas as pd
5 | import geopandas as gpd
6 | from dotenv import load_dotenv
7 |
8 | import match.helpers as helpers
9 |
10 | load_dotenv()
11 |
12 | pd.options.display.max_columns = None
13 |
14 | DATA_PATH = os.environ["WSB_STAGING_PATH"]
15 | EPSG = os.environ["WSB_EPSG"]
16 |
17 |
18 | #%%
19 |
20 | frs = gpd.read_file(os.path.join(DATA_PATH, "frs.gpkg"))
21 | print("Read FRS file.")
22 |
23 | pwsids = helpers.get_pwsids_of_interest()
24 | print("Retrieved PWSID's of interest.")
25 |
26 | # Bring in echo so that we can compare FRS and avoid duplication
27 | echo = pd.read_csv(DATA_PATH + "/echo.csv", dtype="str",
28 | usecols=["pwsid", "fac_name", "fac_lat", "fac_long"])
29 |
30 | print("Read ECHO (to avoid duplication)")
31 |
32 | # Filter to those in SDWIS
33 | # And only those with interest_type "WATER TREATMENT PLANT". Other interest types are already in Echo.
34 | frs = frs[
35 | frs["pwsid"].isin(pwsids) &
36 | (frs["interest_type"] == "WATER TREATMENT PLANT")]
37 |
38 | # We only need a subset of the columns
39 | keep_columns = [
40 | "registry_id", "pwsid", "state_code", "primary_name", "location_address",
41 | "city_name", "postal_code", "county_name",
42 | "latitude83", "longitude83", "geometry", "ref_point_desc",
43 | "collect_mth_desc"]
44 |
45 | frs = frs[keep_columns]
46 |
47 | # Exclude FRS that are identical to echo on name and lat/long.
48 | # Later, we may also want to allow them through if they have different addresses.
49 | frs = frs.loc[frs
50 | # Find matches to echo, then only include those from FRS that _didn't_ match
51 | .reset_index()
52 | .merge(echo,
53 | left_on=["pwsid", "primary_name", "latitude83", "longitude83"],
54 | right_on=["pwsid", "fac_name", "fac_lat", "fac_long"],
55 | how="outer", indicator=True)
56 | .query("_merge == 'left_only'")
57 | ["index"]
58 | ]
59 | print("Filtered FRS")
60 |
61 | # Furthermore, drop entries where all the columns of interest are duplicated
62 | frs = frs.drop_duplicates(subset=list(set(frs.columns) - {"registry_id"}), keep="first")
63 |
64 | print(f"{len(frs)} FRS entries remain after removing various duplicates")
65 |
66 | #%%
67 |
68 | df = gpd.GeoDataFrame().assign(
69 | source_system_id = frs["pwsid"],
70 | source_system = "frs",
71 | contributor_id = "frs." + frs["registry_id"] + "." + frs["pwsid"], # Apparently neither registry_id nor pwsid is fully unique, but together they are
72 | master_key = frs["pwsid"],
73 | pwsid = frs["pwsid"],
74 | state = frs["state_code"],
75 | name = frs["primary_name"],
76 | address_line_1 = frs["location_address"],
77 | city = frs["city_name"],
78 | zip = frs["postal_code"],
79 | county = frs["county_name"],
80 | primacy_agency_code = frs["pwsid"].str[0:2],
81 | centroid_lat = frs["latitude83"],
82 | centroid_lon = frs["longitude83"],
83 | geometry = frs["geometry"],
84 | centroid_quality = frs["collect_mth_desc"]
85 | )
86 |
87 | # Some light cleansing
88 | df["zip"] = df["zip"].str[0:5]
89 |
90 | # %%
91 | helpers.load_to_postgis("frs", df)
92 |
--------------------------------------------------------------------------------
/src/match/map_labeled.py:
--------------------------------------------------------------------------------
1 | #%%
2 |
3 | import os
4 | import pandas as pd
5 | import geopandas as gpd
6 | from dotenv import load_dotenv
7 |
8 | import match.helpers as helpers
9 |
10 | load_dotenv()
11 |
12 | pd.options.display.max_columns = None
13 |
14 | DATA_PATH = os.environ["WSB_STAGING_PATH"]
15 | EPSG = os.environ["WSB_EPSG"]
16 |
17 |
18 | #%%
19 |
20 | labeled = gpd.read_file(os.path.join(DATA_PATH, "wsb_labeled_clean.gpkg"))
21 | print("Read Labeled WSB file.")
22 |
23 | pwsids = helpers.get_pwsids_of_interest()
24 | print("Retrieved PWSID's of interest.")
25 |
26 | #%%
27 | # Filter to those in SDWIS
28 | labeled = labeled[labeled["pwsid"].isin(pwsids)]
29 |
30 | #%%
31 | # Null out a few bad lat/long
32 | mask = (
33 | (labeled["centroid_lat"] < -90) | (labeled["centroid_lat"] > 90) |
34 | (labeled["centroid_long"] < -180) | (labeled["centroid_long"] > 180))
35 |
36 | labeled.loc[mask, "centroid_lat"] = pd.NA
37 | labeled.loc[mask, "centroid_long"] = pd.NA
38 |
39 | print(f"Nulled out {mask.sum()} bad lat/long.")
40 |
41 | #%%
42 |
43 | df = gpd.GeoDataFrame().assign(
44 | source_system_id = labeled["pwsid"],
45 | source_system = "labeled",
46 | contributor_id = "labeled." + labeled["pwsid"],
47 | master_key = labeled["pwsid"],
48 | pwsid = labeled["pwsid"],
49 | state = labeled["state"],
50 | primacy_agency_code = labeled["pwsid"].str[0:2],
51 | name = labeled["pws_name"],
52 | # address_line_1 = labeled["location_address"],
53 | city = labeled["city"],
54 | # zip = labeled["postal_code"],
55 | county = labeled["county"],
56 | # Need to convert these to EPSG:4326 before we can save them
57 | centroid_lat = labeled["centroid_lat"],
58 | centroid_lon = labeled["centroid_long"],
59 | centroid_quality = "CALCULATED FROM GEOMETRY",
60 | geometry = labeled["geometry"],
61 | geometry_source_detail = labeled["geometry_source_detail"]
62 | )
63 |
64 | #%%
65 |
66 | print("Labeled record counts:")
67 | print(df
68 | .groupby("primacy_agency_code")
69 | .size()
70 | .sort_index())
71 |
72 | # %%
73 | helpers.load_to_postgis("labeled", df)
--------------------------------------------------------------------------------
/src/match/map_mhp.py:
--------------------------------------------------------------------------------
1 | #%%
2 |
3 | import os
4 | import pandas as pd
5 | import geopandas as gpd
6 | from dotenv import load_dotenv
7 | import match.helpers as helpers
8 |
9 | load_dotenv()
10 |
11 | DATA_PATH = os.environ["WSB_STAGING_PATH"]
12 | EPSG = os.environ["WSB_EPSG"]
13 |
14 | #%%
15 | mhp = gpd.read_file(os.path.join(DATA_PATH, "mhp_clean.gpkg"))
16 |
17 | # A little cleansing
18 | mhp = mhp.replace({"NOT AVAILABLE": pd.NA})
19 |
20 | #%%
21 |
22 | df = gpd.GeoDataFrame().assign(
23 | source_system_id = mhp["mhp_id"],
24 | source_system = "mhp",
25 | contributor_id = "mhp." + mhp["mhp_id"],
26 | master_key = "UNK-mhp." + mhp["mhp_id"],
27 | name = mhp["mhp_name"],
28 | address_line_1 = mhp["address"],
29 | city = mhp["city"],
30 | state = mhp["state"],
31 | zip = mhp["zipcode"],
32 | county = mhp["county"],
33 | centroid_lat = mhp["latitude"],
34 | centroid_lon = mhp["longitude"],
35 | geometry = mhp["geometry"],
36 | centroid_quality = mhp["val_method"],
37 | geometry_source_detail = mhp["source"]
38 | )
39 |
40 | #%%
41 |
42 | helpers.load_to_postgis("mhp", df)
--------------------------------------------------------------------------------
/src/match/map_sdwis.py:
--------------------------------------------------------------------------------
1 | #%%
2 |
3 | import os
4 | from shapely.geometry import Polygon
5 | import pandas as pd
6 | import geopandas as gpd
7 | from dotenv import load_dotenv
8 | import match.helpers as helpers
9 |
10 | load_dotenv()
11 |
12 | DATA_PATH = os.environ["WSB_STAGING_PATH"]
13 | EPSG = os.environ["WSB_EPSG"]
14 |
15 | #%% ##########################################
16 | # SDWIS
17 | ##############################################
18 |
19 | """
20 | # SDWIS Schema
21 |
22 | Table relationships:
23 | - water_system
24 | - water_system : water_system_facility is 1 : 0/N (~300 pwsid's missing, N=9.8 mean (wow!))
25 | - water_system : service_area is 1 : 0/N, but almost 1:N (~1k pwsid's missing, N=1.2 mean)
26 | - water_system : geographic_area is 1 : 0/1, but almost 1:1 (~1k pwsid's missing)
27 |
28 | Here are the useful columns we want from SDWIS and supplemental tables
29 | ws.pwsid - the PK
30 | ws.pws_name - name
31 | ws.pws_activity_code - active or not
32 | ws.pws_type_code - Filtered to "cws" only so maybe we don't need it
33 | ws.address_line1 - "The address applicable to the legal entity", whatever that means
34 | ws.address_line2
35 | ws.city_name
36 | ws.zip_code
37 | ws.primacy_agency_code
38 | wsf.facility_id - Optional. This denormalizes the data substantially.
39 | sa.service_area_type_code - for municipal vs mobile home park
40 | ga.city_served - this column is not populated in ws unfortunately
41 | ga.county_served - Maybe this will be helpful?
42 | """
43 |
44 | #########
45 | # 1) SDWIS water_systems - PWSID is unique
46 | keep_columns = ["pwsid", "pws_name", "primacy_agency_code",
47 | "address_line1", "address_line2", "city_name", "zip_code", "state_code",
48 | "population_served_count", "service_connections_count", "owner_type_code",
49 | "primacy_type", "is_wholesaler_ind", "primary_source_code"]
50 |
51 | sdwis = pd.read_csv(
52 | os.path.join(DATA_PATH, "sdwis_water_system.csv"),
53 | usecols=keep_columns,
54 | dtype="string")
55 |
56 | pwsids = helpers.get_pwsids_of_interest()
57 |
58 | sdwis = sdwis.loc[sdwis["pwsid"].isin(pwsids)]
59 |
60 | # If state_code is NA, copy from primacy_agency_code
61 | mask = sdwis["state_code"].isna()
62 | sdwis.loc[mask, "state_code"] = sdwis.loc[mask, "primacy_agency_code"]
63 |
64 |
65 | #########
66 | # Supplement with geographic_area
67 |
68 | # geographic_area - PWSID is unique, very nearly 1:1 with water_system
69 | # ~1k PWSID's appear in water_system but not geographic_area
70 | # We're trying to get city_served and county_served, but these columns aren't always populated
71 | sdwis_ga = pd.read_csv(
72 | os.path.join(DATA_PATH, "sdwis_geographic_area.csv"),
73 | usecols=["pwsid", "city_served", "county_served"],
74 | dtype="string")
75 |
76 | # Verify: pwsid is unique
77 | if not sdwis_ga["pwsid"].is_unique:
78 | raise Exception("Failed assumption: pwsid in geographic_area is assumed to be unique")
79 |
80 | sdwis = sdwis.merge(sdwis_ga, on="pwsid", how="left")
81 |
82 | #########
83 | # Supplement with service_area
84 |
85 | # This is N:1 with sdwis, which is annoying
86 | # (each pws has on average 1.2 service_area_type_codes)
87 |
88 | # service_area - PWSID + service_area_type_code is unique
89 | # ~1k PWSID's appear in water_system but not service_area
90 | sdwis_sa = pd.read_csv(
91 | os.path.join(DATA_PATH, "sdwis_service_area.csv"),
92 | usecols=["pwsid", "service_area_type_code"])
93 |
94 | # Filter to the pws's we're interested in
95 | sdwis_sa = sdwis_sa.loc[sdwis_sa["pwsid"].isin(sdwis["pwsid"])]
96 |
97 | # Supplement sdwis. I'll group it into a python list to avoid denormalizing.
98 | # Could also do a comma-delimited string. We'll see what seems more useful in practice.
99 | sdwis_sa = sdwis_sa.groupby("pwsid")["service_area_type_code"].apply(list)
100 |
101 | sdwis = sdwis.merge(sdwis_sa, on="pwsid", how="left")
102 |
103 | # Verification
104 | if not sdwis["pwsid"].is_unique:
105 | raise Exception("Expected sdwis pwsid to be unique")
106 |
107 | sdwis.head()
108 |
109 | #%%
110 |
111 | df = gpd.GeoDataFrame().assign(
112 | source_system_id = sdwis["pwsid"],
113 | source_system = "sdwis",
114 | contributor_id = "sdwis." + sdwis["pwsid"],
115 | master_key = sdwis["pwsid"],
116 | pwsid = sdwis["pwsid"],
117 | state = sdwis["state_code"],
118 | name = sdwis["pws_name"],
119 | address_line_1 = sdwis["address_line1"],
120 | address_line_2 = sdwis["address_line2"],
121 | city = sdwis["city_name"],
122 | zip = sdwis["zip_code"],
123 | county = sdwis["county_served"],
124 | city_served = sdwis["city_served"],
125 | geometry = Polygon([]), # Empty geometry.
126 | primacy_agency_code = sdwis["primacy_agency_code"],
127 | primacy_type = sdwis["primacy_type"],
128 | population_served_count = sdwis["population_served_count"],
129 | service_connections_count = sdwis["service_connections_count"].astype("float").astype("int"),
130 | owner_type_code = sdwis["owner_type_code"],
131 | service_area_type_code = sdwis["service_area_type_code"].astype("str"),
132 | is_wholesaler_ind = sdwis["is_wholesaler_ind"],
133 | primary_source_code = sdwis["primary_source_code"],
134 | )
135 |
136 | df = df.set_crs(epsg=EPSG, allow_override=True)
137 |
138 | #%%
139 | helpers.load_to_postgis("sdwis", df)
--------------------------------------------------------------------------------
/src/match/map_tiger.py:
--------------------------------------------------------------------------------
1 | #%%
2 |
3 | import os
4 | import pandas as pd
5 | import geopandas as gpd
6 | from dotenv import load_dotenv
7 | import match.helpers as helpers
8 |
9 | load_dotenv()
10 |
11 | DATA_PATH = os.environ["WSB_STAGING_PATH"]
12 | EPSG = os.environ["WSB_EPSG"]
13 |
14 | # Bring in the FIPS -> State Abbr crosswalk
15 | state_cw = (pd
16 | .read_csv("../crosswalks/state_fips_to_abbr.csv", dtype="str")
17 | .set_index("code"))
18 |
19 | #%%
20 |
21 | tiger = gpd.read_file(os.path.join(DATA_PATH, "tiger_places_clean.gpkg"))
22 |
23 | # Ensure strings with leading zeros
24 | tiger["statefp"] = tiger["statefp"].astype("int").astype("str").str.zfill(2)
25 |
26 | # Augment with state code
27 | tiger = (tiger
28 | .join(state_cw, on="statefp", how="left"))
29 |
30 | # TODO - It would be nice to also know county, zip code, etc.,
31 | # but it doesn't seem like we can get this from the data as it stands.
32 | # Might need a lookup table.
33 |
34 | #%%
35 |
36 | df = gpd.GeoDataFrame().assign(
37 | source_system_id = tiger["geoid"],
38 | source_system = "tiger",
39 | contributor_id = "tiger." + tiger["geoid"],
40 | master_key = "UNK-tiger." + tiger["geoid"],
41 | name = tiger["name"],
42 | state = tiger["state"],
43 | population_served_count = tiger["population"].astype(pd.Int64Dtype()),
44 | geometry = tiger["geometry"],
45 | centroid_lat = tiger["intptlat"],
46 | centroid_lon = tiger["intptlon"],
47 | centroid_quality = "CALCULATED FROM GEOMETRY",
48 | geometry_source_detail = "2020 Census"
49 | )
50 |
51 | #%%
52 |
53 | helpers.load_to_postgis("tiger", df)
--------------------------------------------------------------------------------
/src/match/map_ucmr.py:
--------------------------------------------------------------------------------
1 | #%%
2 |
3 | import os
4 | import geopandas as gpd
5 | import pandas as pd
6 | from dotenv import load_dotenv
7 | import match.helpers as helpers
8 |
9 | load_dotenv()
10 |
11 | DATA_PATH = os.environ["WSB_STAGING_PATH"]
12 | EPSG = os.environ["WSB_EPSG"]
13 |
14 | #%%
15 |
16 | ucmr = pd.read_csv(os.path.join(DATA_PATH, "ucmr.csv"))
17 |
18 | ucmr = gpd.GeoDataFrame(
19 | ucmr,
20 | geometry=gpd.points_from_xy(ucmr["centroid_long"], ucmr["centroid_lat"]),
21 | crs="EPSG:4326")
22 |
23 | print("Loaded UCMR")
24 |
25 | pwsids = helpers.get_pwsids_of_interest()
26 | ucmr = ucmr[ucmr["pwsid"].isin(pwsids)]
27 | print("Filtered to PWSID's of interest.")
28 |
29 | #%%
30 |
31 | df = gpd.GeoDataFrame().assign(
32 | source_system_id = ucmr["pwsid"],
33 | source_system = "ucmr",
34 | contributor_id = "ucmr." + ucmr["pwsid"],
35 | master_key = ucmr["pwsid"],
36 | pwsid = ucmr["pwsid"],
37 | zip = ucmr["zipcode"].str[0:5],
38 | centroid_lat = ucmr["centroid_lat"],
39 | centroid_lon = ucmr["centroid_long"],
40 | geometry = ucmr["geometry"],
41 | centroid_quality = "ZIP CODE CENTROID"
42 | )
43 |
44 | #%%
45 |
46 | helpers.load_to_postgis("ucmr", df)
--------------------------------------------------------------------------------
/src/match/match_scorer.py:
--------------------------------------------------------------------------------
1 | #%%
2 |
3 | import os
4 | from typing import List, Optional
5 | import numpy as np
6 | import pandas as pd
7 | import geopandas as gpd
8 | import sqlalchemy as sa
9 | from dotenv import load_dotenv
10 |
11 | load_dotenv()
12 |
13 | STAGING_PATH = os.environ["WSB_STAGING_PATH"]
14 | EPSG = os.environ["WSB_EPSG"]
15 | PROJ = os.environ["WSB_EPSG_AW"]
16 |
17 | # Connect to local PostGIS instance
18 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
19 |
20 | class MatchScorer:
21 |
22 | def __init__(self):
23 | self.boundary_df = (self.get_data("tiger", ["contributor_id", "geometry"])
24 | .set_index("contributor_id"))
25 |
26 | self.labeled_df = self.get_data("labeled", ["pwsid", "master_key", "geometry"])
27 |
28 | def score_tiger_matches(self, matches: pd.DataFrame, proximity_buffer: int = 1000) -> pd.DataFrame:
29 |
30 | """
31 | Given a set of matches to boundary data, compare it to known geometries
32 | (labeled data) to evaluate whether each match is good or bad. This can
33 | be used to evaluate the effectiveness of our matching.
34 |
35 | The match DF should have columns: master_key, candidate_contributor_id
36 | """
37 |
38 | # Extract a series of "known geometries" from the labeled geometry data
39 | known_geometries = gpd.GeoSeries(
40 | self.labeled_df[["pwsid", "geometry"]]
41 | .merge(matches[["master_key", "candidate_contributor_id"]], left_on="pwsid", right_on="master_key")
42 | .set_index(["pwsid", "candidate_contributor_id"])
43 | ["geometry"])
44 |
45 | # Extract a series of "potential geometries" from the matched boundary data
46 | candidate_matches = gpd.GeoDataFrame(matches
47 | .join(self.boundary_df["geometry"], on="candidate_contributor_id")
48 | .rename(columns={"master_key": "pwsid"})
49 | .set_index(["pwsid", "candidate_contributor_id"])
50 | [["geometry"]])
51 |
52 | # Filter to only the PWS's that appear in both series
53 | # 7,423 match
54 | known_geometries = (known_geometries
55 | .loc[known_geometries.index.isin(candidate_matches.index)]
56 | .sort_index())
57 |
58 | candidate_matches = (candidate_matches
59 | .loc[candidate_matches.index.isin(known_geometries.index)]
60 | .sort_index())
61 |
62 | print("Retrieved and aligned data.")
63 |
64 | # Switch to a projected CRS
65 | known_geometries = known_geometries.to_crs(PROJ)
66 | candidate_matches = candidate_matches.to_crs(PROJ)
67 |
68 | print("Converted to a projected CRS.")
69 |
70 | distances = known_geometries.distance(candidate_matches, align=True)
71 | print("Calculated distances.")
72 |
73 | # A few empty labeled geometries cause NA distances. Filter only non-NA
74 | distances = distances[distances.notna()]
75 | distances.name = "distance"
76 |
77 | # re-join to the match table
78 | candidate_matches = candidate_matches.join(distances, on=["pwsid", "candidate_contributor_id"], how="inner")
79 |
80 | # Assign a score - 1 if a good match, 0 if not a good match
81 | candidate_matches["score"] = candidate_matches["distance"] <= proximity_buffer
82 |
83 | print("Assigned scores.")
84 |
85 | return candidate_matches
86 |
87 | def get_data(self, system: str, columns: List[str] = ["*"]) -> pd.DataFrame:
88 | print(f"Pulling {system} data from database...", end="")
89 |
90 | df = gpd.GeoDataFrame.from_postgis(f"""
91 | SELECT {", ".join(columns)}
92 | FROM pws_contributors
93 | WHERE source_system = '{system}';""",
94 | conn, geom_col="geometry")
95 |
96 | print("done.")
97 |
98 | return df
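99 |
100 | # Illustrative usage (a sketch; mirrors the call in 4-rank_boundary_matches.py;
101 | # `matches` is a hypothetical frame with master_key and candidate_contributor_id):
102 | #
103 | # scorer = MatchScorer()
104 | # scored = scorer.score_tiger_matches(matches[["master_key", "candidate_contributor_id"]])
105 | # score = scored["score"].sum() * 100 / len(scored)
106 | # print(f"Boundary match score: {score:.2f}")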
--------------------------------------------------------------------------------
/src/model/02_linear.R:
--------------------------------------------------------------------------------
1 | # linear model ------------------------------------------------------------
2 |
3 | library(tidyverse)
4 | library(tidymodels)
5 | library(sf)
6 | library(fs)
7 |
8 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
9 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
10 |
11 | # read dataset and log transform the response - only for linear model
12 | d <- read_csv(path(staging_path, "model_input_clean.csv")) %>%
13 | mutate(radius = log10(radius),
14 | # multiply correlated predictors
15 | density = population_served_count * service_connections_count)
16 |
17 | # Stash lat/long
18 | lat_long <- d %>%
19 | select(pwsid, centroid_lon, centroid_lat) %>%
20 | st_as_sf(coords = c("centroid_lon", "centroid_lat"), crs = epsg) %>%
21 | suppressMessages()
22 |
23 | cat("\n\nRead `model_input_clean.csv` from preprocess script.\n")
24 |
25 | # unlabeled data (du) and labeled data (dl)
26 | du <- d %>% filter(is.na(radius))
27 | dl <- d %>% filter(!is.na(radius))
28 |
29 | # split labeled data (dl) into train and test with stratified random sampling
30 | # in each of the radius quartiles to account for the lognormal distribution
31 | # of the response variable (radius) and avoid overfitting to small radius obs
32 | set.seed(55)
33 | dl_split <- initial_split(dl, prop = 0.8, strata = radius)
34 | train <- training(dl_split)
35 | test <- testing(dl_split)
36 |
37 | cat("Split data into train and test sets.\n")
38 |
39 | # lm recipe
40 | lm_recipe <-
41 | # specify the model - interaction terms come later
42 | recipe(
43 | radius ~
44 | service_connections_count +
45 | # use the cleaned owner type code from preprocess.R, which converts
46 | # 2 "N" owner type codes to "M" so that models can evaluate
47 | owner_type_code_clean +
48 | satc +
49 | is_wholesaler_ind,
50 | data = train
51 | ) %>%
52 | # convert predictors to log10
53 | step_log(service_connections_count, base = 10) %>%
54 | # encode categorical variables
55 | step_dummy(all_nominal_predictors()) %>%
56 | # specify interaction effects
57 | step_interact(~service_connections_count:starts_with("owner_type_code")) %>%
58 | step_interact(~service_connections_count:starts_with("satc")) %>%
59 | step_interact(~service_connections_count:starts_with("is_wholesaler_ind"))
60 |
61 | # specify model and engine for linear model and rf
62 | lm_mod <- linear_reg() %>% set_engine("lm")
63 |
64 | # lm workflow
65 | lm_wflow <-
66 | workflow() %>%
67 | add_model(lm_mod) %>%
68 | add_recipe(lm_recipe)
69 |
70 | # fit the linear model on the training set
71 | lm_fit <- fit(lm_wflow, train)
72 | cat("Fit model on training set.\n")
73 |
74 | # predict on the test set and bind mean predictions and CIs
75 | # lm_test_res <- test %>%
76 | # select(radius) %>%
77 | # bind_cols(predict(lm_fit, test)) %>%
78 | # bind_cols(predict(lm_fit, test, type = "conf_int"))
79 |
80 | # plot residuals
81 | # lm_test_res %>%
82 | # ggplot(aes(radius, .pred)) +
83 | # geom_point(alpha = 0.4) +
84 | # geom_abline(lty = 2, color = "red") +
85 | # labs(y = "Predicted radius (log10)", x = "Radius (log10)") +
86 | # # scale and size the x- and y-axis uniformly
87 | # coord_obs_pred()
88 |
89 | # RMSE
90 | # lm_metrics <- metric_set(rmse, rsq, mae)
91 | # lm_metrics(lm_test_res, truth = radius, estimate = .pred)
92 |
93 |
94 | # apply modeled radii to centroids for all data and write -----------------
95 |
96 | # fit the model on all data, apply the spatial buffer, and write
97 | t3m <- d %>%
98 | select(pwsid, radius, centroid_lat, centroid_lon, centroid_quality, geometry_source_detail) %>%
99 | bind_cols(predict(lm_fit, d)) %>%
100 | bind_cols(predict(lm_fit, d, type = "conf_int", level = 0.95)) %>%
101 | # exponentiate results back to median (unbiased), and 5/95 CIs
102 | mutate(across(c("radius", starts_with(".")), ~10^(.x))) %>%
103 | # add matched output lat/lng centroids and make spatial
104 | left_join(lat_long, by = "pwsid") %>%
105 | st_as_sf() %>%
106 | # convert to projected metric CRS for accurate, efficient buffer.
107 | # The project CRS (4326) is inappropriate because units are degrees.
108 | st_transform(3310)
109 | cat("Fit model on all data and added 5/95 CIs.\n")
110 |
111 | # create buffers for median, CI lower, and CI upper (5/95) predictions
112 | # (in meters) and then transform back into the project CRS
113 | t3m_med <- st_buffer(t3m, t3m$.pred ) %>% st_transform(epsg)
114 | t3m_cil <- st_buffer(t3m, t3m$.pred_lower) %>% st_transform(epsg)
115 | t3m_ciu <- st_buffer(t3m, t3m$.pred_upper) %>% st_transform(epsg)
116 | cat("Created median and 5/95 CI buffers.\n")
117 |
118 | # paths to write modeled data
119 | path_t3m_med <- path(staging_path, "tier3_median.gpkg")
120 | path_t3m_cil <- path(staging_path, "tier3_ci_lower_05.gpkg")
121 | path_t3m_ciu <- path(staging_path, "tier3_ci_upper_95.gpkg")
122 |
123 | # write and delete layer if it already exists
124 | st_write(t3m_med, path_t3m_med, delete_dsn = TRUE, quiet = TRUE)
125 | st_write(t3m_cil, path_t3m_cil, delete_dsn = TRUE, quiet = TRUE)
126 | st_write(t3m_ciu, path_t3m_ciu, delete_dsn = TRUE, quiet = TRUE)
127 | cat("Wrote Tier 3 model putput to `WSB_STAGING_PATH`.\n")
128 |
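129 | # Worked example of the back-transform above (a sketch): a linear-model
130 | # prediction of .pred = 3 on the log10 scale becomes 10^3 = 1000, i.e. a
131 | # 1000 m buffer radius around the system centroid in the metric CRS (3310).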
--------------------------------------------------------------------------------
/src/model/README.md:
--------------------------------------------------------------------------------
1 | # Tier 3 model
2 |
3 | _Last updated 2022-03-28_
4 |
5 | This subdirectory depends on the PostGIS database, in particular the records created by `5-select_modeled_centroids.py`. It contains two scripts: one preprocesses data for the Tier 3 model, and the other generates predictions and writes them to the staging path for `src/combine_tiers.py`, which compiles the final TEMM spatial layer.
6 |
7 | In order, run `01_preprocess.R` followed by `02_linear.R` (see the sketch at the end of this README).
8 |
9 | For preprocessing and modeling documentation, see: `src/analysis/sandbox/model_explore/model_march.html`.
10 |
11 | The code herein was originally prototyped in the `model_explore` sandbox (`src/analysis/sandbox/model_explore`), which contains additional models (random forest, xgboost) and superseded code (archived preprocess and linear model scripts).
12 |
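13 | To run both scripts in order from the repository root (a minimal sketch, assuming the `WSB_*` environment variables and the `renv` setup are already configured):
14 |
15 | ```r
16 | source("src/model/01_preprocess.R")
17 | source("src/model/02_linear.R")
18 | ```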
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_ar.R:
--------------------------------------------------------------------------------
1 | # transform AR water system data to standard model -------------------
2 |
3 | cat("Preparing to transform AR polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Lookup for AR pwsids where the name does not automatically match the shapefile
19 | pwsid_supp <- read_csv("crosswalks/ar_pwsid_lookup.csv")
20 |
21 | # Read layer for AR water service boundaries, clean, transform CRS
22 | ar_wsb <- st_read(path(data_path, "boundary/ar/PUBLIC_WATER_SYSTEMS.shp")) %>%
23 | # clean whitespace
24 | f_clean_whitespace_nas() %>%
25 | # transform to area weighted CRS
26 | st_transform(epsg_aw) %>%
27 | # correct invalid geometries
28 | st_make_valid() %>%
29 | janitor::clean_names()
30 |
31 | cat("Read AR boundary layer; cleaned whitespace; corrected geometries.\n ")
32 |
33 | # Match water system names to sdwis to get pwsid
34 | # Note that comparing to this csv requires having downloaded and cleaned the SDWIS data
35 | # TODO: identify best strategy for this
36 |
37 | # Get active cws in AR
38 | ar_sdwis <- read_csv(path(staging_path, "sdwis_water_system.csv")) %>%
39 | filter(primacy_agency_code == "AR",
40 | pws_activity_code == "A")
41 |
42 | # Select names and object ids from spatial dataset
43 | ar_names <- ar_wsb %>% select(objectid, pws_name) %>%
44 | st_drop_geometry()
45 |
46 | # Join spatial dataset system names with sdwis
47 | ar_pwsids <- ar_names %>% left_join(ar_sdwis, by = c("pws_name")) %>%
48 | select(objectid, pws_name, pwsid)
49 |
50 | # Pull out the number of missing ids
51 | # From this list, pwsids were manually assigned to create the lookup
52 | na_pwsids <- ar_pwsids %>% filter(is.na(pwsid)) %>%
53 | left_join(pwsid_supp, by = c("pws_name")) %>%
54 | select(objectid, pws_name, pwsid.y) %>%
55 | rename(pwsid = pwsid.y)
56 |
57 | # Concatenate pwsid dataframes
58 | ar_pwsids <- ar_pwsids %>%
59 | rbind(na_pwsids) %>%
60 | distinct() %>%
61 | filter(!is.na(pwsid))
62 |
63 | # Rejoin pwsid with shapefiles
64 | ar_wsb <- ar_wsb %>%
65 | left_join(ar_pwsids, by = c("objectid", "pws_name")) %>%
66 | # drop 12 geometries with no matching pwsid
67 | filter(!is.na(pwsid))
68 |
69 | # Compute centroids, convex hulls, and radius assuming circular
70 | ar_wsb <- ar_wsb %>%
71 | bind_rows() %>%
72 | mutate(
73 | state = "AR",
74 | # importantly, area calculations occur in area weighted epsg
75 | st_areashape = st_area(geometry),
76 | convex_hull = st_geometry(st_convex_hull(geometry)),
77 | area_hull = st_area(convex_hull),
78 | radius = sqrt(area_hull/pi)
79 | ) %>%
80 | # transform back to standard epsg
81 | st_transform(epsg) %>%
82 | # compute centroids
83 | mutate(
84 | centroid = st_geometry(st_centroid(geometry)),
85 | centroid_long = st_coordinates(centroid)[, 1],
86 | centroid_lat = st_coordinates(centroid)[, 2],
87 | ) %>%
88 | # select columns and rename for staging
89 | select(
90 | # data source columns
91 | pwsid,
92 | pws_name,
93 | state,
94 | # county,
95 | # city,
96 | # owner,
97 | # geospatial columns
98 | st_areashape,
99 | centroid_long,
100 | centroid_lat,
101 | radius,
102 | geometry
103 | )
104 | cat("Computed area, centroids, and radii from convex hulls.\n")
105 | cat("Combined into one layer; added geospatial columns.\n")
106 |
107 | # delete layer if it exists, then write to geopackage
108 | path_out <- path(staging_path, "wsb_labeled_ar.gpkg")
109 | if(file_exists(path_out)) file_delete(path_out)
110 |
111 | st_write(ar_wsb, path_out)
112 | cat("Wrote clean, labeled data to file.\n\n\n")
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_az.R:
--------------------------------------------------------------------------------
1 | # transform AZ water system data to standard model -------------------
2 |
3 | cat("Preparing to transform AZ polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for AZ water service boundaries, clean, transform CRS
19 | az_wsb <- st_read(path(data_path, "boundary/az/az.geojson")) %>%
20 | # clean whitespace
21 | f_clean_whitespace_nas() %>%
22 | # transform to area weighted CRS
23 | st_transform(epsg_aw) %>%
24 | # correct invalid geometries
25 | st_make_valid()
26 |
27 | cat("Read AZ boundary layer; cleaned whitespace; corrected geometries.\n")
28 |
29 | # Compute centroids, convex hulls, and radius assuming circular
30 | az_wsb <- az_wsb %>%
31 | bind_rows() %>%
32 | mutate(
33 | state = "AZ",
34 | # importantly, area calculations occur in area weighted epsg
35 | st_areashape = st_area(geometry),
36 | convex_hull = st_geometry(st_convex_hull(geometry)),
37 | area_hull = st_area(convex_hull),
38 | radius = sqrt(area_hull/pi)
39 | ) %>%
40 | # transform back to standard epsg
41 | st_transform(epsg) %>%
42 | # compute centroids
43 | mutate(
44 | centroid = st_geometry(st_centroid(geometry)),
45 | centroid_long = st_coordinates(centroid)[, 1],
46 | centroid_lat = st_coordinates(centroid)[, 2],
47 | ) %>%
48 | # select columns and rename for staging
49 | select(
50 | # data source columns
51 | pwsid = ADEQ_ID,
52 | pws_name = CWS_NAME,
53 | state,
54 | county = COUNTY,
55 | city = CITY_SRVD,
56 | owner = OWNER_NAME,
57 | # geospatial columns
58 | st_areashape,
59 | centroid_long,
60 | centroid_lat,
61 | radius,
62 | geometry
63 | )
64 | cat("Computed area, centroids, and radii from convex hulls.\n")
65 | cat("Combined into one layer; added geospatial columns.\n")
66 |
67 | # delete layer if it exists, then write to geopackage
68 | path_out <- path(staging_path, "wsb_labeled_az.gpkg")
69 | if(file_exists(path_out)) file_delete(path_out)
70 |
71 | st_write(az_wsb, path_out)
72 | cat("Wrote clean, labeled data to file.\n\n\n")
73 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_ca.R:
--------------------------------------------------------------------------------
1 | # transform CA water system data to standard model -------------------
2 |
3 | cat("Preparing to transform CA polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for CA water service boundaries, clean, transform CRS
19 | ca_wsb <- st_read(
20 | dsn = path(data_path, "boundary/ca/SABL_Public_220207/",
21 | "SABL_Public_220207.shp")) %>%
22 | # clean whitespace
23 | f_clean_whitespace_nas() %>%
24 | # transform to area weighted CRS
25 | st_transform(epsg_aw) %>%
26 | # correct invalid geometries
27 | st_make_valid()
28 |
29 | cat("Read CA boundary layer; cleaned whitespace; corrected geometries.\n ")
30 |
31 | ca_wsb <- ca_wsb %>%
32 | bind_rows() %>%
33 | # compute area, convex hulls, and radius assuming circular
34 | mutate(
35 | state = "CA",
36 | # importantly, area calculations occur in area weighted epsg
37 | st_areashape = st_area(geometry),
38 | convex_hull = st_geometry(st_convex_hull(geometry)),
39 | area_hull = st_area(convex_hull),
40 | radius = sqrt(area_hull/pi)
41 | ) %>%
42 | # transform back to standard epsg
43 | st_transform(epsg) %>%
44 | # compute centroid
45 | mutate (
46 | centroid = st_geometry(st_centroid(geometry)),
47 | centroid_long = st_coordinates(centroid)[, 1],
48 | centroid_lat = st_coordinates(centroid)[, 2]
49 | ) %>%
50 | # select columns and rename for staging
51 | select(
52 | # data source columns
53 | pwsid = WATER_SYST,
54 | pws_name = WATER_SY_1,
55 | state,
56 | county = COUNTY,
57 | # city,
58 | # owner,
59 | # geospatial columns
60 | st_areashape,
61 | centroid_long,
62 | centroid_lat,
63 | radius,
64 | geometry
65 | )
66 | cat("Computed area, centroids, and radii from convex hulls.\n")
67 | cat("Combined into one layer; added geospatial columns.\n")
68 |
69 | # delete layer if it exists, then write to geopackage
70 | path_out <- path(staging_path, "wsb_labeled_ca.gpkg")
71 | if(file_exists(path_out)) file_delete(path_out)
72 |
73 | st_write(ca_wsb, path_out)
74 | cat("Wrote clean, labeled data to file.\n\n\n")
75 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_ct.R:
--------------------------------------------------------------------------------
1 | # transform CT water system data to standard model -------------------
2 |
3 | cat("Preparing to transform CT polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for CT water service boundaries, clean, transform CRS
19 | ct_wsb <- st_read(
20 | path(data_path, "boundary/ct/Buffered_Community_PWS_Service_Areas.shp")) %>%
21 | # clean whitespace
22 | f_clean_whitespace_nas() %>%
23 | # transform to area weighted CRS
24 | st_transform(epsg_aw) %>%
25 | # correct invalid geometries
26 | st_make_valid()
27 |
28 | cat("Read CT boundary layer; cleaned whitespace; corrected geometries.\n ")
29 |
30 | # Compute centroids, convex hulls, and radius assuming circular
31 | ct_wsb <- ct_wsb %>%
32 | bind_rows() %>%
33 | mutate(
34 | state = "CT",
35 | # importantly, area calculations occur in area weighted epsg
36 | st_areashape = st_area(geometry),
37 | convex_hull = st_geometry(st_convex_hull(geometry)),
38 | area_hull = st_area(convex_hull),
39 | radius = sqrt(area_hull/pi)
40 | ) %>%
41 | # transform back to standard epsg
42 | st_transform(epsg) %>%
43 | # compute centroids
44 | mutate(
45 | centroid = st_geometry(st_centroid(geometry)),
46 | centroid_long = st_coordinates(centroid)[, 1],
47 | centroid_lat = st_coordinates(centroid)[, 2],
48 | ) %>%
49 | # select columns and rename for staging
50 | select(
51 | # data source columns
52 | pwsid = pwsid,
53 | pws_name = pws_name,
54 | state,
55 | # county,
56 | # city,
57 | # owner,
58 | # geospatial columns
59 | st_areashape,
60 | centroid_long,
61 | centroid_lat,
62 | radius,
63 | geometry
64 | )
65 | cat("Computed area, centroids, and radii from convex hulls.\n")
66 | cat("Combined into one layer; added geospatial columns.\n")
67 |
68 | # delete layer if it exists, then write to geopackage
69 | path_out <- path(staging_path, "wsb_labeled_ct.gpkg")
70 | if(file_exists(path_out)) file_delete(path_out)
71 |
72 | st_write(ct_wsb, path_out)
73 | cat("Wrote clean, labeled data to file.\n\n\n")
74 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_il.R:
--------------------------------------------------------------------------------
1 | # transform IL water system data to standard model -------------------
2 |
3 | cat("Preparing to transform IL polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for IL water service boundaries, clean, transform CRS
19 | il_wsb <- st_read(
20 | dsn = path(data_path, "boundary/il/Illinois_Municipal_Water_use_2012/Municipal_Water_Use_Statewide.gdb"),
21 | layer = "Municipal_Use_2012") %>%
22 | rename(geometry = "Shape") %>%
23 | # clean whitespace
24 | f_clean_whitespace_nas() %>%
25 | # transform to area weighted CRS
26 | st_transform(epsg_aw) %>%
27 | # correct invalid geometries
28 | st_make_valid() %>%
29 | janitor::clean_names()
30 |
31 | cat("Read IL boundary layer; cleaned whitespace; corrected geometries.\n ")
32 |
33 | # Compute centroids, convex hulls, and radius assuming circular
34 | # Combine data and merge geometries for rows with duplicate pwsids
35 | il_wsb <- il_wsb %>%
36 | mutate(
37 | state = "IL",
38 | # 161 cities have no pwsid listed
39 | # Of these, 123 have a seller pwsid listed
40 | pwsid = if_else(is.na(fac_id), seller_fac_id, fac_id),
41 | # Facility id blank = seller system, with name in "buys from"
42 | # Spot checking on name_1 for cases where fac_id is listed looks consistent
43 | pws_name = if_else(is.na(fac_id), buys_from, name_1),
44 | # Preliminary geometry calculations
45 | # Calculate area sums and convex hulls
46 | st_areashape = st_area(geometry),
47 | convex_hull = st_geometry(st_convex_hull(geometry)),
48 | area_hull = st_area(convex_hull),
49 | ) %>%
50 | group_by(pwsid) %>%
51 | # mutate these new columns, knowing full well that duplicate rows
52 | # will be created, but that they will be dropped in the next step
53 | mutate(
54 | # combine all fragmented geometries
55 | geometry = st_union(geometry),
56 | # new area is the sum of the area of all polygons
57 | st_areashape = sum(st_areashape),
58 | area_hull = sum(area_hull),
59 | # new radius is calculated from the new area
60 | radius = sqrt(area_hull/pi),
61 | # combine data into list-formatted strings for character columns
62 | across(where(is.character), ~toString(unique(.)))
63 | ) %>%
64 | # only take the first result from each group
65 | slice(1) %>%
66 | ungroup() %>%
67 | # convert back to the project standard epsg
68 | st_transform(epsg) %>%
69 | # compute new centroids and note that when multipolygons are separated
70 | # by space, these are suspect and should not be used. Importantly, this
71 | # calculation occurs in the EPSG consistent with other staged data!
72 | mutate(
73 | centroid = st_geometry(st_centroid(geometry)),
74 | centroid_long = st_coordinates(centroid)[, 1],
75 | centroid_lat = st_coordinates(centroid)[, 2]
76 | ) %>%
77 | # select columns and rename for staging
78 | select(
79 | # data source columns
80 | pwsid,
81 | pws_name,
82 | state,
83 | # county,
84 | city = name_1,
85 | # owner,
86 | # geospatial columns
87 | st_areashape,
88 | centroid_long,
89 | centroid_lat,
90 | radius,
91 | geometry
92 | )
93 | cat("Computed area, centroids, and radii from convex hulls.\n")
94 | cat("Combined into one layer; added geospatial columns.\n")
95 |
96 | # verify that there is only one pwsid per geometry
97 | n <- il_wsb %>%
98 | count(pwsid) %>%
99 | filter(n > 1) %>%
100 | nrow()
101 | cat(n, "duplicate pwsids in labeled data following fix.\n")
102 |
103 | # delete layer if it exists, then write to geopackage
104 | path_out <- path(staging_path, "wsb_labeled_il.gpkg")
105 | if(file_exists(path_out)) file_delete(path_out)
106 |
107 | st_write(il_wsb, path_out)
108 | cat("Wrote clean, labeled data to file.\n\n\n")
109 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_ks.R:
--------------------------------------------------------------------------------
1 | # transform KS water system data to standard model -------------------
2 |
3 | cat("Preparing to transform KS polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for KS water service boundaries, clean, transform CRS
19 | ks_wsb <- st_read(path(data_path, "boundary/ks/PWS_bnd_2021_0430.shp")) %>%
20 | # clean whitespace
21 | f_clean_whitespace_nas() %>%
22 | # transform to area weighted CRS
23 | st_transform(epsg_aw) %>%
24 | # correct invalid geometries
25 | st_make_valid()
26 |
27 | cat("Read KS boundary layer; cleaned whitespace; corrected geometries.\n ")
28 |
29 | # Compute centroids, convex hulls, and radius assuming circular
30 | ks_wsb <- ks_wsb %>%
31 | bind_rows() %>%
32 | mutate(
33 | state = "KS",
34 | # importantly, area calculations occur in area weighted epsg
35 | st_areashape = st_area(geometry),
36 | convex_hull = st_geometry(st_convex_hull(geometry)),
37 | area_hull = st_area(convex_hull),
38 | radius = sqrt(area_hull/pi)
39 | ) %>%
40 | # transform back to standard epsg
41 | st_transform(epsg) %>%
42 | # compute centroids
43 | mutate(
44 | centroid = st_geometry(st_centroid(geometry)),
45 | centroid_long = st_coordinates(centroid)[, 1],
46 | centroid_lat = st_coordinates(centroid)[, 2],
47 | ) %>%
48 | # select columns and rename for staging
49 | select(
50 | # data source columns
51 | pwsid = FED_ID,
52 | pws_name = NAMEWCPSTA,
53 | state,
54 | # county,
55 | # city,
56 | # owner,
57 | # geospatial columns
58 | st_areashape,
59 | centroid_long,
60 | centroid_lat,
61 | radius,
62 | geometry
63 | )
64 | cat("Computed area, centroids, and radii from convex hulls.\n")
65 | cat("Combined into one layer; added geospatial columns.\n")
66 |
67 | # delete layer if it exists, then write to geopackage
68 | path_out <- path(staging_path, "wsb_labeled_ks.gpkg")
69 | if(file_exists(path_out)) file_delete(path_out)
70 |
71 | st_write(ks_wsb, path_out)
72 | cat("Wrote clean, labeled data to file.\n\n\n")
73 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_mo.R:
--------------------------------------------------------------------------------
1 | # transform MO water system data to standard model -------------------
2 |
3 | cat("Preparing to transform MO polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for MO water service boundaries, clean, transform CRS
19 | mo_wsb <- st_read(dsn = path(data_path, "boundary/mo/mo.geojson")) %>%
20 | # clean whitespace
21 | f_clean_whitespace_nas() %>%
22 | # drop multiple systems in one boundary, for now
23 | filter(str_detect(IPWS, "^MO\\d{5}")) %>%
24 | # transform to area weighted CRS
25 | st_transform(epsg_aw) %>%
26 | # correct invalid geometries
27 | st_make_valid()
28 |
29 | cat("Read MO boundary layer; cleaned whitespace; corrected geometries.\n ")
30 |
31 | # Compute centroids, convex hulls, and radius assuming circular
32 | mo_wsb <- mo_wsb %>%
33 | bind_rows() %>%
34 | mutate(
35 | state = "MO",
36 | # importantly, area calculations occur in area weighted epsg
37 | st_areashape = st_area(geometry),
38 | convex_hull = st_geometry(st_convex_hull(geometry)),
39 | area_hull = st_area(convex_hull),
40 | radius = sqrt(area_hull/pi)
41 | ) %>%
42 | # transform back to standard epsg
43 | st_transform(epsg) %>%
44 | # compute centroid
45 |   mutate(
46 | centroid = st_geometry(st_centroid(geometry)),
47 | centroid_long = st_coordinates(centroid)[, 1],
48 | centroid_lat = st_coordinates(centroid)[, 2]
49 | ) %>%
50 | # select columns and rename for staging
51 | select(
52 | # data source columns
53 | pwsid = IPWS,
54 | pws_name = PWSSNAME,
55 | state,
56 | county = COUNTY,
57 | # city,
58 | # owner,
59 | # geospatial columns
60 | st_areashape,
61 | centroid_long,
62 | centroid_lat,
63 | radius,
64 | geometry
65 | )
66 | cat("Computed area, centroids, and radii from convex hulls.\n")
67 | cat("Combined into one layer; added geospatial columns.\n")
68 |
69 |
70 | # delete layer if it exists, then write to geopackage
71 | path_out <- path(staging_path, "wsb_labeled_mo.gpkg")
72 | if(file_exists(path_out)) file_delete(path_out)
73 |
74 | st_write(mo_wsb, path_out)
75 | cat("Wrote clean, labeled data to file.\n\n\n")
76 |
--------------------------------------------------------------------------------
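
The MO filter above keeps only rows whose IPWS value starts with a well-formed
pwsid; anything else (including NA) drops out. A quick illustration on
hypothetical IPWS values:

library(stringr)

ids <- c("MO1010001", "Multiple systems", NA)
str_detect(ids, "^MO\\d{5}")
#> [1]  TRUE FALSE    NA    (filter() keeps only the TRUE row)
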
/src/transformers/states/transform_wsb_nc.R:
--------------------------------------------------------------------------------
1 | # transform NC water system data to standard model -------------------
2 |
3 | cat("Preparing to transform NC polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for NC water service boundaries, clean, transform CRS
19 | nc_wsb <- st_read(dsn = path(data_path, "boundary/nc/nc.geojson")) %>%
20 | # clean whitespace
21 | f_clean_whitespace_nas() %>%
22 | # transform to area weighted CRS
23 | st_transform(epsg_aw) %>%
24 | # correct invalid geometries
25 | st_make_valid()
26 |
27 | cat("Read NC boundary layer; cleaned whitespace; corrected geometries.\n")
28 |
29 | # Compute centroids, convex hulls, and radius assuming circular
30 | nc_wsb <- nc_wsb %>%
31 | bind_rows() %>%
32 | mutate(
33 | state = "NC",
34 | wasyid = paste0("NC", wasyid),
35 | # importantly, area calculations occur in area weighted epsg
36 | st_areashape = st_area(geometry),
37 | convex_hull = st_geometry(st_convex_hull(geometry)),
38 | area_hull = st_area(convex_hull),
39 | radius = sqrt(area_hull/pi)
40 | ) %>%
41 | # transform back to standard epsg
42 | st_transform(epsg) %>%
43 | # compute centroids
44 | mutate(
45 | centroid = st_geometry(st_centroid(geometry)),
46 | centroid_long = st_coordinates(centroid)[, 1],
47 |     centroid_lat = st_coordinates(centroid)[, 2]
48 | ) %>%
49 | # select columns and rename for staging
50 | select(
51 | # data source columns
52 | pwsid = wasyid,
53 | pws_name = wasyname,
54 | state,
55 | county = wapcs,
56 | # city,
57 | # owner,
58 | # geospatial columns
59 | st_areashape,
60 | centroid_long,
61 | centroid_lat,
62 | radius,
63 | geometry
64 | )
65 | cat("Computed area, centroids, and radii from convex hulls.\n")
66 | cat("Combined into one layer; added geospatial columns.\n")
67 |
68 |
69 | # delete layer if it exists, then write to geopackage
70 | path_out <- path(staging_path, "wsb_labeled_nc.gpkg")
71 | if(file_exists(path_out)) file_delete(path_out)
72 |
73 | st_write(nc_wsb, path_out)
74 | cat("Wrote clean, labeled data to file.\n\n\n")
75 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_nj.R:
--------------------------------------------------------------------------------
1 | # transform NJ water system data to standard model -------------------
2 |
3 | cat("Preparing to transform NJ polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for NJ water service boundaries, clean, transform CRS
19 | nj_wsb <- st_read(dsn = path(data_path, "boundary/nj/nj.geojson")) %>%
20 | # clean whitespace
21 | f_clean_whitespace_nas() %>%
22 | # transform to area weighted CRS
23 | st_transform(epsg_aw) %>%
24 | # correct invalid geometries
25 | st_make_valid()
26 |
27 | cat("Read NJ boundary layer; cleaned whitespace; corrected geometries.\n")
28 |
29 | # Compute centroids, convex hulls, and radius assuming circular
30 | nj_wsb <- nj_wsb %>%
31 | bind_rows() %>%
32 | mutate(
33 | state = "NJ",
34 | # importantly, area calculations occur in area weighted epsg
35 | st_areashape = st_area(geometry),
36 | convex_hull = st_geometry(st_convex_hull(geometry)),
37 | area_hull = st_area(convex_hull),
38 | radius = sqrt(area_hull/pi)
39 | ) %>%
40 | # transform back to standard epsg
41 | st_transform(epsg) %>%
42 | mutate(
43 | centroid = st_geometry(st_centroid(geometry)),
44 | centroid_long = st_coordinates(centroid)[, 1],
45 |     centroid_lat = st_coordinates(centroid)[, 2]
46 | ) %>%
47 | # select columns and rename for staging
48 | select(
49 | # data source columns
50 | pwsid = PWID,
51 | pws_name = SYS_NAME,
52 | state,
53 | # county, # county code is first 2 digits of PWID
54 | # city,
55 | # owner,
56 | # geospatial columns
57 | st_areashape,
58 | centroid_long,
59 | centroid_lat,
60 | radius,
61 | geometry
62 | )
63 | cat("Computed area, centroids, and radii from convex hulls.\n")
64 | cat("Combined into one layer; added geospatial columns.\n")
65 |
66 |
67 | # delete layer if it exists, then write to geopackage
68 | path_out <- path(staging_path, "wsb_labeled_nj.gpkg")
69 | if(file_exists(path_out)) file_delete(path_out)
70 |
71 | st_write(nj_wsb, path_out)
72 | cat("Wrote clean, labeled data to file.\n\n\n")
73 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_nm.R:
--------------------------------------------------------------------------------
1 | # transform NM water system data to standard model -------------------
2 |
3 | cat("Preparing to transform NM polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for NM water service boundaries, clean, transform CRS
19 | nm_wsb <- st_read(dsn = path(data_path, "boundary/nm/nm.geojson")) %>%
20 | # clean whitespace
21 | f_clean_whitespace_nas() %>%
22 |   # drop rows where Water_System_ID is NA
23 | drop_na(Water_System_ID) %>%
24 | # filter for Water_System_ID matching pattern
25 | filter(str_detect(Water_System_ID, "^NM\\d{7}")) %>%
26 | # select first 9 characters of Water_System_ID
27 | mutate(Water_System_ID = substr(Water_System_ID, 1, 9)) %>%
28 | # transform to area weighted CRS
29 | st_transform(epsg_aw) %>%
30 | # correct invalid geometries
31 | st_make_valid()
32 |
33 | cat("Read NM boundary layer; cleaned whitespace; corrected geometries.\n")
34 |
35 | # Compute centroids, convex hulls, and radius assuming circular
36 | nm_wsb <- nm_wsb %>%
37 | bind_rows() %>%
38 | mutate(
39 | state = "NM",
40 | geometry_source_detail = Polygon_Basis,
41 | # importantly, area calculations occur in area weighted epsg
42 | st_areashape = st_area(geometry),
43 | convex_hull = st_geometry(st_convex_hull(geometry)),
44 | area_hull = st_area(convex_hull),
45 | radius = sqrt(area_hull/pi)
46 | ) %>%
47 | # transform back to standard epsg
48 | st_transform(epsg) %>%
49 | # compute centroids
50 | mutate(
51 | centroid = st_geometry(st_centroid(geometry)),
52 | centroid_long = st_coordinates(centroid)[, 1],
53 |     centroid_lat = st_coordinates(centroid)[, 2]
54 | ) %>%
55 | # select columns and rename for staging
56 | select(
57 | # data source columns
58 | pwsid = Water_System_ID,
59 | pws_name = PublicSystemName,
60 | state,
61 | county = CN,
62 | city = City,
63 | # owner,
64 | # geospatial columns
65 | st_areashape,
66 | centroid_long,
67 | centroid_lat,
68 | radius,
69 | geometry,
70 | geometry_source_detail
71 | )
72 | cat("Computed area, centroids, and radii from convex hulls.\n")
73 | cat("Combined into one layer; added geospatial columns.\n")
74 |
75 |
76 | # delete layer if it exists, then write to geopackage
77 | path_out <- path(staging_path, "wsb_labeled_nm.gpkg")
78 | if(file_exists(path_out)) file_delete(path_out)
79 |
80 | st_write(nm_wsb, path_out)
81 | cat("Wrote clean, labeled data to file.\n\n\n")
82 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_ok.R:
--------------------------------------------------------------------------------
1 | # transform OK water system data to standard model -------------------
2 |
3 | cat("Preparing to transform OK polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for OK water service boundaries, clean, transform CRS
19 | ok_wsb <- st_read(path(data_path, "boundary/ok/ok.geojson")) %>%
20 | # clean whitespace
21 | f_clean_whitespace_nas() %>%
22 | # transform to area weighted CRS
23 | st_transform(epsg_aw) %>%
24 | # correct invalid geometries
25 | st_make_valid()
26 |
27 | cat("Read OK boundary layer; cleaned whitespace; corrected geometries.\n")
28 |
29 | # Compute centroids, convex hulls, and radius assuming circular
30 | ok_wsb <- ok_wsb %>%
31 | bind_rows() %>%
32 | mutate(
33 | state = "OK",
34 | geometry_source_detail = source,
35 | # importantly, area calculations occur in area weighted epsg
36 | st_areashape = st_area(geometry),
37 | convex_hull = st_geometry(st_convex_hull(geometry)),
38 | area_hull = st_area(convex_hull),
39 | radius = sqrt(area_hull/pi)
40 | ) %>%
41 | # transform back to standard epsg
42 | st_transform(epsg) %>%
43 | # compute centroids
44 | mutate(
45 | centroid = st_geometry(st_centroid(geometry)),
46 | centroid_long = st_coordinates(centroid)[, 1],
47 |     centroid_lat = st_coordinates(centroid)[, 2]
48 | ) %>%
49 | # select columns and rename for staging
50 | select(
51 | # data source columns
52 | pwsid,
53 | pws_name = name,
54 | state,
55 | county,
56 | # city,
57 | # owner,
58 | # geospatial columns
59 | st_areashape,
60 | centroid_long,
61 | centroid_lat,
62 | radius,
63 | geometry,
64 | geometry_source_detail
65 | )
66 | cat("Computed area, centroids, and radii from convex hulls.\n")
67 | cat("Combined into one layer; added geospatial columns.\n")
68 |
69 |
70 | # delete layer if it exists, then write to geopackage
71 | path_out <- path(staging_path, "wsb_labeled_ok.gpkg")
72 | if(file_exists(path_out)) file_delete(path_out)
73 |
74 | st_write(ok_wsb, path_out)
75 | cat("Wrote clean, labeled data to file.\n\n\n")
76 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_pa.R:
--------------------------------------------------------------------------------
1 | # transform PA water system data to standard model -------------------
2 |
3 | cat("Preparing to transform PA polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for PA water service boundaries, clean, transform CRS
19 | pa_wsb <- st_read(dsn = path(data_path, "boundary/pa/pa.geojson")) %>%
20 | # clean whitespace
21 | f_clean_whitespace_nas() %>%
22 | # transform to area weighted CRS
23 | st_transform(epsg_aw) %>%
24 | # correct invalid geometries
25 | st_make_valid()
26 |
27 | cat("Read PA boundary layer; cleaned whitespace; corrected geometries.\n")
28 |
29 | # Compute centroids, convex hulls, and radius assuming circular
30 | pa_wsb <- pa_wsb %>%
31 | bind_rows() %>%
32 | mutate(
33 | state = "PA",
34 | PWS_ID = paste0("PA", PWS_ID),
35 | # importantly, area calculations occur in area weighted epsg
36 | st_areashape = st_area(geometry),
37 | convex_hull = st_geometry(st_convex_hull(geometry)),
38 | area_hull = st_area(convex_hull),
39 | radius = sqrt(area_hull/pi)
40 | ) %>%
41 | # transform back to standard epsg
42 | st_transform(epsg) %>%
43 | mutate(
44 | centroid = st_geometry(st_centroid(geometry)),
45 | centroid_long = st_coordinates(centroid)[, 1],
46 |     centroid_lat = st_coordinates(centroid)[, 2]
47 | ) %>%
48 | # select columns and rename for staging
49 | select(
50 | # data source columns
51 | pwsid = PWS_ID,
52 | pws_name = NAME,
53 | state,
54 | county = CNTY_NAME,
55 | # city,
56 | # owner,
57 | # geospatial columns
58 | st_areashape,
59 | centroid_long,
60 | centroid_lat,
61 | radius,
62 | geometry
63 | )
64 | cat("Computed area, centroids, and radii from convex hulls.\n")
65 | cat("Combined into one layer; added geospatial columns.\n")
66 |
67 |
68 | # delete layer if it exists, then write to geopackage
69 | path_out <- path(staging_path, "wsb_labeled_pa.gpkg")
70 | if(file_exists(path_out)) file_delete(path_out)
71 |
72 | st_write(pa_wsb, path_out)
73 | cat("Wrote clean, labeled data to file.\n\n\n")
74 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_ri.R:
--------------------------------------------------------------------------------
1 | # Transform RI water system data to standard model -------------------
2 |
3 | cat("Preparing to transform RI polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # Helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # Path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read manually curated list of pwsids developed by EPIC to link shapefiles for
19 | # water districts with water systems (see here: https://docs.google.com/spreadsheets/d/13aVFXj9Ty5EsRNFuHczpX04HoCzc689LVGKcJIREXY4/edit#gid=0)
20 | pwsid_lookup <- read.csv(here::here("crosswalks/ri_pwsid_lookup.csv")) %>%
21 | select(PWSID, H20_DISTRI, NAME, pws_name)
22 |
23 | # Read layer for RI water service boundaries, clean, transform CRS
24 | ri_wsb <- st_read(path(data_path, "boundary/ri/ri.geojson")) %>%
25 | # clean whitespace
26 | f_clean_whitespace_nas() %>%
27 | # transform to area weighted CRS
28 | st_transform(epsg_aw) %>%
29 | # calculate geometries and areas of individual polygons
30 | mutate(
31 | state = "RI",
32 | # area calculations occur in area weighted epsg
33 | st_areashape = st_area(geometry),
34 | convex_hull = st_geometry(st_convex_hull(geometry)),
35 | area_hull = st_area(convex_hull)
36 | )
37 |
38 | cat("Read RI boundary layer; cleaned whitespace; corrected geometries.\n")
39 |
40 | ri_wsb <- ri_wsb %>%
41 | # join to pwsids
42 |   left_join(pwsid_lookup, by = c("H20_DISTRI", "NAME")) %>%
43 | # clean up names
44 | janitor::clean_names() %>%
45 | # only keep boundaries with a pwsid (others are GW/SW sources it appears)
46 | filter(!is.na(pwsid)) %>%
47 | # group by pwsid to calculate total area based in multipolygons
48 | group_by(pwsid) %>%
49 | # mutate these new columns, knowing full well that duplicate rows
50 | # will be created, but that they will be dropped in the next step
51 | mutate(
52 | # combine all fragmented geometries
53 | geometry = st_union(geometry),
54 | # new area is the sum of the area of all polygons
55 | st_areashape = sum(st_areashape),
56 | area_hull = sum(area_hull),
57 | # new radius is calculated from the new area
58 | radius = sqrt(area_hull/pi),
59 | # combine data into list-formatted strings for character columns
60 | across(where(is.character), ~toString(unique(.)))
61 | ) %>%
62 | # only take the first result from each group
63 | slice(1) %>%
64 | ungroup() %>%
65 | # convert back to the project standard epsg
66 | st_transform(epsg) %>%
67 | # correct invalid geometries
68 | st_make_valid() %>%
69 |   # compute new centroids and note that when multipolygons are separated
70 |   # by space, these are suspect and should not be used. Importantly, this
71 |   # calculation occurs in the EPSG consistent with other staged data!
72 |
73 |   # N.B. this step fails when run from run_pipeline in an ipykernel with
74 |   # "Found 1 feature with invalid spherical geometry," even though
75 |   # st_make_valid() should already have resolved that. The workaround is
76 |   # to run this step manually from R.
77 |
78 |   mutate(
79 | centroid = st_geometry(st_centroid(geometry)),
80 | centroid_long = st_coordinates(centroid)[, 1],
81 | centroid_lat = st_coordinates(centroid)[, 2]
82 | ) %>%
83 | # select columns and rename for staging
84 | select(
85 | # data source columns
86 | pwsid,
87 | pws_name,
88 | state,
89 | county,
90 | # geospatial columns
91 | st_areashape,
92 | centroid_long,
93 | centroid_lat,
94 | radius,
95 | geometry
96 | )
97 |
98 | cat("Recalculated area, radius, centroids for multipolygon pwsids.\n")
99 | cat("Combined string values for multipolygon pwsids.\n")
100 |
101 | # verify that there is only one pwsid per geometry
102 | n <- ri_wsb %>%
103 | count(pwsid) %>%
104 | filter(n > 1) %>%
105 | nrow()
106 | cat(n, "duplicate pwsids in labeled data following fix.\n")
107 |
108 |
109 | # delete layer if it exists, then write to geopackage
110 | path_out <- path(staging_path, "wsb_labeled_ri.gpkg")
111 | if(file_exists(path_out)) file_delete(path_out)
112 |
113 | st_write(ri_wsb, path_out)
--------------------------------------------------------------------------------
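
The group_by/st_union/slice(1) sequence in the RI script above is the
project's standard idiom for collapsing several boundary fragments that share
a pwsid into a single multipolygon row; transform_labeled.R and
transform_ucmr.R below use the same pattern. A minimal sketch with toy
geometries and hypothetical pwsids:

library(sf)
library(dplyr)

frags <- st_sf(
  pwsid = c("RI0000001", "RI0000001", "RI0000002"),
  geometry = st_sfc(
    st_buffer(st_point(c(0, 0)), 1),
    st_buffer(st_point(c(3, 0)), 1),
    st_buffer(st_point(c(0, 5)), 1)
  )
)

merged <- frags %>%
  group_by(pwsid) %>%
  mutate(geometry = st_union(geometry)) %>%  # each row in a group now holds the union
  slice(1) %>%                               # keep one row per pwsid
  ungroup()

nrow(merged)  # 2
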
/src/transformers/states/transform_wsb_tx.R:
--------------------------------------------------------------------------------
1 | # transform TX water system data to standard model -------------------
2 |
3 | cat("Preparing to transform TX polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for TX water service boundaries, clean, transform CRS
19 | tx_wsb <- st_read(path(data_path,
20 | "boundary/tx/PWS_shapefile/PWS_Export.shp")) %>%
21 | # clean whitespace
22 | f_clean_whitespace_nas() %>%
23 | # transform to area weighted CRS
24 | st_transform(epsg_aw) %>%
25 | # correct invalid geometries
26 | st_make_valid()
27 |
28 | cat("Read TX boundary layer; cleaned whitespace; corrected geometries.\n")
29 |
30 | # Compute centroids, convex hulls, and radius assuming circular
31 | tx_wsb <- tx_wsb %>%
32 | bind_rows() %>%
33 | mutate(
34 | state = "TX",
35 | geometry_source_detail = Source,
36 | # importantly, area calculations occur in area weighted epsg
37 | st_areashape = st_area(geometry),
38 | convex_hull = st_geometry(st_convex_hull(geometry)),
39 | area_hull = st_area(convex_hull),
40 | radius = sqrt(area_hull/pi)
41 | ) %>%
42 | # transform back to standard epsg
43 | st_transform(epsg) %>%
44 | # compute centroids
45 | mutate(
46 | centroid = st_geometry(st_centroid(geometry)),
47 | centroid_long = st_coordinates(centroid)[, 1],
48 |     centroid_lat = st_coordinates(centroid)[, 2]
49 | ) %>%
50 | # select columns and rename for staging
51 | select(
52 | # data source columns
53 | pwsid = PWSId,
54 | pws_name = pwsName,
55 | state,
56 | # county,
57 | # city,
58 | # owner,
59 | # geospatial columns
60 | st_areashape,
61 | centroid_long,
62 | centroid_lat,
63 | radius,
64 | geometry,
65 | geometry_source_detail
66 | )
67 | cat("Computed area, centroids, and radii from convex hulls.\n")
68 | cat("Combined into one layer; added geospatial columns.\n")
69 |
70 |
71 | # delete layer if it exists, then write to geopackage
72 | path_out <- path(staging_path, "wsb_labeled_tx.gpkg")
73 | if(file_exists(path_out)) file_delete(path_out)
74 |
75 | st_write(tx_wsb, path_out)
76 | cat("Wrote clean, labeled data to file.\n\n\n")
77 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_ut.R:
--------------------------------------------------------------------------------
1 | # transform UT water system data to standard model -------------------
2 |
3 | cat("Preparing to transform UT polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for UT water service boundaries, clean, transform CRS
19 | ut_wsb <- st_read(dsn = path(data_path, "boundary/ut/ut.geojson"),
20 | quiet = TRUE) %>%
21 | # clean whitespace
22 | f_clean_whitespace_nas() %>%
23 | # drop rows where DWSYSNUM is NA
24 | drop_na(DWSYSNUM) %>%
25 | # filter for DWSYSNUM matching pattern
26 | filter(str_detect(DWSYSNUM, "^UTAH\\d{5}$")) %>%
27 | # replace missing DWNAME with WRENAME
28 | mutate(DWNAME = ifelse(is.na(DWNAME), WRENAME, DWNAME)) %>%
29 | # transform to area weighted CRS
30 | st_transform(epsg_aw) %>%
31 | # correct invalid geometries
32 | st_make_valid()
33 |
34 | cat("Read UT boundary layer; cleaned whitespace; corrected geometries.\n")
35 |
36 | # Compute centroids, convex hulls, and radius assuming circular
37 | ut_wsb <- ut_wsb %>%
38 | bind_rows() %>%
39 | mutate(
40 | state = "UT",
41 | geometry_source_detail = DATASOURCE,
42 | # importantly, area calculations occur in area weighted epsg
43 | st_areashape = st_area(geometry),
44 | convex_hull = st_geometry(st_convex_hull(geometry)),
45 | area_hull = st_area(convex_hull),
46 | radius = sqrt(area_hull/pi)
47 | ) %>%
48 | # transform back to standard epsg
49 | st_transform(epsg) %>%
50 | mutate(
51 | centroid = st_geometry(st_centroid(geometry)),
52 | centroid_long = st_coordinates(centroid)[, 1],
53 |     centroid_lat = st_coordinates(centroid)[, 2]
54 | ) %>%
55 | # select columns and rename for staging
56 | select(
57 | # data source columns
58 | pwsid = DWSYSNUM,
59 | pws_name = DWNAME,
60 | state,
61 | county = COUNTY,
62 | # city,
63 | # owner,
64 | # geospatial columns
65 | st_areashape,
66 | centroid_long,
67 | centroid_lat,
68 | radius,
69 | geometry,
70 | geometry_source_detail
71 | )
72 | cat("Computed area, centroids, and radii from convex hulls.\n")
73 | cat("Combined into one layer; added geospatial columns.\n")
74 |
75 |
76 | # delete layer if it exists, then write to geopackage
77 | path_out <- path(staging_path, "wsb_labeled_ut.gpkg")
78 | if(file_exists(path_out)) file_delete(path_out)
79 |
80 | st_write(ut_wsb, path_out)
81 | cat("Wrote clean, labeled data to file.\n\n\n")
82 |
--------------------------------------------------------------------------------
/src/transformers/states/transform_wsb_wa.R:
--------------------------------------------------------------------------------
1 | # transform WA water system data to standard model -------------------
2 |
3 | cat("Preparing to transform WA polygon boundary data.\n\n")
4 |
5 | library(fs)
6 | library(sf)
7 | library(tidyverse)
8 |
9 | # helper function
10 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
11 |
12 | # path to save raw data, staging data, and standard projection
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
17 |
18 | # Read layer for WA water service boundaries, clean, transform CRS
19 | wa_wsb <- st_read(path(data_path, "boundary/wa/wa.geojson")) %>%
20 | # clean whitespace
21 | f_clean_whitespace_nas() %>%
22 |   # keep only five-character WS_ID values
23 | filter(str_detect(WS_ID, "^.{5}$")) %>%
24 | # transform to area weighted CRS
25 | st_transform(epsg_aw) %>%
26 | # correct invalid geometries
27 | st_make_valid()
28 |
29 | cat("Read WA boundary layer; cleaned whitespace; corrected geometries.\n")
30 |
31 | # Compute centroids, convex hulls, and radius assuming circular
32 | wa_wsb <- wa_wsb %>%
33 | bind_rows() %>%
34 | mutate(
35 | state = "WA",
36 | WS_ID = paste0("WA53", WS_ID),
37 | # importantly, area calculations occur in area weighted epsg
38 | st_areashape = st_area(geometry),
39 | convex_hull = st_geometry(st_convex_hull(geometry)),
40 | area_hull = st_area(convex_hull),
41 | radius = sqrt(area_hull/pi)
42 | ) %>%
43 | # transform back to standard epsg
44 | st_transform(epsg) %>%
45 | # compute centroids
46 | mutate(
47 | centroid = st_geometry(st_centroid(geometry)),
48 | centroid_long = st_coordinates(centroid)[, 1],
49 |     centroid_lat = st_coordinates(centroid)[, 2]
50 | ) %>%
51 | # select columns and rename for staging
52 | select(
53 | # data source columns
54 | pwsid = WS_ID,
55 | pws_name = WS_Name,
56 | state,
57 | county = County,
58 | # city,
59 | # owner,
60 | # geospatial columns
61 | st_areashape,
62 | centroid_long,
63 | centroid_lat,
64 | radius,
65 | geometry
66 | )
67 | cat("Computed area, centroids, and radii from convex hulls.\n")
68 | cat("Combined into one layer; added geospatial columns.\n")
69 |
70 |
71 | # delete layer if it exists, then write to geopackage
72 | path_out <- path(staging_path, "wsb_labeled_wa.gpkg")
73 | if(file_exists(path_out)) file_delete(path_out)
74 |
75 | st_write(wa_wsb, path_out)
76 | cat("Wrote clean, labeled data to file.\n\n\n")
77 |
--------------------------------------------------------------------------------
/src/transformers/transform_contributed_pws.R:
--------------------------------------------------------------------------------
1 | # Transform contributed pws shapefiles ------------------------------------
2 | cat("Preparing to transform individually contributed pws shapefiles.\n\n")
3 |
4 | library(fs)
5 | library(sf)
6 | library(tidyverse)
7 |
8 | # helper function
9 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
10 |
11 | # path to save raw data, staging data, and standard projection
12 | data_path <- Sys.getenv("WSB_DATA_PATH")
13 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
14 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
15 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
16 |
17 | # Read contributed PWS boundary layer, clean, transform CRS
18 | pws_wsb <- st_read(path(data_path, "contributed_pws/contributed_pws.gpkg"),
19 | geometry_column = "geom",
20 | stringsAsFactors = FALSE) %>%
21 | rename(geometry = geom) %>%
22 | filter(!is.na(pwsid)) %>%
23 | # clean whitespace
24 | f_clean_whitespace_nas() %>%
25 | # transform to area weighted CRS
26 | st_transform(epsg_aw) %>%
27 | # correct invalid geometries
28 | st_make_valid() %>%
29 | janitor::clean_names()
30 |
31 | cat("Read individual pws shapefiles; cleaned whitespace; corrected geometries.\n")
32 |
33 | # Compute centroids, convex hulls, and radius assuming circular
34 | # State is inferred from the first two characters of each pwsid
35 | pws_wsb <- pws_wsb %>%
36 | mutate(
37 | state = substr(pwsid, 1, 2),
38 | geometry_source_detail = data_source,
39 | # importantly, area calculations occur in area weighted epsg
40 | st_areashape = st_area(geometry),
41 | convex_hull = st_geometry(st_convex_hull(geometry)),
42 | area_hull = st_area(convex_hull),
43 | radius = sqrt(area_hull/pi)
44 | ) %>%
45 | # transform back to standard epsg
46 | st_transform(epsg) %>%
47 | st_make_valid() %>%
48 | # compute centroid
49 |   mutate(
50 | centroid = st_geometry(st_centroid(geometry)),
51 | centroid_long = st_coordinates(centroid)[, 1],
52 |     centroid_lat = st_coordinates(centroid)[, 2]
53 |
54 |   ) %>%
55 | # select columns and rename for staging
56 | select(
57 | # data source columns
58 | pwsid,
59 | pws_name,
60 | state,
61 | # county,
62 | # city,
63 | # owner,
64 | # geospatial columns
65 | st_areashape,
66 | centroid_long,
67 | centroid_lat,
68 | radius,
69 | geometry,
70 | geometry_source_detail
71 | )
72 | cat("Computed area, centroids, and radii from convex hulls.\n")
73 | cat("Combined into one layer; added geospatial columns.\n")
74 |
75 | # delete layer if it exists, then write to geopackage
76 | path_out <- path(staging_path, "contributed_pws.gpkg")
77 | if(file_exists(path_out)) file_delete(path_out)
78 |
79 | st_write(pws_wsb, path_out)
80 | cat("Wrote clean, labeled data to geopackage.\n\n\n")
81 |
82 |
--------------------------------------------------------------------------------
/src/transformers/transform_echo.R:
--------------------------------------------------------------------------------
1 | # transform ECHO data -----------------------------------
2 |
3 | library(fs)
4 | library(sf)
5 | library(tidyverse)
6 |
7 | # source functions
8 | dir_ls(here::here("src/functions")) %>% walk(~source(.x))
9 |
10 | # helper function
11 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
12 |
13 | # path to save raw data and standard projection
14 | data_path <- Sys.getenv("WSB_DATA_PATH")
15 | echo_data_path <- path(data_path, "echo")
16 | echo_file <- path(echo_data_path, "ECHO_EXPORTER.CSV")
17 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
18 | path_out <- path(staging_path, "echo.csv")
19 |
20 | cols <- c('REGISTRY_ID', 'FAC_NAME', 'FAC_STREET',
21 |           'FAC_CITY', 'FAC_STATE', 'FAC_ZIP', 'FAC_COUNTY',
22 |           'FAC_FIPS_CODE', 'FAC_LAT', 'FAC_LONG', 'FAC_INDIAN_CNTRY_FLG',
23 |           'FAC_FEDERAL_FLG', 'FAC_COLLECTION_METHOD',
24 |           'FAC_REFERENCE_POINT', 'FAC_ACCURACY_METERS',
25 |           'FAC_DERIVED_HUC', 'FAC_MAJOR_FLAG', 'FAC_ACTIVE_FLAG',
26 |           'FAC_QTRS_WITH_NC', 'SDWIS_FLAG', 'SDWA_IDS',
27 |           'SDWA_SYSTEM_TYPES', 'SDWA_INFORMAL_COUNT',
28 |           'SDWA_FORMAL_ACTION_COUNT', 'SDWA_COMPLIANCE_STATUS',
29 |           'SDWA_SNC_FLAG', 'FAC_DERIVED_TRIBES',
30 |           'FAC_DERIVED_WBD', 'FAC_DERIVED_STCTY_FIPS',
31 |           'FAC_DERIVED_ZIP', 'FAC_DERIVED_CD113', 'FAC_DERIVED_CB2010',
32 |           'FAC_PERCENT_MINORITY', 'FAC_POP_DEN', 'EJSCREEN_FLAG_US')
33 |
34 | bool_cols = c('fac_major_flag', 'fac_active_flag', 'sdwis_flag',
35 | 'sdwa_snc_flag', 'fac_indian_cntry_flg', 'fac_federal_flg',
36 | 'ejscreen_flag_us')
37 |
38 | # read in ECHO data and clean
39 | echo <- read_csv(echo_file, col_select=cols) %>%
40 | # make column names lowercase
41 | janitor::clean_names() %>%
42 | # clean whitespace and nulls
43 | f_clean_whitespace_nas() %>%
44 | # drop duplicates
45 | unique() %>%
46 | # drop null SDWA_IDS
47 | filter(!is.na(sdwa_ids)) %>%
48 | # split space-delimited pwsid's in sdwa_ids into lists
49 | mutate(sdwa_ids = str_split(sdwa_ids, " ")) %>%
50 | # explode rows with multiple pwsid's
51 | unnest(sdwa_ids) %>%
52 | # rename sdwa_ids to pwsid
53 | rename(pwsid = sdwa_ids) %>%
54 | # for bool_cols, map N to 0, Y to 1, and '' to NaN
55 | mutate_at(bool_cols, recode, `N`=0, `Y`=1, .default=NaN) %>%
56 | # convert bool_cols to boolean type
57 | mutate_at(bool_cols, as.logical)
58 |
59 | # Delete output file if exists
60 | if(file_exists(path_out)) file_delete(path_out)
61 |
62 | # Write as a CSV
63 | echo %>% write_csv(path_out)
64 |
--------------------------------------------------------------------------------
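
The str_split()/unnest() pair above is what explodes an ECHO facility row
carrying several space-delimited SDWA ids into one row per pwsid. A minimal
sketch with hypothetical ids:

library(tidyverse)

tibble(registry_id = "110000000001", sdwa_ids = "XX0000001 XX0000002") %>%
  mutate(sdwa_ids = str_split(sdwa_ids, " ")) %>%  # split into a list column
  unnest(sdwa_ids) %>%                             # one row per id
  rename(pwsid = sdwa_ids)
# 2 rows, one per pwsid, both keeping the same registry_id
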
/src/transformers/transform_frs.R:
--------------------------------------------------------------------------------
1 | # Transform EPA facility registry service data ---------------------------------
2 |
3 | library(fs)
4 | library(sf)
5 | library(tidyverse)
6 |
7 | # source functions
8 | dir_ls(here::here("src/functions")) %>% walk(~source(.x))
9 |
10 | # path to save raw data and standard projection
11 | data_path <- Sys.getenv("WSB_DATA_PATH")
12 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
13 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
14 |
15 | # Read un-zipped geodatabase (~6GB so querying on water to reduce file size)
16 | # First look at available layers
17 | frs_layers <- st_layers(dsn = path(data_path, "frs/FRS_INTERESTS.gdb"))
18 |
19 | # SQL query to target facilities with water focus
20 | get_water_frs <- "
21 | SELECT *
22 | FROM FACILITY_INTERESTS
23 | WHERE INTEREST_TYPE IN (
24 | 'COMMUNITY WATER SYSTEM',
25 | 'NON-TRANSIENT NON-COMMUNITY WATER SYSTEM',
26 | 'TRANSIENT NON-COMMUNITY WATER SYSTEM',
27 | 'WATER TREATMENT PLANT',
28 | 'DRINKING WATER PROGRAM',
29 | 'DRINKING WATER SYSTEM'
30 | )"
31 |
32 | # Read layer for FRS_INTERESTS with conditional query on `INTEREST_TYPE`.
33 | # Then, transform to standard epsg.
34 | frs_water <- path(data_path, "frs/FRS_INTERESTS.gdb") %>%
35 | st_read(query = get_water_frs,
36 | layer = "FACILITY_INTERESTS",
37 | stringsAsFactors = FALSE) %>%
38 | st_transform(epsg)
39 |
40 | cat("Read labeled FRS layer and transformed to CRS:", epsg, "\n")
41 |
42 | # Visualize points
43 | #plot(st_geometry(frs_water), pch = 1, col = 'blue')
44 |
45 |
46 | # General cleaning --------------------------------------------------------
47 |
48 | # Set column names to lower case, clean names, clean whitespace,
49 | # split PWSID and Facility ID from pgm_sys_id, add reported state name
50 | frs_water <- frs_water %>%
51 | rename(geometry = Shape) %>%
52 | janitor::clean_names() %>%
53 | f_clean_whitespace_nas() %>%
54 | mutate(pwsid = word(pgm_sys_id, 1),
55 | state = substr(pwsid, 1, 2),
56 | facility_id = word(pgm_sys_id, 2),
57 | facility_id = ifelse(pwsid == facility_id, NA, facility_id))
58 |
59 | # Write to geopackage ----------------------------------------------------
60 | path_out <- path(staging_path, "frs.gpkg")
61 | if(file_exists(path_out)) file_delete(path_out)
62 |
63 | st_write(frs_water, path_out)
64 | cat("Wrote FRS data to geopackage. \n")
65 |
--------------------------------------------------------------------------------
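
The pgm_sys_id parsing above leans on stringr::word(): the first
whitespace-delimited token is the pwsid, the second (when present and
distinct) a facility id. On hypothetical ids:

library(stringr)

pgm_sys_id <- c("XX0000001 12345", "XX0000002")
word(pgm_sys_id, 1)  # "XX0000001" "XX0000002"  -> pwsid
word(pgm_sys_id, 2)  # "12345"     NA           -> facility_id
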
/src/transformers/transform_labeled.R:
--------------------------------------------------------------------------------
1 | # combine transformed state water system data ----------------------------
2 |
3 | library(fs)
4 | library(sf)
5 | library(tidyverse)
6 | library(mapview)
7 |
8 | # path to save staging data and standard projection
9 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
10 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
11 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
12 |
13 | # list, read, and combine all staged state wsb files
14 | wsb_labeled <- dir_ls(staging_path,
15 |                       regexp = "wsb_labeled_[a-z][a-z]\\.gpkg$") %>%
16 | map_df(~st_read(., quiet = TRUE)) %>%
17 | rename(geometry = geom) %>%
18 | # remove NA pwsid
19 | filter(!is.na(pwsid)) %>%
20 | suppressMessages()
21 |
22 | # combine data and merge geometries for rows with duplicate pwsids --------
23 |
24 | # show there are rows with duplicate pwsids
25 | multi <- st_drop_geometry(wsb_labeled) %>%
26 | count(pwsid, sort = TRUE) %>%
27 | filter(n > 1)
28 | cat("Detected", nrow(multi), "groups of rows with duplicate pwsids.\n")
29 |
30 | # add column indicating if row has a duplicated pwsid
31 | wsb_labeled <- wsb_labeled %>%
32 | # label duplicated pwsid geometries
33 | mutate(is_multi = ifelse(pwsid %in% multi$pwsid, TRUE, FALSE))
34 | cat("Added `is_multi` field to wsb labeled data.\n")
35 |
36 | # separate rows without duplicated pwsids
37 | wsb_labeled_no_multi <- wsb_labeled %>%
38 | filter(is_multi == FALSE)
39 |
40 | # for rows with duplicated pwsids:
41 | # union geometries, recalculate area, centroids, radius
42 | wsb_labeled_multi <- wsb_labeled %>%
43 | # filter for rows with duplicated pwsid's
44 | filter(is_multi == TRUE) %>%
45 | st_make_valid() %>%
46 | # importantly, all calculations take place in AW epsg
47 | st_transform(epsg_aw) %>%
48 | group_by(pwsid) %>%
49 | # mutate these new columns, knowing full well that duplicate rows
50 | # will be created, but that they will be dropped in the next step
51 | mutate(
52 | # combine all fragmented geometries
53 | geometry = st_union(geometry),
54 |     # new area is the area of the unioned geometry
55 | st_areashape = st_area(geometry),
56 | convex_hull = st_geometry(st_convex_hull(geometry)),
57 | area_hull = st_area(convex_hull),
58 | # new radius is calculated from the new area
59 | radius = sqrt(area_hull/pi),
60 | # combine data into list-formatted strings for character columns
61 | across(where(is.character), ~toString(unique(.)))
62 | ) %>%
63 | # only take the first result from each group
64 | slice(1) %>%
65 | ungroup() %>%
66 | # convert back to the project standard epsg
67 | st_transform(epsg) %>%
68 | st_make_valid() %>%
69 | # compute new centroids and note that when multipolygons are separated
70 | # by space, these are suspect and should not be used. Importantly, this
71 | # calculation occurs in the EPSG consistent with other staged data!
72 | mutate(
73 | centroid = st_geometry(st_centroid(geometry)),
74 | centroid_long = st_coordinates(centroid)[, 1],
75 | centroid_lat = st_coordinates(centroid)[, 2]
76 | ) %>%
77 | # remove centroid, convex_hull, and area_hull columns
78 | select(-c(centroid, convex_hull, area_hull)) %>%
79 | # replace empty or string "NA" cells with NA
80 | mutate(across(where(is.character), ~ gsub("^$|^ $|^NA$", NA, .))) %>%
81 | # convert columns with class units to numeric
82 | # before this, cols st_areashape and radius are
83 | # numeric, but have the class "units"
84 | mutate(across(where(is.numeric), as.numeric))
85 |
86 | cat("Recalculated area, radius, centroids for multipolygon pwsids.\n")
87 | cat("Combined string values for multipolygon pwsids.\n")
88 |
89 | # view
90 | # mapview::mapview(wsb_labeled_multi, zcol = "pwsid", burst = TRUE)
91 |
92 | # combine wsb labeled data with corrected rows
93 | wsb_labeled_clean <- bind_rows(wsb_labeled_no_multi, wsb_labeled_multi) %>%
94 | # remove is_multi column
95 | select(-is_multi)
96 |
97 | # verify that there is only one pwsid per geometry
98 | n <- wsb_labeled_clean %>%
99 | st_drop_geometry() %>%
100 | count(pwsid) %>%
101 | filter(n > 1) %>%
102 | nrow()
103 | cat(n, "duplicate pwsids in labeled data following fix.\n")
104 |
105 | # delete layer if it exists, then write to geopackage
106 | path_out <- path(staging_path, "wsb_labeled_clean.gpkg")
107 | if(file_exists(path_out)) file_delete(path_out)
108 |
109 | st_write(wsb_labeled_clean, path_out)
110 | cat("Wrote clean, labeled data to file.\n")
111 |
--------------------------------------------------------------------------------
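
The closing as.numeric() coercion above exists because st_area() returns
vectors of class "units"; stripping that class keeps the value while avoiding
units-related friction when rows are bound and written. A one-line
illustration (units is an sf dependency):

library(units)

x <- set_units(2.5, m^2)
class(x)       # "units"
as.numeric(x)  # 2.5, plain numeric
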
/src/transformers/transform_mhp.R:
--------------------------------------------------------------------------------
1 | # Transform mobile home park point data ----------------------------------
2 |
3 | library(fs)
4 | library(sf)
5 | library(tidyverse)
6 |
7 | # helper functions
8 | dir_ls(here::here("src/functions")) %>% walk(~source(.x))
9 |
10 | # path to save raw data and standard projection
11 | data_path <- Sys.getenv("WSB_DATA_PATH")
12 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
13 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
14 |
15 | # Read un-zipped geodatabase, clean names, transform to standard epsg
16 | mhp_sp <- st_read(dsn = path(data_path, "mhp/mhp.geojson")) %>%
17 | janitor::clean_names() %>%
18 | st_transform(crs = epsg)
19 |
20 | cat("Read MHP layer, cleaned names, & transformed to CRS:", epsg, "\n")
21 |
22 | # Visualize points
23 | #plot(st_geometry(mhp_sp), pch = 1, col = 'blue')
24 |
25 |
26 | # Clean attribute data ----------------------------------------------------
27 |
28 | mhp_sp <- mhp_sp %>%
29 | # clean size column and replace -999 missing units with NA
30 | mutate(size = as.factor(tolower(size)),
31 | units = na_if(units, -999)) %>%
32 | # clean column names
33 | rename(
34 | object_id = objectid,
35 | mhp_id = mhpid,
36 | mhp_name = name,
37 | zipcode = zip,
38 | county_fips = countyfips,
39 | source_date = sourcedate,
40 | rev_geo_flag = revgeoflag
41 | ) %>%
42 | f_clean_whitespace_nas()
43 |
44 | # Write clean mobile home park centroids
45 | path_out <- path(staging_path, "mhp_clean.gpkg")
46 | if(file_exists(path_out)) file_delete(path_out)
47 |
48 | st_write(mhp_sp, path_out)
49 |
--------------------------------------------------------------------------------
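
The -999 handling above uses dplyr::na_if(), which swaps a sentinel value for
NA:

dplyr::na_if(c(45, -999, 120), -999)
#> [1]  45  NA 120
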
/src/transformers/transform_sdwis_geo_areas.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #%%
4 | """
5 | Created on Fri Feb 4 11:32:27 2022
6 |
7 | @author: jjg
8 | """
9 |
10 |
11 | # Libraries
12 | import pandas as pd
13 | import numpy as np
14 | import os, sys
15 |
16 | sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
17 |
18 | from transformers.transform_sdwis_helpers import clean_up_columns, trim_whitespace, date_type
19 |
20 | from dotenv import load_dotenv
21 |
22 | # %% File path and data import
23 | load_dotenv()
24 |
25 | data_path = os.environ["WSB_DATA_PATH"]
26 | staging_path = os.environ["WSB_STAGING_PATH"]
27 | sdwis_data_path = os.path.join(data_path, "sdwis")
28 |
29 | file = "GEOGRAPHIC_AREA.CSV"
30 |
31 | # We only use a few columns from this data. Most other columns
32 | # are better in the primary SDWIS file.
33 |
34 | # Though, these columns are potentially valuable, just currently unused:
35 | # area_type_code
36 | # tribal_code
37 |
38 | geo_area = pd.read_csv(os.path.join(sdwis_data_path, file))
39 |
40 | # %% Basic cleaning
41 |
42 | # Remove table name from column headers
43 | geo_area = clean_up_columns(geo_area)
44 |
45 | # Trim whitespace
46 | geo_area = trim_whitespace(geo_area)
47 |
48 | # Drop duplicates
49 | geo_area = geo_area.drop_duplicates()
50 |
51 | # Narrow to columns of interest
52 | geo_area = geo_area[["pwsid", "city_served", "county_served"]]
53 |
54 |
55 | # %% Clean city_served column
56 |
57 | geo_area["city_served"] = (geo_area["city_served"]
58 |     .str.replace(r"\.?-\.?\s*\d{4}", "", regex=True) # Remove ZIP+4 style suffixes: optional ".", "-", optional ".", spaces, four digits
59 |     .str.replace(r"’", "'", regex=True) # Normalize curly apostrophe "’" to a plain "'"
60 |     .str.replace(r"\(\s*[A-Z]\s*\)", "", regex=True) # Remove single-letter parentheticals (plus any spaces), e.g. (V) or (T)
61 |     .str.replace(r"\s\s+", " ", regex=True)) # Collapse runs of whitespace to a single space
62 |
63 | # Trim whitespace again
64 | geo_area = trim_whitespace(geo_area)
65 |
66 | #%% Deduplicate
67 |
68 | # In a previous SDWIS download, the records with area_type_code = "TR" were
69 | # excluded. Now they're included.
70 |
71 | # But records with area_type_code = "TR" are contributing duplicates;
72 | # there's often another record of a different area_type_code.
73 |
74 | # Some notes about these duplicates:
75 | # The ones with area_type_code = "TR" also have the tribal_code attribute populated.
76 | # city_served and county_served are only populated when area_type_code != "TR".
77 |
78 | # How to eliminate these duplicates?
79 | # Since we specifically need the city_served and county_served data
80 | # downstream, we can eliminate records that have NA's in both fields.
81 | # This also eliminates the duplicates.
82 |
83 | geo_area = geo_area[
84 | geo_area["city_served"].notna() |
85 | geo_area["county_served"].notna()]
86 |
87 |
88 | # %% Raise duplication issue on key fields
89 |
90 | if not geo_area["pwsid"].is_unique:
91 | raise Exception("pwsid is not unique.")
92 | #%%
93 | # Save csv in staging
94 |
95 | geo_area.to_csv(os.path.join(staging_path, "sdwis_geographic_area.csv"), index = False)
96 |
--------------------------------------------------------------------------------
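
The deduplication screen above keeps a record only when city_served or
county_served is populated, which is exactly what drops the
area_type_code == "TR" twins. The same screen sketched on toy rows (in R, to
keep this document's examples in one language; the script itself is pandas):

library(dplyr)

toy <- tibble::tribble(
  ~pwsid,      ~city_served, ~county_served,
  "XX0000001", "SOMETOWN",   "SOME COUNTY",
  "XX0000001", NA,           NA
)
filter(toy, !is.na(city_served) | !is.na(county_served))
# 1 row left: the all-NA "TR" twin drops out
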
/src/transformers/transform_sdwis_helpers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Thu Feb 3 15:19:40 2022
5 |
6 | @author: jjg
7 | """
8 |
9 | # Libraries
10 | from typing import List
11 | import pandas as pd
12 | import numpy as np
13 |
14 |
15 | # Clean up columns
16 | def clean_up_columns(df: pd.DataFrame):
17 | """
18 | Remove table names from column headers and set to lower case.
19 |
20 | Args:
21 | df : data frame for transformation
22 |
23 | Output:
24 | df_clean : cleaned data frame
25 |
26 | """
27 | # Remove column header issues
28 | df.columns = (df.columns.str.replace('.*\\.', '', regex = True))
29 |
30 | # set all names to lowercase
31 | df.columns = df.columns.str.lower()
32 |
33 |     # drop columns that are entirely NA
34 | df = df.dropna(axis = 1, how = "all")
35 |
36 | return df
37 |
38 |
39 | # Standardize date columns
40 | def date_type(df: pd.DataFrame, date_columns: List[str]):
41 |     """
42 |     Clean up date columns using pandas datetime.
43 |
44 |     Args:
45 |         df : data frame for transformation (modified in place)
46 |         date_columns : names of columns to parse as dates
47 |
48 |     Output:
49 |         None : date columns are converted in place
50 |     """
51 | # set date columns to date
52 | for x in date_columns:
53 | df[x] = (
54 | pd.to_datetime(df[x], format="%d-%b-%y")
55 | .dt.normalize())
56 |
57 |
58 | # Trims all white space
59 | def trim_whitespace(df: pd.DataFrame):
60 |
61 | df = df.copy()
62 |
63 | for col in df.select_dtypes(include=[object]):
64 | df[col] = df[col].str.strip()
65 |
66 | return df
--------------------------------------------------------------------------------
/src/transformers/transform_sdwis_service.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #%%
4 | """
5 | Created on Fri Feb 11 16:54:19 2022
6 |
7 | @author: jjg
8 | """
9 |
10 |
11 | # Libraries
12 | import pandas as pd
13 | import numpy as np
14 | import os, sys
15 |
16 | sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
17 |
18 | from transformers.transform_sdwis_helpers import clean_up_columns, trim_whitespace, date_type
19 |
20 | from dotenv import load_dotenv
21 |
22 | # %% File path and data import
23 | load_dotenv()
24 |
25 | data_path = os.environ["WSB_DATA_PATH"]
26 | staging_path = os.environ["WSB_STAGING_PATH"]
27 | sdwis_data_path = os.path.join(data_path, "sdwis")
28 |
29 | file = "SERVICE_AREA.CSV"
30 | service_area = pd.read_csv(os.path.join(sdwis_data_path, file))
31 |
32 | # %% Basic cleaning
33 |
34 | # Remove table name from column headers
35 | service_area = clean_up_columns(service_area)
36 |
37 | # Trim whitespace
38 | service_area = trim_whitespace(service_area)
39 |
40 | # Drop duplicates
41 | service_area = service_area.drop_duplicates()
42 |
43 | # Drop fully empty columns
44 | service_area = service_area.dropna(how='all', axis=1)
45 |
46 |
47 | # %% Sanitize booleans
48 | bool_cols = ["is_primary_service_area_code"]
49 |
50 | for i in bool_cols:
51 | service_area[i] = service_area[i].map({'N': 0, 'Y': 1, '': np.NaN, np.NaN : np.NaN})
52 | service_area[i] = service_area[i].astype('boolean')
53 |
54 |
55 | # %% Raise duplication issue on key fields
56 |
57 | if service_area[["pwsid", "service_area_type_code"]].duplicated().any():
58 |     raise Exception("pwsid + service_area_type_code is not unique.")
59 |
60 | # %% Save csv in staging
61 |
62 | service_area.to_csv(os.path.join(staging_path, "sdwis_service_area.csv"), index = False)
63 |
--------------------------------------------------------------------------------
/src/transformers/transform_sdwis_ws.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #%%
4 | """
5 | Created on Thu Feb 3 15:11:24 2022
6 |
7 | @author: jjg
8 | """
9 |
10 | # Libraries
11 | import pandas as pd
12 | import numpy as np
13 | import os, sys
14 |
15 | sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
16 |
17 | from transformers.transform_sdwis_helpers import clean_up_columns, trim_whitespace, date_type
18 |
19 | from dotenv import load_dotenv
20 |
21 | # %% File path and data import
22 | load_dotenv()
23 |
24 | data_path = os.environ["WSB_DATA_PATH"]
25 | staging_path = os.environ["WSB_STAGING_PATH"]
26 | sdwis_data_path = os.path.join(data_path, "sdwis")
27 |
28 | file = "WATER_SYSTEM.CSV"
29 | water_system = pd.read_csv(os.path.join(sdwis_data_path, file))
30 |
31 | # %% Basic cleaning
32 |
33 | # Remove table name from column headers
34 | water_system = clean_up_columns(water_system)
35 |
36 | # Trim whitespace
37 | water_system = trim_whitespace(water_system)
38 |
39 | # Drop duplicates
40 | water_system = water_system.drop_duplicates()
41 |
42 | # Drop fully empty columns (cities_served, counties_served -- get from other tables)
43 | water_system = water_system.dropna(how='all', axis=1)
44 |
45 |
46 | # %% Sanitize booleans
47 | bool_cols = ["npm_candidate", "is_wholesaler_ind", \
48 | "is_school_or_daycare_ind", "source_water_protection_code"]
49 |
50 | for i in bool_cols:
51 | water_system[i] = water_system[i].map({'N': 0, 'Y': 1, '': np.NaN, np.NaN : np.NaN})
52 | water_system[i] = water_system[i].astype('boolean')
53 |
54 | # %% Standardize dates
55 |
56 | date_cols = ['outstanding_perform_begin_date','pws_deactivation_date', \
57 | 'source_protection_begin_date']
58 |
59 | date_type(water_system, date_cols)
60 |
61 | # %% Simplify zip-code column to 5 digit
62 |
63 | water_system["zip_code"] = water_system["zip_code"].str[0:5]
64 |
65 |
66 | # %% Raise duplication issue on key fields
67 |
68 | if not water_system["pwsid"].is_unique:
69 | raise Exception("pwsid is not unique.")
70 |
71 | # %% Save csv in staging
72 |
73 | water_system.to_csv(os.path.join(staging_path, "sdwis_water_system.csv"), index = False)
74 |
--------------------------------------------------------------------------------
/src/transformers/transform_tigris_ne.R:
--------------------------------------------------------------------------------
1 | # transform TIGER places and crop out ocean areas ------------------------
2 |
3 | library(fs)
4 | library(sf)
5 | library(tidyverse)
6 | library(tigris)
7 | library(rmapshaper)
8 |
9 |
10 | # path to save raw data, staging data, and standard projection
11 | data_path <- Sys.getenv("WSB_DATA_PATH")
12 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
13 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
14 |
15 | # download large files without timeout error
16 | options(timeout = 100000)
17 |
18 | # read Natural Earth ocean geometry
19 | ocean <- st_read(path(data_path, "ne/ocean/ne-ocean-10m/ne_10m_ocean.shp")) %>%
20 | select(geometry) %>%
21 | st_make_valid()
22 |
23 | # transform places to ocean crs, make valid
24 | places <- read_rds(path(data_path, "tigris/tigris_places.rds")) %>%
25 | st_transform(st_crs(ocean)$epsg) %>%
26 | st_make_valid()
27 |
28 | # erase ocean areas from places so boundaries stop at the shoreline
29 | places_clean <- places %>%
30 |   st_difference(st_union(ocean)) %>%
31 | st_make_valid() %>%
32 | janitor::clean_names()
33 |
34 | # sanity check that oceans are removed
35 | # mapview::mapview(places_clean)
36 |
37 | # read tigris population data
38 | pop <- read_csv(path(data_path, "tigris/tigris_pop.csv")) %>%
39 | select(geoid, population)
40 |
41 | # join population data to places_clean
42 | places_clean <- places_clean %>%
43 | left_join(pop, by = "geoid")
44 |
45 | # write clean TIGER places
46 | path_out <- path(staging_path, "tiger_places_clean.gpkg")
47 | if(file_exists(path_out)) file_delete(path_out)
48 |
49 | st_write(places_clean, path_out)
50 | cat("Wrote clean TIGER places.\n")
51 |
--------------------------------------------------------------------------------
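
sf::st_difference(x, y) returns the parts of x that fall outside y, which is
the erase used above to trim places back to the shoreline. A toy sketch with
two overlapping buffers:

library(sf)

place <- st_sfc(st_buffer(st_point(c(0, 0)), 2))  # a "place" polygon
water <- st_sfc(st_buffer(st_point(c(2, 0)), 2))  # an overlapping "ocean"

clipped <- st_difference(place, water)
st_area(clipped) < st_area(place)  # TRUE: the overlap was removed
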
/src/transformers/transform_ucmr.R:
--------------------------------------------------------------------------------
1 | # transform UCMR3 and UCMR4 zip codes and add centroids ---------------------
2 |
3 | library(fs)
4 | library(sf)
5 | library(tidyverse)
6 | library(tigris)
7 |
8 | # tell tigris to cache Census shapefile downloads for faster subsequent runs
9 | options(tigris_use_cache = TRUE)
10 |
11 | # helper function
12 | source(here::here("src/functions/f_clean_whitespace_nas.R"))
13 |
14 | # path to save raw data, staging data, and standard projection
15 | data_path <- Sys.getenv("WSB_DATA_PATH")
16 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
17 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
18 | epsg_aw <- Sys.getenv("WSB_EPSG_AW")
19 |
20 | # read ucmr3 and ucmr4 zip code data, combine, clean names and zipcodes
21 | ucmr <- dir_ls(path(data_path, "ucmr"), regexp = "ZipCodes.txt") %>%
22 |   read_tsv(col_types = cols(.default = "c")) %>%
23 | distinct() %>%
24 | janitor::clean_names() %>%
25 | # a number of zipcodes end in "-" and should be cleaned
26 | mutate(zipcode = str_replace_all(zipcode, "-", "")) %>%
27 | # clean whitespace and NAs, and drop NA zipcodes
28 | f_clean_whitespace_nas() %>%
29 | drop_na(zipcode)
30 |
31 | # print nonsense zipcodes for review because they're few in number.
32 | # zip codes should have exactly 5 digits and no alphabetical chars
33 | zip_rm <- filter(ucmr,
34 | nchar(zipcode) != 5 |
35 | str_detect(zipcode, "[:alpha:]"))
36 |
37 | cat("Detected", nrow(zip_rm), "nonsense zipcodes:\n"); print(zip_rm)
38 |
39 | # remove nonsense zipcodes
40 | ucmr <- anti_join(ucmr, zip_rm)
41 |
42 | cat("Removed", nrow(zip_rm), "nonsense zipcodes from ucmr data.\n")
43 |
44 |
45 | # merge zip codes to spatial zip code polygon -----------------------------
46 |
47 | # zip code columns to keep
48 | cols_keep <- c("zipcode", "geoid20", "aland20", "awater20", "st_areashape",
49 | "area_hull")
50 |
51 | # pull USA zip code (ZCTA) geometries, project to area weighted CRS
52 | zipcode_areas <- tigris::zctas()
53 | zipcodes <- zipcode_areas %>%
54 | janitor::clean_names() %>%
55 | # use area weighted crs because we calculate polygon areas
56 | st_transform(st_crs(epsg_aw)) %>%
57 | mutate(
58 | # area calculations occur in area weighted epsg
59 | zipcode = zcta5ce20,
60 | st_areashape = st_area(geometry),
61 | convex_hull = st_geometry(st_convex_hull(geometry)),
62 | area_hull = st_area(convex_hull)
63 | ) %>%
64 | select(all_of(cols_keep))
65 |
66 |
67 | # join zipcode polygon geometries to ucmr master list and
68 | # combine data and merge geometries for rows with duplicate pwsids --------
69 |
70 | ucmr <- ucmr %>%
71 |   left_join(zipcodes, by = "zipcode") %>%
72 | # convert object back to spatial
73 | st_as_sf(crs = epsg_aw) %>%
74 | # ensure valid geometries
75 | st_make_valid() %>%
76 | group_by(pwsid) %>%
77 | # mutate these new columns, knowing full well that duplicate rows
78 | # will be created, but that they will be dropped in the next step
79 | mutate(
80 | # combine all fragmented geometries
81 | geometry = st_union(geometry),
82 | # new area is the sum of the area of all polygons
83 | st_areashape = sum(st_areashape),
84 | area_hull = sum(area_hull),
85 | # new radius is calculated from the new area
86 | radius = sqrt(area_hull/pi),
87 | # combine data into list-formatted strings for character columns
88 | across(where(is.character), ~toString(unique(.)))
89 | ) %>%
90 | # only take the first result from each group
91 | slice(1) %>%
92 | ungroup() %>%
93 | # convert back to the project standard epsg
94 | st_transform(epsg) %>%
95 | # compute new centroids and note that when multipolygons are separated
96 | # by space, these are suspect and should not be used. Importantly, this
97 | # calculation occurs in the EPSG consistent with other staged data!
98 | mutate(
99 | centroid = st_geometry(st_centroid(geometry)),
100 | centroid_long = st_coordinates(centroid)[, 1],
101 | centroid_lat = st_coordinates(centroid)[, 2]
102 | ) %>%
103 |   # keep only the columns needed downstream; future iterations may carry more
104 | select(c(pwsid, zipcode, st_areashape, radius, centroid_long, centroid_lat)) %>%
105 | st_drop_geometry()
106 |
107 | cat("Recalculated area, radius, centroids for multipolygon pwsids.\n")
108 | cat("Combined string values for multipolygon pwsids.\n")
109 |
110 | # verify that there is only one pwsid per geometry
111 | n <- ucmr %>%
112 | count(pwsid) %>%
113 | filter(n > 1) %>%
114 | nrow()
115 | cat(n, "duplicate pwsids in labeled data following fix.\n")
116 |
117 |
118 | # Write clean ucmr data to CSV
119 | path_out <- path(staging_path, "ucmr.csv")
120 | if(file_exists(path_out)) file_delete(path_out)
121 |
122 | write_csv(ucmr, path_out)
123 |
--------------------------------------------------------------------------------
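
The zipcode screen above flags any value that is not exactly five characters
or that contains a letter. On made-up values:

library(stringr)

z <- c("02903", "0290", "0290A", "029031")
nchar(z) != 5 | str_detect(z, "[:alpha:]")
#> [1] FALSE  TRUE  TRUE  TRUE   (all but the first are removed)
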
/wsb.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
--------------------------------------------------------------------------------