├── .Rprofile ├── .gitignore ├── CHANGELOG.md ├── README.md ├── crosswalks ├── ar_pwsid_lookup.csv ├── county_fips.csv ├── ri_pwsid_lookup.csv └── state_fips_to_abbr.csv ├── docs ├── contributing.md ├── credits.md ├── diagrams │ ├── contributed_pws.drawio │ ├── flow_diagram.drawio │ ├── flow_diagram_v2.drawio │ ├── match_diagrams.drawio │ └── sl-march-2020.excalidraw ├── frs │ └── facility-registry-service-best-pick-processing-v-2.0.pdf └── img │ ├── contributed_pws.png │ ├── data_flow_diagram.png │ ├── data_flow_diagram_v2.png │ ├── data_sources.png │ ├── epic_logo.png │ ├── mapping_diagram.png │ ├── matches.png │ ├── matching_diagram.png │ ├── simplelab_logo.png │ ├── spatial_assignment.png │ ├── stacked_match_report.png │ ├── temm-nation.png │ ├── tiers_diagram.png │ └── wadl_logo.jpg ├── etc └── wsb_labeled_simplified.rds ├── layers ├── epa_regions.csv └── us_states.geojson ├── renv.lock ├── renv ├── .gitignore ├── activate.R └── settings.dcf ├── requirements.txt ├── src ├── analysis │ ├── README.md │ └── sandbox │ │ ├── eda │ │ ├── eda_february.Rmd │ │ ├── explore_wsb_sdwis.py │ │ ├── multipolygon_pwsids_in_labeled_data.Rmd │ │ ├── multipolygon_pwsids_in_labeled_data.html │ │ └── wholesalers.Rmd │ │ ├── matching │ │ ├── match_reports.py │ │ └── stats.py │ │ ├── model_explore │ │ ├── .gitignore │ │ ├── 02_random_forest.R │ │ ├── 03_xgboost.R │ │ ├── README.md │ │ ├── archive │ │ │ ├── 01_preprocess.R │ │ │ └── 04_linear.R │ │ ├── etc │ │ │ ├── final_xgb.rds │ │ │ └── xgb_res.rds │ │ ├── model_march.Rmd │ │ └── model_march.html │ │ ├── report_review │ │ └── report_changes.Rmd │ │ └── sanity_checks │ │ └── 01_convex_hull.R ├── combine_tiers.py ├── downloaders │ ├── README.md │ ├── download_contributed_pws.R │ ├── download_echo.R │ ├── download_frs.R │ ├── download_helpers.py │ ├── download_mhp.R │ ├── download_sdwis.py │ ├── download_tigris_ne.R │ ├── download_ucmr.R │ └── states │ │ ├── download_ar_wsb.R │ │ ├── download_az_wsb.R │ │ ├── download_ct_wsb.R │ │ ├── download_il_wsb.R │ │ ├── download_ks_wsb.R │ │ ├── download_mo_wsb.R │ │ ├── download_nc_wsb.R │ │ ├── download_nj_wsb.R │ │ ├── download_nm_wsb.R │ │ ├── download_ok_wsb.R │ │ ├── download_pa_wsb.R │ │ ├── download_ri_wsb.R │ │ ├── download_state_helpers.R │ │ ├── download_ut_wsb.R │ │ └── download_wa_wsb.R ├── functions │ ├── f_clean_whitespace_nas.R │ └── f_drop_imposters.R ├── match │ ├── 0-init.py │ ├── 2-cleansing.py │ ├── 3-matching.py │ ├── 4-rank_boundary_matches.py │ ├── 5-select_modeled_centroids.py │ ├── helpers.py │ ├── init_model.sql │ ├── map_contributed.py │ ├── map_echo.py │ ├── map_frs.py │ ├── map_labeled.py │ ├── map_mhp.py │ ├── map_sdwis.py │ ├── map_tiger.py │ ├── map_ucmr.py │ ├── match_scorer.py │ └── readme.md ├── model │ ├── 01_preprocess.R │ ├── 02_linear.R │ └── README.md ├── run_pipeline.py └── transformers │ ├── README.md │ ├── states │ ├── transform_wsb_ar.R │ ├── transform_wsb_az.R │ ├── transform_wsb_ca.R │ ├── transform_wsb_ct.R │ ├── transform_wsb_il.R │ ├── transform_wsb_ks.R │ ├── transform_wsb_mo.R │ ├── transform_wsb_nc.R │ ├── transform_wsb_nj.R │ ├── transform_wsb_nm.R │ ├── transform_wsb_ok.R │ ├── transform_wsb_pa.R │ ├── transform_wsb_ri.R │ ├── transform_wsb_tx.R │ ├── transform_wsb_ut.R │ └── transform_wsb_wa.R │ ├── transform_contributed_pws.R │ ├── transform_echo.R │ ├── transform_frs.R │ ├── transform_labeled.R │ ├── transform_mhp.R │ ├── transform_sdwis_geo_areas.py │ ├── transform_sdwis_helpers.py │ ├── transform_sdwis_service.py │ ├── transform_sdwis_ws.py │ ├── 
transform_tigris_ne.R │ └── transform_ucmr.R └── wsb.Rproj /.Rprofile: -------------------------------------------------------------------------------- 1 | source("renv/activate.R") 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # produced vignettes 24 | vignettes/*.html 25 | vignettes/*.pdf 26 | 27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 28 | .httr-oauth 29 | 30 | # knitr and R markdown default cache directories 31 | *_cache/ 32 | /cache/ 33 | 34 | # Temporary files created by R markdown 35 | *.utf8.md 36 | *.knit.md 37 | 38 | # Environment Variables 39 | .Renviron 40 | .env 41 | 42 | # local paths to ignore: raw data and staging 43 | /data 44 | /staging 45 | /output 46 | /log 47 | 48 | # pesky OSX 49 | .DS_Store 50 | 51 | # virtual environment 52 | .venv 53 | 54 | # vs code 55 | .vscode 56 | 57 | # pycache 58 | __pycache__ 59 | 60 | # package 61 | package-lock.json 62 | package.json 63 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Water Service Boundaries - Change Log 2 | 3 | # 3.0.0 (2022-10-31) 4 | * Added manually-contributed systems from the Internet of Water's [Github](https://github.com/cgs-earth/ref_pws/raw/main/02_output/contributed_pws.gpkg) 5 | * Refactored to use geopackage through most of the pipeline instead of geojson 6 | * Added `geometry_source_detail` column, to document where the data provider got the geometries from 7 | 8 | # 2.4.0 (2022-09-27) 9 | * Added Arkansas labeled boundaries. The original data source did not have water system ids, but a match on names was fairly comprehensive. We supplemented it with ~40 manually looked-up water system ids for the remaining non-matches. There are still 12 systems with shapefiles in the underlying data for which we could not match any water system id. 10 | 11 | # 2.3.0 (2022-09-02) 12 | * Added Rhode Island labeled boundaries. The original data source did not include PWS IDs, so these were supplemented by manual effort from the EPIC team. 13 | 14 | # 2.2.0 (2022-08-23) 15 | * With version 2.0, we changed logic to eliminate Tier 2b, meaning only one PWS could "own" any particular TIGER place. This caused many PWS's that were formerly Tier 2b to fall back to Tier 3. In some cases, these relied on low-quality county or state centroids from ECHO, resulting in a less accurate map. In this release, we addressed this problem. For PWS's that (1) have a low-quality centroid and (2) have a matched TIGER boundary, but (3) were not selected as the "best" match for that boundary, we overwrite the centroid with a calculated centroid from the top-ranked matched boundary. 16 | * Refactored to preserve all "ranked" boundary matches, not just the "best" match. 17 | * Saved the final "master" records back to the database 18 | * Added "tier" column to the database 19 | 20 | 21 | # 2.1.0 (2022-08-09) 22 | * Improved logic for how "impostors" are calculated.
Here is a summary of impacts: 23 | 24 | | Category | Echo | FRS | Reason | 25 | |----------|-------|-----|-----------| 26 | | Rejected only before | 114 | 336 | 20 ECHO and 1 FRS are now allowed because they're within 50 km of the primacy agency's border. 326 FRS are no longer in the system at all, now that the ECHO's are coming through (the FRS mapping is unusual in that it doesn't load records if they're already coming through via ECHO, since FRS is largely duplicates of ECHO). 9 FRS were rejected for being tribal regions and not recognizing the primacy_agency_code as a state. 91 ECHO's had NULL state. | 27 | | Rejected both times | 6 | 26 | Legit impostors, identified both times. | 28 | | Rejected only after | 292 | 0 | These were previously allowed because the lat/long was consistent with the _address's state_, but not with the _primacy agency's_. In the new logic, we allow a system to be outside of the primacy_agency state, but not farther than 50 km away. | 29 | 30 | 31 | # 2.0.0 (2022-07-01) 32 | * No longer dropping any PWS's (but some results have tier "none", indicating no geometry) 33 | * Added Utah and Illinois labeled boundaries 34 | * Eliminated Tier 2b by implementing ranking and selection of the best PWS per TIGER place. Roughly 3,000 became Tier 2a; the remaining ~7,000 became Tier 3 35 | * Renamed some columns: 36 | * geometry_lat -> centroid_lat 37 | * geometry_lon -> centroid_lon 38 | * geometry_quality -> centroid_quality 39 | * tiger_geoid -> matched_bound_geoid 40 | * tiger_name -> matched_bound_name 41 | * Cleaned up column names in the shapefile 42 | * Improved matching to MHPs, and prevented MHP's from matching to TIGER places 43 | * Pulled in population data for TIGER places, to help deduplicate matches 44 | * Misc bugfixes and performance improvements 45 | 46 | | Tier 1 | Tier 2a | Tier 2b | Tier 3 | None | Total | 47 | |---------|---------|----------|---------|--------|--------| 48 | | 16,896 | 11,526 | 0 | 17,526 | 3,476 | 49,424 | 49 | 50 | 51 | # 1.0.0 (2022-05-02) 52 | Initial release 53 | 54 | | Tier 1 | Tier 2a | Tier 2b | Tier 3 | None | Total | 55 | |---------|---------|----------|---------|-------|---------| 56 | | 14,607 | 9,488 | 10,104 | 10,720 | 0 | 44,919* | 57 | 58 | *Note: 4,505 systems were dropped due to missing geometry or not falling within the 50 US states.
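As an illustration of the distance-based impostor check described in 2.1.0, here is a minimal sketch in R with `sf`. The object and column names (`systems`, `states`, `state_abbr`) are hypothetical, and this does not reproduce the actual implementation in `src/functions/f_drop_imposters.R`:

```r
library(sf)
library(dplyr)

# Flag systems located more than `max_km` outside the border of their
# primacy agency's state. `systems` is an sf point layer with a
# primacy_agency_code column; `states` is an sf polygon layer with a
# state_abbr column, in the same CRS.
flag_imposters <- function(systems, states, max_km = 50) {

  # align one state polygon to each system; codes with no state polygon
  # (e.g. tribal primacy codes) stay NA and need separate handling
  idx <- match(systems$primacy_agency_code, states$state_abbr)
  ok  <- !is.na(idx)

  # st_distance() is 0 for a point inside its polygon, so this measures
  # how far *outside* the primacy state each point sits
  dist_m <- rep(NA_real_, nrow(systems))
  dist_m[ok] <- as.numeric(
    st_distance(systems[ok, ], states[idx[ok], ], by_element = TRUE)
  )

  mutate(systems, imposter = !is.na(dist_m) & dist_m > max_km * 1000)
}
```

Under this rule, a point anywhere inside its primacy state has distance 0 and always passes; the 50 km tolerance only comes into play for points beyond the state border.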
59 | -------------------------------------------------------------------------------- /crosswalks/ar_pwsid_lookup.csv: -------------------------------------------------------------------------------- 1 | pws_name,pwsid 2 | SHADY ACRES MOBILE HOME PARK, 3 | AURELLE WATER SYSTEM, 4 | COMMUNITY WATER ASSOCIATION (SALESVILLE),AR0000036 5 | BRADLEY CO RURAL WATER ASSN,AR0000054 6 | FOUR MILE HILL WATER ASSOC,AR0000586 7 | SOUTHWEST ARKANSAS WATER SYST,AR0000889 8 | "PARAGOULD, LIGHT, WATER, & CABLE",AR0000222 9 | CHIDESTER WATERWORKS,AR0000403 10 | CLARK CO COUNTRY WATER FACILIT,AR0000741 11 | MCGEHEE WATERWORKS,AR0000170 12 | STAR CITY WATER WATERWORKS,AR0000318 13 | SHIRLEY WATERWORKS, 14 | PARON-OWENSVILLE WATER AUTH, 15 | LIBERTY - WOODSON / HENSLEY,AR0000471 16 | NAIL SWAIN WATER ASSOCIATION,AR0000856 17 | RAMBO WATER DISTRICT #1 INC, 18 | FURLOW WATER ASSOCIATION,AR0000645 19 | PARKIN RURAL WATER ASSOCIATION,AR0000662 20 | MENA WATER DEPT,AR0000438 21 | BASSETT WATERWORKS,AR0000377 22 | SANDRIDGE-BARDSTOWN WTR ASSOC, 23 | O'KEAN WATERWORKS,AR0000473 24 | BEAVERFORK VOLUNTEER FD WSD,AR0000844 25 | HOT SPRINGS VILLAGE WATER,AR0000208 26 | CENTRAL PUBLIC WATER AUTHORITY, 27 | OLD UNION WATER ASSOCIATION,AR0000559 28 | ODEN- PENCIL BLUFF WATER ASSN.,AR0000616 29 | VANDERVOORT WATERWORKS, 30 | OSAGE POINT MHP, 31 | CLARKSVILLE WATERWORKS,AR0000289 32 | BOIS D'ARC WATER SYSTEM,AR0000178 33 | WIEDERKEHR VILLAGE WATER DEPT, 34 | LAFE REGIONAL WATER DISTRIBUTION DISTRICT,AR0000483 35 | EAST PRAIRIE CNTY PUBLIC WATER AUTHORITY,AR0000458 36 | KINGWOOD MHP, 37 | FRANKLIN-SEBATIAN PWA,AR0001077 38 | BELLA VISTA P.O.A.,AR0000039 39 | RIDGEFIELD ESTATES,AR0000776 40 | JAMES FORK REGIONAL WATER,AR0000513 41 | SOUTH PIKE CO. WATER,AR0000978 42 | MID-ARKANSAS UTILITIES PWA,AR0000725 43 | MONTROSE / BOYDELL WATER SYSTEM,AR0000014 44 | OZAN CREEK RURAL WATER SYSTEM,AR0001078 45 | LAKEVIEW MIDWAY PUBLIC WATER AUTHORITY,AR0000027 46 | CONCORD WATER & SEWER PFB,AR0000147 47 | LONOKE WHITE PUBLIC WATER AUTH,AR0001076 48 | BAXTER-MARION REGIONAL WATER ASSOCIATION,AR0001178 49 | BEDFORD FALLS MHP, 50 | HIGHLAND PUBLIC WATER AUTHORITY,AR0000672 -------------------------------------------------------------------------------- /crosswalks/ri_pwsid_lookup.csv: -------------------------------------------------------------------------------- 1 | ID,H20_DISTRI,NAME,POP_SERVED,PWSID,pws_name,Notes 2 | 1,Block Island Water Department,BLOCK ISLAND,n/a,RI1858430,BLOCK ISLAND WATER COMPANY, 3 | 3,Cumberland Water District,CUMBERLAND,"28,586 (1995)",RI1647530,"CUMBERLAND, TOWN OF", 4 | 4,East Providence Water District,EAST PROVIDENCE,"50,857 (1992)",RI1615610,EAST PROVIDENCE-CITY OF, 5 | 5,East Smithfield Water District,NORTH PROVIDENCE,7450 (1992),RI1592024,PROVIDENCE-CITY OF,Became part of Providence in 2017 6 | 6,Greenville Water District,SMITHFIELD,8100 (1998),RI1858410,GREENVILLE WATER DISTRICT, 7 | 7,Jamestown Water District,JAMESTOWN,5339 (2000),RI1858419,JAMESTOWN WATER DEPARTMENT, 8 | 8,Johnston Water Department,JOHNSTON,4965 (1999),RI1592024,PROVIDENCE-CITY OF,Became part of providence in 2021 9 | 9,Kent County Water Authority,EAST GREENWICH,"63,706 (1993)",RI1559511,KENT COUNTY WATER AUTHORITY,*Their spreadsheet says RI1592021 10 | 10,Kingston Water District,SOUTH KINGSTOWN,3800 (1999),RI1858421,KINGSTON WATER DISTRICT, 11 | 11,Narragansett Water -- North End System,NARRAGANSETT,"12,389 (1997)",RI1858429,NARRAGANSETT WATER DEPT-NORTH END, 12 | 12,Narragansett Water -- South End System,NARRAGANSETT,"12,389 
(1997)",RI1858428,NARRAGANSETT WATER SYSTEM-POINT JUDITH, 13 | 13,Newport Water District,MIDDLETOWN,"43,825 (1992)",RI1592010,NEWPORT-CITY OF, 14 | 14,North Kingstown Water District,NORTH KINGSTOWN,"26,821 (2000)",RI1559517,NORTH KINGSTOWN TOWN OF, 15 | 15,North Tiverton Water District,TIVERTON,8587 (1990),RI1592018,NORTH TIVERTON FIRE DISTRICT, 16 | 16,Pascoag Water and Fire District,BURRILLVILLE,3900 (2000),RI1592020,"PASCOAG UTILITY DISTRICT, WATER DIVISION", 17 | 17,Pawtucket Water Supply Board,CENTRAL FALLS,"109,042 (2000)",RI1592021,PAWTUCKET-CITY OF, 18 | 18,Portsmouth Water and Fire District,PORTSMOUTH,"15,797 (1994)",RI1592022,PORTSMOUTH WATER & FIRE DISTRICT, 19 | 19,Providence Water Supply Board,PROVIDENCE,"267,157 (1992)",RI1592024,PROVIDENCE-CITY OF, 20 | 20,RIEDC Water District,NORTH KINGSTOWN,5000 (1999),RI1559517,NORTH KINGSTOWN TOWN OF,*Might be covered by RI1559517 or RI1559511 21 | 21,Richmond Water District,RICHMOND,n/a,RI1000040,"RICHMOND, TOWN OF","*Probably RI1000040, but could be RI2980480, RI1647529, RI2980447 or any combination" 22 | 22,Smithfield Water Supply Board,SMITHFIELD,8900 (1998),RI1615616,SMITHFIELD WATER SUPPLY BOARD, 23 | 23,South Kingstown WD(Middlebridge W.System,SOUTH KINGSTOWN,3888 (1992),RI1000015,SOUTH KINGSTOWN-MIDDLEBRIDGE, 24 | 24,South Kingstown WD(South Shore W. System,SOUTH KINGSTOWN,3888 (1992),RI1615623,SOUTH KINGSTOWN-SOUTH SHORE, 25 | 25,Stone Bridge Fire District,TIVERTON,2125 (1993),RI1615619,STONE BRIDGE FIRE DISTRICT, 26 | 26,Tiverton Water District,TIVERTON,n/a,RI1900042,"TIVERTON WATER AUTHORITY, TOWN HALL","*Might be any combination of RI2980001, RI2051311, RI1900042, RI2980003 " 27 | 27,University of Rhode Island Water Facilit,SOUTH KINGSTOWN (URI),5000 (1999),RI1858422,UNIVERSITY OF RHODE ISLAND, 28 | 28,Warwick Water Department,WARWICK,"75,107 (1990)",RI1615627,WARWICK-CITY OF, 29 | 29,Woonsocket Water Division,WOONSOCKET & N. 
SMITHFIELD,n/a,RI1559518,WOONSOCKET WATER DIVISION,*Spreadsheet says RI1559512 30 | 30,Harrisville Water and Fire District,BURRILLVILLE,2637 (2000),RI1858411,HARRISVILLE FIRE DISTRICT, 31 | 31,Lincoln Water Commission,LINCOLN,"18,301 (1992)",RI1858423,LINCOLN WATER COMMISSION, 32 | 32,Westerly Water Supply System,WESTERLY,"26,842 (1993)",RI1559512,WESTERLY WATER DEPARTMENT, 33 | 33,North Smithfield Water Department,NORTH SMITHFIELD,n/a,RI1615614,SLATERSVILLE PUBLIC SUPPLY,Slatersville according to system website 34 | 34,United Water Rhode Island,SOUTH KINGSTOWN/NARRAGANSETT,"16,700 (1992)",RI1615624,VEOLIA WATER WAKEFIELD RHODE ISLAND INC, 35 | 55,Bristol County Water District,BARRINGTON,"48,853(1993)",RI1647515,BRISTOL COUNTY WATER AUTHORITY, -------------------------------------------------------------------------------- /crosswalks/state_fips_to_abbr.csv: -------------------------------------------------------------------------------- 1 | state,code 2 | AK,02 3 | AL,01 4 | AR,05 5 | AS,60 6 | AZ,04 7 | CA,06 8 | CO,08 9 | CT,09 10 | DC,11 11 | DE,10 12 | FL,12 13 | GA,13 14 | GU,66 15 | HI,15 16 | IA,19 17 | ID,16 18 | IL,17 19 | IN,18 20 | KS,20 21 | KY,21 22 | LA,22 23 | MA,25 24 | MD,24 25 | ME,23 26 | MI,26 27 | MN,27 28 | MO,29 29 | MS,28 30 | MT,30 31 | NC,37 32 | ND,38 33 | NE,31 34 | NH,33 35 | NJ,34 36 | NM,35 37 | NV,32 38 | NY,36 39 | OH,39 40 | OK,40 41 | OR,41 42 | PA,42 43 | PR,72 44 | RI,44 45 | SC,45 46 | SD,46 47 | TN,47 48 | TX,48 49 | UT,49 50 | VA,51 51 | VI,78 52 | VT,50 53 | WA,53 54 | WI,55 55 | WV,54 56 | WY,56 57 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributor Guidelines 2 | 3 | This document is meant as a primer on contributing to this repository. We strive to uphold a high standard of open-source collaboration. Please read this document before contributing, and read Github's [code of conduct](https://github.com/github/docs/blob/a980e8037dfca61dea25796f542ea5c0fad93ee9/CODE_OF_CONDUCT.md) for best practices on interacting with others through the repository. 4 | 5 | ## Types of Contributions 6 | 7 | ### Issue 8 | 9 | Issues enable task-tracking. If you identify something that is incorrect or doesn't run, you can create an issue that is clearly labeled. If you want to work on a contribution or refactor via a pull request, you should first create an issue that outlines the problem you are working on and obtain a green light from the project maintainer(s). 10 | 11 | ### Pull Request 12 | 13 | Pull requests allow users to suggest a change to the repository. Pull requests will only be reviewed and incorporated if they were first agreed upon in an Issue by the project maintainer(s). Furthermore, all PRs should link to the issue(s) closed by the PR for clarity. 14 | 15 | ## How to Contribute 16 | 17 | ### Issue 18 | 19 | #### Creating an Issue 20 | 21 | Create a **new issue** if you find a problem you want to fix, identify an error in the code, or propose a refactor. Be sure to review existing issues before posting to ensure you do not double-post. Follow these steps to post an issue: 22 | 23 | - Clearly label the new issue; e.g., if you have an issue about the transformer for AZ, label the issue accordingly: `transformer/az {descriptive message}`. 24 | - Use "labels" (e.g., "transformer", "downloader", "analysis", "bugfix", etc.)
to clearly mark what part of the pipeline your issue addresses. 25 | - Include a clear description of the issue. 26 | 27 | For issues concerning larger refactors or feature developments, indicate that the issue needs review as a proposal and be sure to include your proposed plan. 28 | 29 | Issues should not be used to make feature requests. 30 | 31 | #### Solving an Issue 32 | 33 | If you review existing issues and find one you would like to solve, assign yourself the issue and propose the solution through a pull request linked to that issue. 34 | 35 | ### Pull Requests 36 | 37 | If you are solving an existing issue or working on an issue you created, always code on a branch from `develop`. Once you have staged and committed your code to the branch, you can create a pull request. You may also create a pull request from a forked repository. 38 | 39 | Please follow the pull request instructions from Creative Commons, [here](https://opensource.creativecommons.org/contributing-code/pr-guidelines/). 40 | 41 | ### Forking 42 | 43 | If you wish to manage your own version of this repository, you can fork it. Learn more about forking [here](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/about-forks). 44 | -------------------------------------------------------------------------------- /docs/credits.md: -------------------------------------------------------------------------------- 1 | # Credits 2 | 3 | The repository and code herein were organized, directed, and developed by SimpleLab, Inc. This repository is MIT-licensed, so the code and data herein can be reused and repurposed. 4 | 5 | 6 | 7 | [SimpleLab website](https://www.gosimplelab.com) 8 | 9 | ## Collaboration 10 | 11 | Water Data Lab contributed technical code and methods development for the initial build of this repository and the TEMM methodology. 12 | 13 | 14 | 15 | [WaDL website](https://www.waterdatalab.com/) 16 | 17 | Environmental Policy Innovation Center (EPIC) financed and supported engagement with the initial development of this repository from February to April 2022 as part of their efforts with the Justice40 Initiative. 18 | 19 | 20 | 21 | 22 | [EPIC website](https://www.policyinnovation.org/) 23 | 24 | Internet of Water (IoW) provided technical advising and feedback on the approach and is collaborating as part of the broader effort to expand the use and improvement of water service boundaries. 25 | 26 | 27 | 28 | 29 | [IoW](https://internetofwater.org/) 30 | 31 | For more information about this project, please contact Jess Goddard at \.
32 | -------------------------------------------------------------------------------- /docs/diagrams/contributed_pws.drawio: -------------------------------------------------------------------------------- 1 | 5Vrbkto4EP2WfaBq98FTvmMehyG33ewm2dlkkqct2RagGttyZDFAvj4tWzaWbcBMMOwFqsBuyS2p+/RRt2Bk3cWbVwyly99piKORqYebkTUbmaZh6C58Ccm2kHiOWQgWjISy005wT75hKdSldEVCnCkdOaURJ6kqDGiS4IArMsQYXavd5jRSR03RArcE9wGK2tIHEvKlXIU53slfY7JYliMb7qRoiVHZWa4kW6KQrmsi68XIumOU8uIq3tzhSBivtMvDm+1D9PbRffXrh+wr+jj97a8/PmmFspenPFItgeGEP1u19wGZL8w7bbX6qnkP8d/Lb7O0VP2EopW0l1wr35YGxEl4K/wAd0GEsowEI2u65HEEAgMuGV0lIRaD6HCHN4R/li3i+ouQ3zjybrapdZtt5U0xJA5bjjyyXqNyAqAX0xhztoXn1js3O9J1y5qHSxnDEeLkSR0TSbQtKnXVCO8pgdmYehkZrtQj48K0JqqKjK5YgOVTdb80FFX42qeII7bAvKUILmrL3olyt58AAbMDAm4ExppmKUoULLhfVwLs04BGlI2sW2HFhf8zzB+G1suvX3KXQqQmXJujmETboutrHD1hTgJUa89yvhCthplu6g3FoKIloSxGUa3tCTGC4DsiC8RXTLDLwX4BSvd1WUtgiEZbL8CoR5hzzDRYfkCSRftJytIlSqRKs5BxvOEaghklhTgA6GJWayMQJIkcSS+XmrdwBsrmoL8cKcFFKwA5p7/aMGvKQnVilS5Yi/9IQJ3QmXFGH7EmQ0Hp56PgcZEHrdbwo2l7hQtNeyIvnNKbudYQB5RB0NBE40sSPCY4k9MjCeGktE+zb82XB/vVpqP0m0cU8aZxQpKlEdqW3SMCDab+E4lTyjgSlp7uAAtXC/H9JgnJEwlXKIq2MFzCGfFXHMjL1N8/3MPn/RKleE4iwJQMAoipIg4KDS16zMQTcBnSYBXnHp6ul4Tje3CSkK9hW1UJ0y8I861fCmC86K5YO2i05o54CzlMsCYvXiIuc+fWWtz8Vc3uCTMw7Vm4VGppcl25udeo1tA7uNbT99OqwmOnkpZhtVjrzbsHUBUsMg0jBrMy9T9xSjPCqRxP8dqaxBHKsdSw/sv8NaAtm3xvTDqM6XYYs7Lw+a1pn74HtINLg36fgALDPJpHYpkuigX4i0/RDMwqmKtsSPysaNkTa4XYZz1n8MxJzxBHeXcgIshBega+YC81sCX9zyruF+CB/S66lQ0xCUPx+BT2LPIN+bkqsemkYnPPvepMR85M6FpxWmyOuWo15CUL1pFbEWMP0B6IqL2oNVTQ2h2YtTswaw8GWbcF2Xvg/gi/RT4oXGf+P5YAbKcHAYwvSwDjcxHAjK4T2K7DIn/YpTb5bT2gi0cCGvv5zn1VArhjGHGRPsxJgoSxYtiz/6c04J5CA1ZPGnAHA67XAdyLF7BgUrb9XL+pPSVud4/ld2cpfGGNeYXZI0sqKshneP1CpfS4kV7azy2lrSOKhi6l9aHQqJ+AxhBly1yBoUBzh8Yv9bZrQVPvC809eclloGlNVES5zS24LzQtt5lANRQNDc09BVOx1el5vv56GzIK9SzD7XTp7FVu6GAvtLuqWc/0rUGrWbORgFldCdhFy1mzqwA7D3FUZFFF/Y8Rh35t4ui7p5lXJQ574qjxbjyXOFopl3VZ4mgXWrdhSESJn6fKKaMBBjSKql6HVB8+VxnWApS1WaSdK/dNaPvn1F20pIbGYLTSyD6sjuy461zHGopVrLbrWi4RJ8HpXovIH+VkiTKqjq5+tAL2XNVS5a9xNUt1mmrAQqKdur1LcaJJPhL7ZCaQZ+oR2ubfl98y5/O5GQRdW2bo+q7Te8s8AJaDdHnZPdHqOpM4O3pPsoWEa0e60IlWQ/dubNdwLE9+jocyVTu5u5ipxgdNZXVb4JjlxoOBqh3mH6vdSr+9QBCfK+89AIS9vKt17EiTDvPb7s3EHMoD7b8c1Dxwid/XrukB7/oOMNu82jehy1re+RdldCfRV/X3J1tJVLQ2f3ldacpg9DU5TvSn1YlqzbcrGm3lwMkzjxyAws17zAisUviycSh6Y+c/W9cORnXj2PlTftdUOTpeWx5MhM93KmVd9b9HzlhBpddQ0Le0NEwV3cb4wqWlMyyahzvBOIiy4+DRrwoe1emu9UzwQFjr9ZepYsmxbgxn7E3sseV6dnOaAyPLbqd5P4is/+55Wl/UGldGbeOnnUljj+2L26Yiq6no2dCE292fgYvuu79UWy++Aw== -------------------------------------------------------------------------------- /docs/diagrams/flow_diagram_v2.drawio: -------------------------------------------------------------------------------- 1 | 
7V1bc6M4Fv41qZp5aApduD0m7stsVc9OajI7mdmXLRlkm24MbpCTeH79CnMxSMKWEyBObLqqg4WQzPnOXUf4Ck2WT19Sslr8mgQ0uoJm8HSFPl5BCDC0+Z+8ZVO0eKZVNMzTMCg77Rruwn9o2WiWreswoFmrI0uSiIWrdqOfxDH1WauNpGny2O42S6L2rCsyp1LDnU8iufU+DNiiaHUtc9f+Cw3ni2pmYJZXlqTqXDZkCxIkj40m9OkKTdIkYcXZ8mlCo5x4FV2K+z53XK2/WEpjpnND5jAn/uX2z6X3Hf5IYIym998+uG4xzAOJ1uUTl9+WbSoSpMk6Dmg+inmFbh4XIaN3K+LnVx856LxtwZYR/wT4aUCyxbZv/mGWxOwzWYZRDv4kWachTfnY/6aP5cVJEiXpdhZkbw/enrE0+V4TG+Y9wyiqesZJTOtOjdst07WcG35FJktJqQeaMvrUaCrJ9IUmS8rSDe9SXsV2yaMlz6ISwccdAwCr7LJogu9hA5UEJSXfzevRd9DwkxKdI5ACSIGUHfGpb0J+Ms9P7hhh9MMqTR7CHDBoRmRKo+1ZQBip+vP561sktDmNWBvSNq1LABSYkCicx/yjzynPcUY3OcVDLkrX5YVlGAT5NEoeanNZF3P0gG4luiW4AEIDeBLAjgJfbA2FrXNYCrn6WOWn/iYKOaFSdFgWpwVJv07rBuJ/n28J/dua8WFohXChdoElAMspP5vNoO+rJC6wp7bVEybQbIMCsSxyrqmQOHMwSFSKURC338ljLmMJZ/Hq2jStLnI60myTMbp8Q2LXC5Rt5cltpSxcCigrJds7ksg+jOR17HPCku1zFJrS/Ikac+MK8jnNCY2zdcZPbiNOu/yExMH2/w1bhPE890kWhOUtYZBfnlHC1inlZzSeczGjadGruG1ZOknTdRgF/MLPIzFIRGfsZLUysFCLbTC2JbaptcQofIPhYaVM4+A6dzJz8YtIloV+GyJN6aJBywmVidQggqWgQdWW0oiw8KHtuqoIU85wm4Qxa4iuBwTRFWibcRfOp+VdTVdTGAgdGoiRdE6ZNNAWp/qxXwCdylc6L+g8z7B6Ak8eamj48PnB1/Z/EHy25B0YaGjorE5jO92ZtqIhtya5sSwMDm/9sc5j4Zt2oFg3V7f/kZI4myXpkqbZ1j6LnleW+vy7s1Y3sU/D5BZfQ2qenrirJthikx+TSU8uHBRCpEZc25AHe1RrrPLiTiNECizqBlgVIrlwivpykPDJhUhYFbXKGYk4IGnAqRfIUliGThxDMi885HONmvCpRU1YIzH4vmwwFhJDEm11bTB2RfWpZ4M5Lcmm0W2Vd8j0v3A1zw72YsReDbx1dq41RgPxhQ01/eqjGQN1TDQsZ6hcvw4LvUoTn2aZhn3uMMf7VxaEFKbrU3UKc+pauRD1o8EFz1thnvEeJu1dgVsaOeV3JqgCAi+IgW1BVJE5bhhldWefXx5GNTigaszH+FC4uNe8A4CrJznw+pUwf3FVZTGrpd7u2OoSbekpDssx2s4fkFUHGNX5szx9VT5ysEUBD7cclTL3bAeRnoItW1Qlrx5sVcH2vmDrNk2etuqWsK3cV4GUuSU84dogO6+oyvaEqMqTYRxVsGygQPFdG2UbCL7oM51n2xaN+zBBlTiPPUJMZZ9dTCVyBfCe66iJwdlQbIHUX3hQvnC6k+m18r6vdD1NH0Kf7pR9/nQR2WwvPoZscSVXA5k//VE4iODnYqF7mXt4qg6w7NBYwFZ1Q5el7JxXcNvq1Hma1lK2or5oMLMDuksg+lqV+Zg8xlFCgv2LMkGz1ytHCQGdkfV25LcSKLTXZR1FlAAUKhsPxVWehjNzRgkfYCoLCUdN+njwAkkTEluWkXHxODvPDmPBIettFUUcqK9kOVbPM6hn52msZr8vvpA8/ufyhehdDebxC/OMwxcaS+qFJ7UskrFvJpszZEE/gi2kqrqE10ryeBql436ynOaWFJosbDjDFyBbIufK5ntcJFV5cLU4FqshFxQVKHZssBkTSGCqYpV+I+CCEepSpssa2FGhrbDeqkiZDFVuCKb25vMsW7nLL3dfrfX//vvPj/iDht+eh0Srjget96OSadXd7CBTdxpJcJgUyrB2olprUMNJ0WGi8FHCVdbFTw0GJtmq2K47C59yDpMCRiv/J3NdcagCyX06bB/E2kzqGgDKVXmoruJpMaZU29MbChoh/pmgUPl7ruGcAC46of5Z4eLVdQ2vCctxdayl2TwmzN5DvROJvRsCUifGDPjMFTdgWoYtIA0AqmcYqUJKo0Ll/GC13zamlRN4AfUQEG8NWB2f5aXpztPHVhcNbXTFooRXQFbeqjm5lrA9jfj1OJ7R8Hwq/9MwbUVg0JDkJi/x7kgEqb8ITa4Y+fPc4LBMw3FVkVoXIK6Bh0PElYNmwzDODBJlJGBBAyKLS45ZHraMDTINbLt9gKN8EI195kNneYDVrupQZb6AqtRYfKVCf/mFI4q/g8RfL7dPenTtt7iJllB3piwQsH2XTmf63Ik70OrW39jAzQPJbx6xsII3LcNqHvZAeCANr/h94FHXWrYk4tThOaIK6l3AA6pXOr4RfMYowdlD2BOJPaAjmJm91Ncvv27fN3C0gcZ4MczpYym+K6snMEH1GsKx0NTYMHwGaIpv7OkHTM2qqN6w1Chc+nx794YqJIYyn1jY7IlNw20dkvXEqgWTsaynRi3T3cf7f12Q5cZQ3Mer4RmpsIW2YbcOPBS4GuVNlPnbfMXZowuQebrwKh9Rw7wOXsNiicZNzgMpt7YPlo+zX8uFtE7K6XC5GXKwtzue52V0DLOpOR07puW6HHfoYQAEKzWwT+LAw1CfcmReya92Igt5RovcDpaErco2Cr5E0d0u0RrKmXA0QvH3AUhVCusJJuPkEdLQju8KIQC8twWQxsvMs8Xq/flrRyOLDWGlvQujprumLk4bDVyNIJpP9i1L4gvA0EZipYyG8KoRPuSo9Iewztag7OGCLgQQSQJ8SvAqn9KWQMrd/bvyY5KyRTJPYhJ92rUK5Nz1+ZokqxLXb5SxTfnTWGTNkjbq9ClkfzXO/26cf3wqx91+2LQwEyIR5Q6OIg7Yw9BF3k9G8aWrFcJ+ECzq3P5CByWQGor4PyyMQrY5g19bUmQG9DMAYgJAUak/1E8tKZGFGqvpRVZkz+O/PDFiCW6n6ifGkMrR7KVQXkkZMOb2nsn2kFlPewPDIabsxr4TE2/0/T1qHI7LUOlVQe9X9scQa6x3ufa6ZcGW627l4Qa2KYrazH24amce3xq0mmhogzvcL5yoYZRd+Nv7Ow3D3r/X3acOrFK2hgkb6VwHyu9SGLRuk3/c/SJqAdnud2XRp/8D -------------------------------------------------------------------------------- /docs/frs/facility-registry-service-best-pick-processing-v-2.0.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/frs/facility-registry-service-best-pick-processing-v-2.0.pdf -------------------------------------------------------------------------------- /docs/img/contributed_pws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/contributed_pws.png -------------------------------------------------------------------------------- /docs/img/data_flow_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/data_flow_diagram.png -------------------------------------------------------------------------------- /docs/img/data_flow_diagram_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/data_flow_diagram_v2.png -------------------------------------------------------------------------------- /docs/img/data_sources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/data_sources.png -------------------------------------------------------------------------------- /docs/img/epic_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/epic_logo.png -------------------------------------------------------------------------------- /docs/img/mapping_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/mapping_diagram.png -------------------------------------------------------------------------------- /docs/img/matches.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/matches.png -------------------------------------------------------------------------------- /docs/img/matching_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/matching_diagram.png -------------------------------------------------------------------------------- /docs/img/simplelab_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/simplelab_logo.png -------------------------------------------------------------------------------- /docs/img/spatial_assignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/spatial_assignment.png -------------------------------------------------------------------------------- /docs/img/stacked_match_report.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/stacked_match_report.png -------------------------------------------------------------------------------- /docs/img/temm-nation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/temm-nation.png -------------------------------------------------------------------------------- /docs/img/tiers_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/tiers_diagram.png -------------------------------------------------------------------------------- /docs/img/wadl_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/docs/img/wadl_logo.jpg -------------------------------------------------------------------------------- /etc/wsb_labeled_simplified.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/etc/wsb_labeled_simplified.rds -------------------------------------------------------------------------------- /layers/epa_regions.csv: -------------------------------------------------------------------------------- 1 | region,state 2 | 1,CT 3 | 1,ME 4 | 1,MA 5 | 1,NH 6 | 1,RI 7 | 1,VT 8 | 2,NJ 9 | 2,NY 10 | 2,PR 11 | 2,VI 12 | 3,DE 13 | 3,DC 14 | 3,MD 15 | 3,PA 16 | 3,VA 17 | 3,WV 18 | 4,AL 19 | 4,FL 20 | 4,GA 21 | 4,KY 22 | 4,MS 23 | 4,NC 24 | 4,SC 25 | 4,TN 26 | 5,IL 27 | 5,IN 28 | 5,MI 29 | 5,MN 30 | 5,OH 31 | 5,WI 32 | 6,AR 33 | 6,LA 34 | 6,NM 35 | 6,OK 36 | 6,TX 37 | 7,IA 38 | 7,KS 39 | 7,MO 40 | 7,NE 41 | 8,CO 42 | 8,MT 43 | 8,ND 44 | 8,SD 45 | 8,UT 46 | 8,WY 47 | 9,AZ 48 | 9,CA 49 | 9,HI 50 | 9,NV 51 | 9,AS 52 | 9,MP 53 | 9,FM 54 | 9,GU 55 | 9,MH 56 | 9,PW 57 | 10,AK 58 | 10,ID 59 | 10,OR 60 | 10,WA 61 | -------------------------------------------------------------------------------- /renv/.gitignore: -------------------------------------------------------------------------------- 1 | library/ 2 | local/ 3 | cellar/ 4 | lock/ 5 | python/ 6 | staging/ 7 | -------------------------------------------------------------------------------- /renv/settings.dcf: -------------------------------------------------------------------------------- 1 | bioconductor.version: 2 | external.libraries: 3 | ignored.packages: 4 | package.dependency.fields: Imports, Depends, LinkingTo 5 | r.version: 6 | snapshot.type: implicit 7 | use.cache: TRUE 8 | vcs.ignore.cellar: TRUE 9 | vcs.ignore.library: TRUE 10 | vcs.ignore.local: TRUE 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.2 2 | pandas==1.4.1 3 | python-dotenv==0.19.2 4 | openpyxl==3.0.9 5 | sqlalchemy==1.4.31 6 | psycopg2==2.9.3 7 | geoalchemy2==0.6.3 8 | tabulate==0.8.9 9 | 10 | # Optional 11 | ipykernel==6.9.0 12 | -------------------------------------------------------------------------------- /src/analysis/README.md: -------------------------------------------------------------------------------- 1 | # 
Sandbox 2 | 3 | The sandbox houses EDA, sanity checks, feature engineering experiments, and other ad hoc analysis that should remain separate from the pipeline in `/src`. -------------------------------------------------------------------------------- /src/analysis/sandbox/eda/explore_wsb_sdwis.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code explores the relationship between the state WSB data and the SDWIS data. 3 | 4 | Updated 3/21/22 5 | 6 | Make a dataframe that: 7 | - compares percentages of pwsid matching between WSB and SDWIS data 8 | - displays pwsid duplicate counts for states with staged WSB data 9 | """ 10 | 11 | #%% 12 | 13 | import geopandas as gpd 14 | import pandas as pd 15 | import os 16 | from dotenv import load_dotenv 17 | import re 18 | 19 | 20 | # File path and data import 21 | load_dotenv() 22 | 23 | staging_path = os.environ["WSB_STAGING_PATH"] 24 | 25 | # Helper: Divides and returns a percent 26 | 27 | def get_pc(num, denom): 28 | return round((num/denom)*100, 1) 29 | 30 | #%% get list of paths/filenames for staged state wsb data 31 | staging_file_list = [file for file in os.listdir(staging_path) if re.search(r"wsb_labeled_\w\w\.gpkg", file)] 32 | num_states = len(staging_file_list) 33 | 34 | #%% read in sdwis data 35 | sdwis = pd.read_csv(os.path.join(staging_path, 'sdwis_water_system.csv')) 36 | 37 | # filter for active community water systems (reduces by 90%) 38 | sdwis = sdwis[(sdwis['pws_activity_code'] == 'A') & 39 | (sdwis['pws_type_code'] == 'CWS')] 40 | 41 | #%% compare wsb staging data with sdwis 42 | nested_list = [] 43 | 44 | for i, staging_file in enumerate(staging_file_list): 45 | print(f'\rComparing WSB and SDWIS data for state {i+1}/{num_states}...', end='') 46 | 47 | # read in staged state wsb data 48 | # extract the state code from the filename, then select it from sdwis data 49 | state_wsb = gpd.read_file(os.path.join(staging_path, staging_file)) 50 | state = re.search(r"wsb_labeled_(\w\w)\.gpkg", staging_file).group(1).upper() 51 | state_sdwis = sdwis[sdwis['primacy_agency_code'] == state] 52 | 53 | # df id columns 54 | id_wsb = state_wsb['pwsid'] 55 | id_sdwis = state_sdwis['pwsid'] 56 | 57 | # df lengths 58 | len_wsb = len(state_wsb) 59 | len_sdwis = len(state_sdwis) 60 | 61 | # wsb id % matching to sdwis id 62 | wsb_matching_to_sdwis = len(state_wsb[state_wsb['pwsid'].isin(id_sdwis)]) 63 | 64 | # sdwis id % matching to wsb id 65 | sdwis_matching_to_wsb = len(state_sdwis[state_sdwis['pwsid'].isin(id_wsb)]) 66 | 67 | nested_list.append([state, 68 | get_pc(wsb_matching_to_sdwis, len_wsb), 69 | get_pc(sdwis_matching_to_wsb, len_sdwis), 70 | get_pc(len_wsb, len_sdwis), 71 | len(id_wsb) - len(set(id_wsb)), 72 | len(id_sdwis) - len(set(id_sdwis))]) 73 | 74 | print('done.') 75 | 76 | wsb_sdwis_matches = pd.DataFrame(nested_list, 77 | columns=['state', 78 | '% WSB IDs \nin SDWIS', 79 | '% SDWIS IDs \nin WSB', 80 | 'WSB % size \nof SDWIS', 81 | 'WSB dup IDs', 'SDWIS dup IDs']) 82 | 83 | #%% print table 84 | 85 | print(wsb_sdwis_matches.to_markdown(tablefmt='pretty')) 86 | 87 | # %% 88 | -------------------------------------------------------------------------------- /src/analysis/sandbox/eda/multipolygon_pwsids_in_labeled_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "multipolygon pwsid in labeled geometries" 3 | output: html_document 4 | --- 5 | 6 | _Rich Pauloo_ 7 | _Last updated `r Sys.time()`_ 8 | 9 | ```{r setup, include=FALSE} 10 | knitr::opts_chunk$set(echo = TRUE, 11 | out.width = "100%", 12 | message
= FALSE, 13 | error = FALSE, 14 | warning = FALSE) 15 | ``` 16 | 17 | There are duplicate pwsid in labeled data, and these polygons are typically adjacent. Thus they should be joined in transformers. 18 | 19 | For instance: 20 | 21 | ```{r} 22 | library(tidyverse) 23 | library(sf) 24 | library(fs) 25 | library(mapview) 26 | 27 | # mapview option for render 28 | mapviewOptions(fgb = FALSE) 29 | 30 | # data input location for modeling is the post-transformer staging path 31 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 32 | 33 | # read labeled data and ignore NA pwsid 34 | wsb_labeled <- st_read(path(staging_path, "wsb_labeled.geojson")) %>% 35 | filter(!is.na(pwsid)) 36 | 37 | # multipolygon count (mc) data frame 38 | mc <- st_drop_geometry(wsb_labeled) %>% 39 | count(pwsid, sort = TRUE) %>% 40 | filter(n > 1) 41 | 42 | mc 43 | 44 | # multipolygon ids (mid) and data (md), remove 3 NA pwsid 45 | mid <- mc$pwsid 46 | md <- filter(wsb_labeled, pwsid %in% mid) %>% 47 | filter(!is.na(pwsid)) 48 | 49 | # plot dupes for visual inspection 50 | mapview(md, zcol = "pwsid") 51 | ``` 52 | 53 | 54 | Visual inspection of duplicate pwsid indicates they are few in number (`r nrow(md)` rows and `r length(unique(md$pwsid))` unique values) and typically spatially adjacent, but in some cases they can be separated by considerable distance. This issue is addressed in the pre-modeling transformer, currently in `src/analysis/sandbox/model_explore/01_preprocess.R`. Eventually, this preprocess program will be moved to `src/predict`, and the code that accomplishes the cleaning (pending consideration) may be moved to a transformer helper function. The cleaning currently implemented (a sketch follows below): 55 | 56 | 1. unions duplicate pwsid geometries 57 | 2. groups by pwsid and sums area 58 | 3. recalculates radius from these areas (no convex hull, which would inflate radii for multipolygon systems with considerable space between polygons) 59 | 4. recalculates centroid x and y (as before, these are suspect and less meaningful for non-adjacent multipolygon systems) 60 | -------------------------------------------------------------------------------- /src/analysis/sandbox/eda/wholesalers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Wholesaler EDA" 3 | output: 4 | html_document: 5 | highlight: zenburn 6 | code_folding: hide 7 | --- 8 | 9 | ```{r setup, include=FALSE} 10 | knitr::opts_chunk$set(warning = FALSE, message = FALSE, out.width = "100%") 11 | ``` 12 | 13 | **Wholesalers**: to keep or not to keep? 14 | 15 | Labeled data seems to indicate that these should be kept, as there are clear urban areas covered by wholesalers without internal water system boundaries.
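Returning to the multipolygon cleaning steps listed above (the sketch promised there): a minimal `sf` illustration, assuming `wsb_labeled` from the previous document has been projected to a metric CRS. The object and column names are illustrative, not the actual code in `01_preprocess.R`:

```r
library(sf)
library(dplyr)

wsb_deduped <- wsb_labeled %>%
  group_by(pwsid) %>%
  # 1. union duplicate-pwsid geometries, so that 2. each pwsid is one row
  summarise(geometry = st_union(geometry)) %>%
  mutate(
    # 2. total area per pwsid, in square meters
    area_m2 = as.numeric(st_area(geometry)),
    # 3. equivalent-area radius (no convex hull, so radii are not inflated)
    radius = sqrt(area_m2 / pi),
    # 4. centroid x and y (projected units; back-transform for lat/lon)
    centroid_x = st_coordinates(st_centroid(geometry))[, 1],
    centroid_y = st_coordinates(st_centroid(geometry))[, 2]
  )
```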
16 | 17 | ```{r} 18 | library(tidyverse) 19 | library(sf) 20 | library(fs) 21 | library(mapview) 22 | 23 | mapviewOptions(fgb = FALSE) 24 | 25 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 26 | 27 | # cols to keep from sdwis data 28 | cols_keep <- c("pwsid", "is_wholesaler_ind", 29 | "primacy_type", "primary_source_code") 30 | 31 | # read sdwis data and only keep the specified columns 32 | sdwis <- path(staging_path, "sdwis_water_system.csv") %>% 33 | read_csv(col_select = all_of(cols_keep)) 34 | 35 | # clean labeled wsb 36 | wsb_labeled_clean <- st_read(path(staging_path, "wsb_labeled_clean.gpkg")) 37 | 38 | # plot 39 | wsb_labeled_clean %>% 40 | left_join(sdwis) %>% 41 | mapview(zcol = "is_wholesaler_ind") 42 | ``` 43 | -------------------------------------------------------------------------------- /src/analysis/sandbox/matching/stats.py: -------------------------------------------------------------------------------- 1 | # Let's use the labeled data to check some hypotheses. 2 | 3 | #%% 4 | 5 | import os 6 | import pandas as pd 7 | import geopandas as gpd 8 | import sqlalchemy as sa 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | DATA_PATH = os.environ["WSB_STAGING_PATH"] + "/../outputs" 14 | EPSG = os.environ["WSB_EPSG"] 15 | 16 | # Connect to local PostGIS instance 17 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"]) 18 | 19 | PROJ = os.environ["WSB_EPSG_AW"] 20 | 21 | #%% 22 | # Load up the data sources 23 | 24 | supermodel = gpd.GeoDataFrame.from_postgis( 25 | "SELECT * FROM pws_contributors WHERE source_system NOT IN ('ucmr');", 26 | conn, geom_col="geometry") 27 | 28 | candidates = supermodel[supermodel["source_system"].isin(["tiger", "mhp"])].set_index("contributor_id") 29 | labeled = supermodel[supermodel["source_system"] == "labeled"] 30 | 31 | matches = pd.read_sql("SELECT * FROM matches;", conn) 32 | 33 | 34 | candidates = candidates.to_crs(PROJ) 35 | labeled = labeled.to_crs(PROJ) 36 | 37 | 38 | # Q: Which match type leads to the best results? 39 | # Q: Are MHP matches good? 40 | # Q: Are MHP points better than ECHO points? 41 | # Q: Which centroid_quality values result in good vs bad spatial matches? Perhaps there are some we could exclude. 42 | 43 | #%% 44 | 45 | # Q: Which match type leads to the best results?
46 | 47 | # I need to get the labeled polygon in one series and the TIGER polygons + match types in another series 48 | # Then join them on PWSID and find the distance between polygons 49 | # Then score the match rules: If distance is 0 it gets a point, otherwise not 50 | # Assign a percentage correctness 51 | 52 | s1 = gpd.GeoSeries( 53 | labeled[["pwsid", "geometry"]] 54 | .loc[labeled["master_key"].isin(matches["master_key"])] 55 | .set_index("pwsid") 56 | ["geometry"]) 57 | 58 | # TIGER and MHP candidates (note that this index will not be unique) 59 | candidate_matches = gpd.GeoDataFrame(matches 60 | .join(candidates[["source_system", "geometry"]], on="candidate_contributor_id") 61 | .rename(columns={"master_key": "pwsid"}) 62 | .set_index("pwsid") 63 | [["geometry", "match_rule", "source_system"]]) 64 | 65 | # Filter to only the PWS's that appear in both series 66 | # 7,423 match 67 | 68 | s1 = s1.loc[s1.index.isin(candidate_matches.index)] 69 | candidate_matches = candidate_matches.loc[candidate_matches.index.isin(s1.index)] 70 | 71 | 72 | # This gives a couple of warnings, but they're OK 73 | # "Indexes are different" - this is because candidate_matches has duplicated indices (multiple matches to the same PWS) 74 | # "Geometry is in a geographic CRS" - Projected CRS's will give more accurate distance results, but it's fine for our purposes. 75 | distances = s1.distance(candidate_matches, align=True) 76 | 77 | # Not sure what causes NA. Filter only non-NA 78 | distances = distances[distances.notna()] 79 | distances.name = "distance" 80 | 81 | # re-join to the match table 82 | candidate_matches = candidate_matches.join(distances, on="pwsid", how="inner") 83 | 84 | # Assign a score 85 | PROXIMITY_BUFFER = 1000 86 | candidate_matches["score"] = candidate_matches["distance"] < PROXIMITY_BUFFER 87 | 88 | 89 | #%% 90 | # How did our match rules (and combos of rules) perform for TIGER? 91 | (candidate_matches 92 | .loc[candidate_matches["source_system"] == "tiger"] 93 | .groupby(["match_rule", "source_system"]) 94 | .agg( 95 | points = ("score", "sum"), 96 | total = ("score", "size") 97 | ) #type:ignore 98 | .eval("score = points / total") 99 | .sort_values("score", ascending=False)) 100 | 101 | # This suggests that our MHP matching is pretty bad. 102 | # However, this only includes MHP's that matched to labeled bounds. And labeled bounds are likely municipalities / other big water systems, not MHP's. 103 | # So perhaps we're filtering to only the bad matches?
104 | 105 | 106 | #%% 107 | candidate_matches 108 | 109 | #%% 110 | distances -------------------------------------------------------------------------------- /src/analysis/sandbox/model_explore/.gitignore: -------------------------------------------------------------------------------- 1 | /model_march_files -------------------------------------------------------------------------------- /src/analysis/sandbox/model_explore/02_random_forest.R: -------------------------------------------------------------------------------- 1 | # fit a random forest ----------------------------------------------------- 2 | 3 | library(tidyverse) 4 | library(tidymodels) 5 | library(sf) 6 | library(fs) 7 | library(vip) 8 | 9 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 10 | 11 | # read full dataset 12 | d <- read_csv(path(staging_path, "model_input_clean.csv")) 13 | 14 | # unlabeled data (du) and labeled data (dl) 15 | du <- d %>% filter(is.na(radius)) 16 | dl <- d %>% filter(!is.na(radius)) 17 | 18 | # split labeled data (dl) into train and test with stratified random sampling 19 | # in each of the radius quartiles to account for the lognormal distribution 20 | # of the response variable (radius) and avoid overfitting to small radius obs 21 | set.seed(55) 22 | dl_split <- initial_split(dl, prop = 0.8, strata = radius) 23 | train <- training(dl_split) 24 | test <- testing(dl_split) 25 | 26 | # model and workflow 27 | rf_mod <- 28 | rand_forest(trees = 1000) %>% 29 | set_engine("ranger", importance = "impurity") %>% 30 | set_mode("regression") 31 | 32 | rf_wflow <- 33 | workflow() %>% 34 | add_formula( 35 | radius ~ 36 | population_served_count + 37 | # importantly, the RF can have correlated predictors, so we add 38 | # service connections, and don't need to account for interactions 39 | service_connections_count + 40 | # use the cleaned owner type code from preprocess.R, which converts 41 | # 2 "N" owner type codes to "M" so that models can evaluate 42 | owner_type_code_clean + 43 | is_wholesaler_ind + 44 | satc 45 | ) %>% 46 | add_model(rf_mod) 47 | 48 | # fit the random forest model 49 | rf_fit <- fit(rf_wflow, train) 50 | 51 | # show variable importance 52 | rf_fit %>% 53 | extract_fit_parsnip() %>% 54 | vip(geom = "point") 55 | 56 | # predict on test set 57 | rf_test_res <- test %>% 58 | # select(radius) %>% 59 | bind_cols(predict(rf_fit, test)) 60 | 61 | # plot residuals 62 | rf_test_res %>% 63 | ggplot(aes(log10(radius), log10(.pred), color = owner_type_code)) + 64 | geom_point(alpha = 0.4) + 65 | geom_abline(lty = 2, color = "red") + 66 | labs(y = "Predicted radius (log10)", x = "Radius (log10)") + 67 | # scale and size the x- and y-axis uniformly 68 | coord_obs_pred() 69 | 70 | # metrics: RMSE, R-squared, MAE 71 | rf_metrics <- metric_set(rmse, rsq, mae) 72 | rf_metrics(rf_test_res, truth = log10(radius), estimate = log10(.pred)) 73 | -------------------------------------------------------------------------------- /src/analysis/sandbox/model_explore/03_xgboost.R: -------------------------------------------------------------------------------- 1 | # fit an xgboost model ----------------------------------------------------- 2 | 3 | library(tidyverse) 4 | library(tidymodels) 5 | library(sf) 6 | library(fs) 7 | library(vip) 8 | library(here) 9 | 10 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 11 | 12 | # read full dataset 13 | d <- read_csv(path(staging_path, "model_input_clean.csv")) 14 | 15 | # unlabeled data (du) and labeled data (dl) 16 | du <- d %>% filter(is.na(radius)) 17 | dl <- d %>% filter(!is.na(radius)) 18 | 19
| # split labeled data (dl) into train and test with stratified random sampling 20 | # in each of the radius quartiles to account for the lognormal distribution 21 | # of the response variable (radius) and avoid overfitting to small radius obs 22 | set.seed(55) 23 | dl_split <- initial_split(dl, prop = 0.8, strata = radius) 24 | train <- training(dl_split) 25 | test <- testing(dl_split) 26 | 27 | # model and workflow 28 | xgb_mod <- 29 | boost_tree( 30 | trees = 1000, 31 | tree_depth = tune(), 32 | min_n = tune(), 33 | # loss_reduction = tune(), 34 | # sample_size = tune(), 35 | # mtry = tune(), 36 | learn_rate = tune() 37 | ) %>% 38 | set_engine("xgboost") %>% 39 | set_mode("regression") 40 | 41 | # hyperparameter space 42 | xgb_grid <- grid_latin_hypercube( 43 | tree_depth(), 44 | min_n(), 45 | # loss_reduction(), 46 | # sample_size = sample_prop(), 47 | # finalize(mtry(), train), 48 | learn_rate(), 49 | size = 30 50 | ) 51 | 52 | xgb_wflow <- 53 | workflow() %>% 54 | add_formula( 55 | radius ~ 56 | population_served_count + 57 | # importantly, tree-based models can have correlated predictors, so we add 58 | # service connections, and don't need to account for interactions 59 | service_connections_count + 60 | # use the cleaned owner type code from preprocess.R, which converts 61 | # 2 "N" owner type codes to "M" so that models can evaluate 62 | owner_type_code_clean + 63 | is_wholesaler_ind + 64 | satc 65 | ) %>% 66 | add_model(xgb_mod) 67 | 68 | # CV 69 | set.seed(123) 70 | xgb_folds <- vfold_cv(train, strata = radius) 71 | 72 | # tune the model 73 | doParallel::registerDoParallel() 74 | 75 | set.seed(234) 76 | xgb_res <- tune_grid( 77 | xgb_wflow, 78 | resamples = xgb_folds, 79 | grid = xgb_grid, 80 | control = control_grid(save_pred = TRUE) 81 | ) 82 | 83 | # save for use in report 84 | write_rds(xgb_res, here("src/analysis/sandbox/model_explore/etc/xgb_res.rds")) 85 | 86 | # visualize model performance across tuning grid 87 | xgb_res %>% 88 | collect_metrics() %>% 89 | filter(.metric == "rsq") %>% 90 | select(mean, min_n:learn_rate) %>% 91 | pivot_longer(min_n:learn_rate, 92 | values_to = "value", 93 | names_to = "parameter" 94 | ) %>% 95 | ggplot(aes(value, mean, color = parameter)) + 96 | geom_point(alpha = 0.8, show.legend = FALSE) + 97 | facet_wrap(~parameter, scales = "free_x") + 98 | labs(x = NULL, y = "rsq") 99 | 100 | show_best(xgb_res, "rsq") 101 | 102 | # select best model 103 | final_xgb <- finalize_workflow( 104 | xgb_wflow, select_best(xgb_res, "rsq") 105 | ) 106 | 107 | final_xgb 108 | 109 | # save for later use in report 110 | write_rds(final_xgb, here("src/analysis/sandbox/model_explore/etc/final_xgb.rds")) 111 | 112 | # fit the final xgboost model on training data 113 | xgb_fit <- fit(final_xgb, train) 114 | 115 | # show variable importance 116 | xgb_fit %>% 117 | extract_fit_parsnip() %>% 118 | vip(geom = "point") 119 | 120 | # predict on test set 121 | xgb_test_res <- test %>% 122 | select(radius) %>% 123 | bind_cols(predict(xgb_fit, test)) 124 | 125 | # plot residuals 126 | xgb_test_res %>% 127 | ggplot(aes(log10(radius), log10(.pred))) + 128 | geom_point(alpha = 0.4) + 129 | geom_abline(lty = 2, color = "red") + 130 | labs(y = "Predicted radius (log10)", x = "Radius (log10)") + 131 | # scale and size the x- and y-axis uniformly 132 | coord_obs_pred() 133 | 134 | # metrics: RMSE, R-squared, MAE 135 | xgb_metrics <- metric_set(rmse, rsq, mae) 136 | xgb_metrics(xgb_test_res, truth = log10(radius), estimate = log10(.pred)) 137 |
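# --- illustrative usage sketch, not part of the original script --------------
# model_march.Rmd renders from the two .rds artifacts saved above. Reloading
# the finalized workflow and reusing it might look like this; refitting on all
# labeled data (dl) and predicting radii for the unlabeled systems (du) are
# assumed use cases, not steps taken in this repo.
reloaded_xgb <- read_rds(here("src/analysis/sandbox/model_explore/etc/final_xgb.rds"))
refit_xgb <- fit(reloaded_xgb, dl)
du_radius_pred <- predict(refit_xgb, new_data = du)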
-------------------------------------------------------------------------------- /src/analysis/sandbox/model_explore/README.md: -------------------------------------------------------------------------------- 1 | # "model_explore" Little Sandbox 2 | 3 | This little sandbox houses model exploration scripts used to prototype the final code in `src/model` and the March 2022 report summarizing construction of the TEMM data layer. 4 | 5 | ## Table of contents 6 | 7 | * `model_march.Rmd` summarizes construction of the TEMM data layer and uses flat files in `/etc` to render. 8 | * `02_random_forest.R` fits the random forest model. 9 | * `03_xgboost.R` fits the xgboost model. 10 | 11 | * `/archive` has two scripts: 12 | - `01_preprocess.R` -> migrated to and superseded by `src/model/01_preprocess.R` 13 | - `04_linear.R` -> migrated to and superseded by `src/model/02_linear.R` 14 | -------------------------------------------------------------------------------- /src/analysis/sandbox/model_explore/archive/01_preprocess.R: -------------------------------------------------------------------------------- 1 | # preprocess data for model ----------------------------------------------- 2 | 3 | library(tidyverse) 4 | library(sf) 5 | library(fs) 6 | 7 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 8 | 9 | # this is the critical service connection count below which we 10 | # assume that the value is nonsensical, and impute it based on population. 11 | # We also assume that population counts less than n_max are unreasonable, 12 | # and only work with populations > 15 (at least one per service connection) 13 | n_max <- 15 14 | cat("Preparing to impute service connection count", 15 | "for all values <", n_max, ".\n") 16 | 17 | # j stands for joined data, read and rm rownumber column, then drop 18 | # observations without a centroid or with nonsensical service connections 19 | j <- read_csv(path(staging_path, "matched_output.csv")) %>% 20 | filter(!is.na(centroid_lat) | !is.na(centroid_lon)) %>% 21 | # filter out systems with < n_max population count - 243 (0.5%), 22 | filter(population_served_count > n_max) 23 | cat("Read", nrow(j), "matched outputs with >", 24 | n_max, "population count.\n") 25 | 26 | 27 | # impute suspect service connection counts with a linear model ------------ 28 | 29 | # A 2022-03-08 meeting with IoW/BC/EPIC recommended filtering out 30 | # wholesalers and water systems with a zero population count. However, 31 | # many water systems have low service connection counts (e.g., between 0 and 10) 32 | # but very high populations (e.g., in the hundreds to thousands); wholesalers 33 | # in labeled data are primarily found in WA and TX, and wholesalers 34 | # typically occupy urban areas and do not contain smaller pwsids. Thus, we 35 | # retain all observations and impute suspect (between 0 and n_max) service 36 | # connections. 37 | 38 | # we learned in the Feb 2022 EDA (sandbox/eda/eda_february.Rmd) that 39 | # population served and service connection count had outliers that were 40 | # likely incorrect.
Here we highlight "bad" high leverage points 41 | # j %>% 42 | # mutate( 43 | # grp = ifelse( 44 | # population_served_count %in% 0:n_max, "bad", "good" 45 | # ) 46 | # ) %>% 47 | # ggplot(aes(service_connections_count, population_served_count)) + 48 | # geom_point(aes(color = grp), alpha = 0.5) + 49 | # geom_smooth(method = "lm") 50 | 51 | # linear model for imputing service connections from population served 52 | # Only train on connection & population counts >= n_max (community water systems) 53 | jm <- j %>% filter(service_connections_count >= n_max, 54 | population_served_count >= n_max) 55 | 56 | # simple linear model for imputing service connection count and b1 slope 57 | m <- lm(service_connections_count ~ population_served_count, data = jm) 58 | b1 <- coefficients(m)["population_served_count"] 59 | 60 | # predict, & change the y-intercept to 0 to avoid negative connections 61 | j <- j %>% 62 | mutate( 63 | service_connections_count = ifelse( 64 | service_connections_count < n_max, 65 | ceiling(population_served_count * b1), 66 | service_connections_count) 67 | ) 68 | cat("Imputed service connection count.\n") 69 | 70 | 71 | # read labeled data with recalculated area, centroid for multipolygon pwsids -- 72 | 73 | # read wsb_labeled_clean 74 | wsb_labeled_clean <- st_read(path(staging_path, "wsb_labeled_clean.gpkg")) 75 | 76 | # rm geometry and other unnecessary (for model) cols from clean wsb labels 77 | vars_keep <- c("pwsid", "radius") 78 | 79 | wsb_labeled_clean_df <- wsb_labeled_clean %>% 80 | select(all_of(vars_keep)) %>% 81 | st_drop_geometry() 82 | 83 | 84 | # join clean wsb labeled data to matched output and write ----------------- 85 | 86 | # add other data, including SDWIS 87 | 88 | # cols to keep from sdwis data 89 | cols_keep <- c("pwsid", "is_wholesaler_ind", 90 | "primacy_type", "primary_source_code") 91 | 92 | # read sdwis data and only keep the specified columns 93 | sdwis <- path(staging_path, "sdwis_water_system.csv") %>% 94 | read_csv(col_select = all_of(cols_keep)) 95 | 96 | # ensure non-duplicate pwsid in SDWIS pre-join 97 | cat("Detected", length(unique(sdwis$pwsid)), "unique pwsids", "and", 98 | nrow(sdwis), "rows in SDWIS.
99 | 
100 | # join to matched output, and lose 378/13435 (2.8% of labeled data) which
101 | # is not in matched_output.csv
102 | d <- j %>%
103 |   left_join(wsb_labeled_clean_df, by = "pwsid") %>%
104 |   left_join(sdwis, by = "pwsid")
105 | cat("Joined matched output, labeled data, and sdwis data.\n")
106 | 
107 | # sanity check: row count equivalence pre- and post-join (this is FALSE when,
108 | # for instance, duplicate pwsids are present)
109 | cat("Row count equivalence pre and post-join is", nrow(d) == nrow(j), "\n")
110 | 
111 | 
112 | # apply cleaning informed by EDA ------------------------------------------
113 | 
114 | d <- d %>%
115 |   mutate(
116 |     # when radius == 0, make it NA
117 |     radius = ifelse(radius == 0, NA, radius),
118 |     # split type codes in the "python list" into chr vectors
119 |     satc = strsplit(service_area_type_code, ", "),
120 |     # map over the list to remove brackets ([]) and quotes (')
121 |     satc = map(satc, ~str_remove_all(.x, "\\[|\\]|'")),
122 |     # sort the resulting chr vector
123 |     satc = map(satc, ~sort(.x)),
124 |     # collapse the sorted chr vector
125 |     satc = map_chr(satc, ~paste(.x, collapse = "")),
126 |     # convert the sorted chr vector to factor with reasonable level count
127 |     satc = fct_lump_prop(satc, 0.02),
128 |     satc = as.character(satc),
129 |     satc = ifelse(is.na(satc), "Other", satc),
130 |     # convert T/F is_wholesaler_ind to character for dummy var prep
131 |     is_wholesaler_ind = ifelse(is_wholesaler_ind == TRUE,
132 |                                "wholesaler", "not wholesaler"),
133 |     # recode Native American owner type (N, only 2 present) to public/private (M)
134 |     owner_type_code = ifelse(owner_type_code == "N", "M", owner_type_code)
135 |   )
136 | cat("Cleaned data according to EDA-generated insights.\n")
137 | 
138 | # write for modeling
139 | write_csv(d, path(staging_path, "matched_output_clean.csv"))
140 | cat("Wrote clean preprocessed data for modeling to staging path.\n")
--------------------------------------------------------------------------------
/src/analysis/sandbox/model_explore/archive/04_linear.R:
--------------------------------------------------------------------------------
1 | # linear model ------------------------------------------------------------
2 | 
3 | library(tidyverse)
4 | library(tidymodels)
5 | library(sf)
6 | library(fs)
7 | 
8 | staging_path <- Sys.getenv("WSB_STAGING_PATH")
9 | epsg <- as.numeric(Sys.getenv("WSB_EPSG"))
10 | 
11 | # read dataset and log transform the response - only for the linear model
12 | d <- read_csv(path(staging_path, "matched_output_clean.csv")) %>%
13 |   mutate(radius = log10(radius),
14 |          # product of the two correlated predictors
15 |          density = population_served_count * service_connections_count)
16 | 
17 | # unlabeled data (du) and labeled data (dl)
18 | du <- d %>% filter(is.na(radius))
19 | dl <- d %>% filter(!is.na(radius))
20 | 
21 | # split labeled data (dl) into train and test with stratified random sampling
22 | # in each of the radius quartiles to account for the lognormal distribution
23 | # of the response variable (radius) and avoid overfitting to small radius obs
24 | set.seed(55)
25 | dl_split <- initial_split(dl, prop = 0.8, strata = radius)
26 | train <- training(dl_split)
27 | test <- testing(dl_split)
28 | 
29 | # lm recipe
30 | lm_recipe <-
31 |   # specify the model - interaction terms come later
32 |   recipe(
33 |     radius ~
34 |       service_connections_count +
35 |       owner_type_code +
36 |       satc +
37 |       is_wholesaler_ind,
38 |     data = train
39 |   ) %>%
40 |   # convert predictors to log10
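  # (connection counts span orders of magnitude, so a log10 scale keeps the fit stable)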
41 |   step_log(service_connections_count, base = 10) %>%
42 |   # encode categorical variables
43 |   step_dummy(all_nominal_predictors()) %>%
44 |   # specify interaction effects
45 |   step_interact(~service_connections_count:starts_with("owner_type_code")) %>%
46 |   step_interact(~service_connections_count:starts_with("satc")) %>%
47 |   step_interact(~service_connections_count:starts_with("is_wholesaler_ind"))
48 | 
49 | # specify model and engine for the linear model
50 | lm_mod <- linear_reg() %>% set_engine("lm")
51 | 
52 | # lm workflow
53 | lm_wflow <-
54 |   workflow() %>%
55 |   add_model(lm_mod) %>%
56 |   add_recipe(lm_recipe)
57 | 
58 | # fit the linear model on the training set
59 | lm_fit <- fit(lm_wflow, train)
60 | 
61 | # predict on the test set and bind mean predictions and CIs
62 | lm_test_res <- test %>%
63 |   select(radius) %>%
64 |   bind_cols(predict(lm_fit, test)) %>%
65 |   bind_cols(predict(lm_fit, test, type = "conf_int"))
66 | 
67 | # plot predicted vs. observed radius
68 | lm_test_res %>%
69 |   ggplot(aes(radius, .pred)) +
70 |   geom_point(alpha = 0.4) +
71 |   geom_abline(lty = 2, color = "red") +
72 |   labs(y = "Predicted radius (log10)", x = "Radius (log10)") +
73 |   # scale and size the x- and y-axis uniformly
74 |   coord_obs_pred()
75 | 
76 | # compute RMSE, R-squared, and MAE on the test set
77 | lm_metrics <- metric_set(rmse, rsq, mae)
78 | lm_metrics(lm_test_res, truth = radius, estimate = .pred)
79 | 
80 | 
81 | # apply modeled radii to centroids for all data and write -----------------
82 | 
83 | # read matched output for centroid lat/lng
84 | matched_output_clean <- path(staging_path, "matched_output_clean.csv") %>%
85 |   read_csv(col_select = c("pwsid", "centroid_lat", "centroid_lon")) %>%
86 |   st_as_sf(coords = c("centroid_lon", "centroid_lat"), crs = epsg)
87 | 
88 | # predict on all data, apply the spatial buffer, and write
89 | t3m <- d %>%
90 |   select(pwsid, radius) %>%
91 |   bind_cols(predict(lm_fit, d)) %>%
92 |   bind_cols(predict(lm_fit, d, type = "conf_int", level = 0.95)) %>%
93 |   # exponentiate results back to the median (unbiased) and 5/95 CIs
94 |   mutate(across(where(is.numeric), ~10^(.x))) %>%
95 |   # add matched output lat/lng centroids and make spatial
96 |   left_join(matched_output_clean, by = "pwsid") %>%
97 |   st_as_sf() %>%
98 |   # convert to a projected metric CRS for an accurate, efficient buffer;
99 |   # the project CRS (4326) is geographic and thus inappropriate because units are degrees
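  # (EPSG 3310 is NAD83 / California Albers, a projection with units in meters)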
100 |   st_transform(3310)
101 | 
102 | # create buffers for the median, CI lower, and CI upper (5/95) predictions
103 | # (in meters) and then transform back into the project CRS
104 | t3m_med <- st_buffer(t3m, t3m$.pred) %>% st_transform(epsg)
105 | t3m_cil <- st_buffer(t3m, t3m$.pred_lower) %>% st_transform(epsg)
106 | t3m_ciu <- st_buffer(t3m, t3m$.pred_upper) %>% st_transform(epsg)
107 | 
108 | # paths to write modeled data
109 | path_t3m_med <- path(staging_path, "tier3_median.gpkg")
110 | path_t3m_cil <- path(staging_path, "tier3_ci_lower_05.gpkg")
111 | path_t3m_ciu <- path(staging_path, "tier3_ci_upper_95.gpkg")
112 | 
113 | # write and delete layer if it already exists
114 | st_write(t3m_med, path_t3m_med, delete_dsn = TRUE)
115 | st_write(t3m_cil, path_t3m_cil, delete_dsn = TRUE)
116 | st_write(t3m_ciu, path_t3m_ciu, delete_dsn = TRUE)
--------------------------------------------------------------------------------
/src/analysis/sandbox/model_explore/etc/final_xgb.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/src/analysis/sandbox/model_explore/etc/final_xgb.rds
--------------------------------------------------------------------------------
/src/analysis/sandbox/model_explore/etc/xgb_res.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SimpleLab-Inc/wsb/4b3f7cb51e236f0139e5f6563b15965036a0e25b/src/analysis/sandbox/model_explore/etc/xgb_res.rds
--------------------------------------------------------------------------------
/src/analysis/sandbox/sanity_checks/01_convex_hull.R:
--------------------------------------------------------------------------------
1 | # sanity check convex hull error
2 | 
3 | library(tidyverse)
4 | library(here)
5 | library(fs)
6 | library(sf)
--------------------------------------------------------------------------------
/src/combine_tiers.py:
--------------------------------------------------------------------------------
1 | #%%
2 | 
3 | import os
4 | import pandas as pd
5 | import geopandas as gpd
6 | import sqlalchemy as sa
7 | import match.helpers as helpers
8 | from dotenv import load_dotenv
9 | from shapely.geometry import Polygon
10 | 
11 | load_dotenv()
12 | 
13 | STAGING_PATH = os.environ["WSB_STAGING_PATH"]
14 | OUTPUT_PATH = os.environ["WSB_OUTPUT_PATH"]
15 | EPSG = os.environ["WSB_EPSG"]
16 | 
17 | # Connect to local PostGIS instance
18 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
19 | 
20 | #%%
21 | # load geometries for each tier -------------------------------------------
22 | 
23 | print("Loading geometries for Tiers 1-3...")
24 | 
25 | # Tier 1: LABELED (and CONTRIBUTED) boundaries
26 | t1 = gpd.GeoDataFrame.from_postgis("""
27 |     SELECT pwsid, centroid_lat, centroid_lon, centroid_quality, geometry, geometry_source_detail
28 |     FROM pws_contributors
29 |     WHERE
30 |         source_system IN ('labeled', 'contributed') AND
31 |         NOT st_isempty(geometry)
32 |     ORDER BY source_system, pwsid;""",
33 |     conn, geom_col="geometry")
34 | 
35 | # If there are duplicates, it's likely because we have a contributed AND a labeled boundary.
36 | # Take only the contributed.
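# (keep="first" retains the contributed row because the query's ORDER BY source_system sorts 'contributed' before 'labeled')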
37 | before_count = len(t1)
38 | t1 = t1.drop_duplicates(subset="pwsid", keep="first")
39 | 
40 | if len(t1) < before_count:
41 |     print(f"Prioritized {before_count - len(t1)} contributed records over labeled in T1.")
42 | 
43 | print("Retrieved Tier 1: Labeled boundaries.")
44 | 
45 | # Tier 2: MATCHED boundaries (only the best)
46 | t2 = gpd.GeoDataFrame.from_postgis("""
47 |     SELECT
48 |         m.master_key AS pwsid,
49 |         t.source_system_id AS matched_bound_geoid,
50 |         t.name AS matched_bound_name,
51 |         t.centroid_lat,
52 |         t.centroid_lon,
53 |         t.centroid_quality,
54 |         t.geometry,
55 |         t.geometry_source_detail
56 |     FROM matches_ranked m
57 |     JOIN pws_contributors t ON m.candidate_contributor_id = t.contributor_id
58 |     WHERE
59 |         m.best_match AND
60 |         t.source_system = 'tiger'""",
61 |     conn, geom_col="geometry")
62 | 
63 | print("Retrieved Tier 2: Matched boundaries.")
64 | 
65 | # Tier 3: MODELED boundaries - use median result geometry but bring in CIs
66 | t3 = (gpd
67 |     .read_file(os.path.join(STAGING_PATH, "tier3_median.gpkg"))
68 |     [[
69 |         "pwsid", ".pred_lower", ".pred", ".pred_upper",
70 |         "centroid_lat", "centroid_lon", "centroid_quality",
71 |         "geometry", "geometry_source_detail"
72 |     ]]
73 |     .rename(columns={
74 |         ".pred_lower": "pred_05",
75 |         ".pred": "pred_50",
76 |         ".pred_upper": "pred_95"
77 |     })) #type:ignore
78 | 
79 | print("Retrieved Tier 3: Modeled boundaries.")
80 | 
81 | #%%
82 | 
83 | # Assign tier labels
84 | t1["tier"] = 1
85 | t2["tier"] = 2
86 | t3["tier"] = 3
87 | 
88 | #%%
89 | # Pull in base attributes from SDWIS ----------------------------------
90 | 
91 | # read SDWIS records from the contributors table to serve as the base attributes
92 | print("Reading SDWIS for base attributes...")
93 | 
94 | base = pd.read_sql("""
95 |     SELECT *
96 |     FROM pws_contributors
97 |     WHERE source_system = 'sdwis';""", conn)
98 | 
99 | base = base.drop(columns=[
100 |     "tier", "centroid_lat", "centroid_lon", "centroid_quality",
101 |     "geometry", "geometry_source_detail"])
102 | 
103 | # Overwrite the contributor_id
104 | base["contributor_id"] = "master." + base["pwsid"]
105 | base["source_system"] = "master"
106 | 
107 | #%%
108 | # combine tiers -----------------------------------------------------------
109 | 
110 | # Combine geometries from Tiers 1-3
111 | # Where we have duplicates, prefer Tier 1 > 2 > 3
112 | combined = gpd.GeoDataFrame(pd
113 |     .concat([t1, t2, t3])
114 |     .sort_values(by="tier") #type:ignore
115 |     .drop_duplicates(subset="pwsid", keep="first")
116 |     [["pwsid", "tier", "centroid_lat", "centroid_lon", "centroid_quality",
117 |       "geometry", "geometry_source_detail", "pred_05", "pred_50", "pred_95"]])
118 | 
119 | # Join the matched boundary info back on;
120 | # we do this so that boundary info is available for ALL tiers
121 | combined = combined.merge(
122 |     t2[["pwsid", "matched_bound_geoid", "matched_bound_name"]], on="pwsid", how="left")
123 | 
124 | # Fix data types
125 | combined["matched_bound_geoid"] = combined["matched_bound_geoid"].astype(pd.Int64Dtype())
126 | 
127 | # Join to base
128 | temm = gpd.GeoDataFrame(
129 |     base.merge(combined, on="pwsid", how="left"),
130 |     crs=f"epsg:{EPSG}")
131 | 
132 | # Allow NA when we have no geometry
133 | temm["tier"] = temm["tier"].astype(pd.Int64Dtype())
134 | 
135 | # Replace empty geometries
136 | temm.loc[temm["geometry"].is_empty | temm["geometry"].isna(), "geometry"] = Polygon([]) #type:ignore
137 | 
138 | # Verify - We should have the same number of rows in base and in temm
139 | assert len(temm) == len(base)
140 | 
141 | print("Combined a spatial layer using best available tiered data.\n")
142 | 
143 | #%%
144 | 
145 | # Save to the database
146 | helpers.load_to_postgis("master",
147 |     temm.drop(columns=["matched_bound_geoid", "matched_bound_name", "pred_05", "pred_50", "pred_95"]))
148 | 
149 | #%%
150 | # Export
151 | 
152 | # The file outputs have a subset of columns
153 | columns = [
154 |     "pwsid", "name", "primacy_agency_code", "state", "city_served",
155 |     "county", "population_served_count", "service_connections_count",
156 |     "service_area_type_code", "owner_type_code",
157 |     "is_wholesaler_ind", "primacy_type",
158 |     "primary_source_code", "tier",
159 |     "centroid_lat", "centroid_lon", "centroid_quality",
160 |     "geometry", "geometry_source_detail", "pred_05", "pred_50", "pred_95"]
161 | 
162 | # Backwards compatibility
163 | output = (temm[columns]
164 |     .rename(columns={
165 |         "name": "pws_name",
166 |         "state": "state_code",
167 |         "county": "county_served"
168 |     }))
169 | 
170 | #%%
171 | # paths to write
172 | path_geopkg = os.path.join(OUTPUT_PATH, "temm.gpkg")
173 | output.to_file(path_geopkg, driver="GPKG")
174 | 
175 | print("Wrote data to geopackage.\n")
--------------------------------------------------------------------------------
/src/downloaders/download_contributed_pws.R:
--------------------------------------------------------------------------------
1 | # Download contributed public water system boundaries ---------------------
2 | library(fs)
3 | 
4 | # path to save raw data
5 | data_path <- Sys.getenv("WSB_DATA_PATH")
6 | 
7 | # Allow for longer timeout for download file
8 | options(timeout = 10000)
9 | 
10 | # Data Source: current Github repo managed by CGS/IoW, where final, accepted
11 | # individual contributor public water systems are added to SL's base map layer
12 | 
13 | contributed_pws_url <- paste0("https://github.com/cgs-earth/ref_pws/raw/main/02_output/",
14 |                               "contributed_pws.gpkg")
15 | 
16 | # create dir to store the downloaded file
17 | dir_create(path(data_path, "contributed_pws"))
18 | 
19 | # local path to download files
20 | file_contributed_pws <- path(data_path, "contributed_pws/contributed_pws.gpkg")
21 | 
22 | download.file(contributed_pws_url, file_contributed_pws, mode="wb")
23 | 
24 | cat("Downloaded contributed PWS data.\n")
25 | 
--------------------------------------------------------------------------------
/src/downloaders/download_echo.R:
--------------------------------------------------------------------------------
1 | # download ECHO admin data -----------------------------------------------
2 | 
3 | library(glue)
4 | library(fs)
5 | 
6 | # path to save raw data
7 | data_path <- Sys.getenv("WSB_DATA_PATH")
8 | 
9 | # Allow for longer timeout for download file
10 | options(timeout = 10000)
11 | 
12 | # Data Source: EPA's ECHO Exporter ZIP (the SDWA zip is also available in case it is useful)
13 | echo_url <- paste0("https://echo.epa.gov/files/echodownloads/",
14 |                    "echo_exporter.zip")
15 | # create dir to store file, download, and un-zip
16 | dir_create(path(data_path, "echo"))
17 | 
18 | # local path to download files
19 | file_echo <- path(data_path, "echo/echo_exporter.zip")
20 | 
21 | download.file(echo_url, file_echo)
22 | 
23 | unzip(file_echo, exdir = path(data_path, "echo"))
24 | 
25 | cat("Downloaded and unzipped ECHO data.\n")
26 | 
--------------------------------------------------------------------------------
/src/downloaders/download_frs.R:
--------------------------------------------------------------------------------
1 | # download FRS centroids --------------------------------------------------
2 | 
3 | library(fs)
4 | 
5 | # path to save raw data
6 | data_path <- Sys.getenv("WSB_DATA_PATH")
7 | 
8 | # Allow for longer timeout for download file
9 | options(timeout = 10000)
10 | 
11 | # Data Source: EPA's Facility Registry Service (FRS)
12 | frs_url <- paste0("https://edg.epa.gov/data/public/OEI/FRS/",
13 |                   "FRS_Interests_Download.zip")
14 | 
15 | # create dir to store file, download, and un-zip
16 | dir_create(path(data_path, "frs"))
17 | download.file(frs_url, path(data_path, "frs/frs.zip"))
18 | unzip(path(data_path, "frs/frs.zip"), exdir = path(data_path, "frs"))
19 | cat("Downloaded and unzipped FRS data.\n")
20 | 
--------------------------------------------------------------------------------
/src/downloaders/download_helpers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Tue Feb 1 11:04:42 2022
5 | 
6 | @author: nb, jjg
7 | """
8 | 
9 | import os
10 | import pandas as pd
11 | import glob
12 | 
13 | 
14 | 
15 | def create_dir(path, dir):
16 | 
17 |     """
18 |     A function that creates a directory for downloading data.
19 | 
20 |     Inputs:
21 |     -path: file path relative to root project path
22 |     -dir: name of directory
23 | 
24 |     Output: a folder for data downloads.
25 |     """
26 | 
27 |     dir_path = os.path.join(path, dir)
28 | 
29 |     if os.path.exists(dir_path):
30 |         print(f'Directory {dir} exists.')
31 |     else:
32 |         os.mkdir(dir_path)
33 |         print(f'Created directory {dir}.')
34 | 
35 |     return dir_path
36 | 
37 | 
38 | 
39 | def get_row_count(directory, file):
40 | 
41 |     """
42 |     A function that counts the rows in a given csv.
43 | 
44 |     Inputs:
45 |     -directory: directory file path relative to root path
46 |     -file: name of file
47 | 
48 |     Output: row count of input file
49 | 
50 |     """
51 |     path = os.path.join(directory, file)
52 |     with open(path) as f:
53 |         row_count = sum(1 for line in f)
54 |     return row_count
55 | 
56 | 
57 | def write_aria_download_txt(download_txt_name, path, base_filename, table_filter=None,
58 |                             step_size=10000, count_cur=0, count_end=200):
59 |     """
60 |     Write aria download text file for base_filename in path. Default file step size
61 |     (size of each partial download) is 10000 rows. Downloads up to count_end (default 200) files.
62 |     Tables with more than 2MM rows require adjustments to step_size and count_end.
63 | 
64 |     Inputs:
65 |     -download_txt_name: name of download text file supplied in download_with_aria()
66 |     -path: folder directory for a given download (e.g. data/sdwis)
67 |     -base_filename: filename within download folder (e.g. WATER_SYSTEM)
68 |     -table_filter: optional filter to SDWIS tables, e.g. filter by state code
69 |     -step_size: step size of each partial download; default is 10000 rows
70 |     -count_end: last step in download (default is 200 files)
71 | 
72 |     Note: EPA Download is inclusive. If URL includes 'ROWS/0:2',
73 |     it downloads three rows (indices 0, 1, 2).
74 |     """
75 | 
76 |     if table_filter:
77 |         base_url = f'https://data.epa.gov/efservice/{base_filename}/{table_filter}/ROWS'
78 |     else:
79 |         base_url = f'https://data.epa.gov/efservice/{base_filename}/ROWS'
80 | 
81 |     urls_txt_path = os.path.join(path, download_txt_name)
82 | 
83 |     with open(urls_txt_path, 'w') as f:
84 |         while count_cur < count_end:
85 | 
86 |             row_start = count_cur * step_size
87 |             row_end = row_start + step_size - 1
88 |             rows = f'{str(row_start)}:{str(row_end)}'
89 | 
90 |             url = f"{base_url}/{rows}/csv"
91 |             f.write(url + '\n')
92 | 
93 |             filename = f'{base_filename}_{count_cur}.csv'
94 |             f.write(f' out={filename}' + '\n')
95 | 
96 |             count_cur += 1
97 | 
98 |     return urls_txt_path
99 | 
100 | 
101 | def download_with_aria(data_path, filename, table_filter=None, count_end=200):
102 | 
103 |     """
104 |     Create an aria2 input text file for the given filename and base url,
105 |     then download the chunked url files it lists.
106 | 
107 |     Inputs:
108 |     -data_path: directory file path relative to root path where downloads happen
109 |     -filename: name of file/table to download
110 |     -table_filter: optional filter to SDWIS tables, e.g. filter by state code
111 | 
112 |     Outputs: a folder of csv files in increments of step_size rows.
113 | 
114 |     Note: setting --auto-file-renaming=false prevents data from being appended to existing
115 |     downloads; a new download requires manual deletion of the previous downloads.
116 |     """
117 | 
118 |     # Create subdirectory
119 |     dir_path = create_dir(data_path, filename)
120 | 
121 |     # Make text file of chunked aria urls and filenames
122 |     aria_download_filename = f'aria_download_{filename}.txt'
123 | 
124 |     urls_txt_path = write_aria_download_txt(aria_download_filename, dir_path, filename, table_filter, count_end=count_end)
125 | 
126 |     # Download with aria
127 |     os.system(f'aria2c --input-file={urls_txt_path} --dir={dir_path} --auto-file-renaming=false')
128 | 
129 | 
130 | def stitch_files(filename, data_path):
131 | 
132 |     """
133 |     Create a single csv file from a folder of downloaded csvs.
134 | 
135 |     Inputs:
136 |     -data_path: directory file path relative to root path where downloads happen
137 |     -filename: name of file
138 | 
139 |     Outputs: a single csv file, written one directory above the download folder, for use in transformers.
140 |     """
141 | 
142 |     extension = 'csv'
143 |     csv_file_path = os.path.join(data_path, filename)
144 |     os.chdir(csv_file_path)
145 | 
146 |     all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
147 | 
148 |     # combine all files in the list
149 |     combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
150 | 
151 |     # export to csv
152 |     combined_csv.to_csv(f"../{filename}.csv", index=False, encoding='utf-8-sig')
--------------------------------------------------------------------------------
/src/downloaders/download_mhp.R:
--------------------------------------------------------------------------------
1 | # Download mobile home parks point data -----------------------------------
2 | 
3 | library(fs)
4 | 
5 | # path to save raw data
6 | data_path <- Sys.getenv("WSB_DATA_PATH")
7 | 
8 | # Allow for longer timeout for download file
9 | options(timeout = 10000)
10 | 
11 | # Data Source: MHP ArcGIS geojson point data
12 | mhp_url <- paste0("https://opendata.arcgis.com/datasets/",
13 |                   "4cdbccc5c538452aa91ceee277c460f9_0.geojson")
14 | 
15 | # create dir to store file and download
16 | dir_create(path(data_path, "mhp"))
17 | download.file(mhp_url, path(data_path, "/mhp/mhp.geojson"))
18 | cat("Downloaded mobile home park point data.\n")
--------------------------------------------------------------------------------
/src/downloaders/download_sdwis.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Tue Feb 1 11:06:58 2022
5 | 
6 | @author: nb, jjg
7 | """
8 | 
9 | 
10 | # Libraries
11 | import os, sys
12 | sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
13 | from downloaders.download_helpers import create_dir, get_row_count
14 | from downloaders.download_helpers import download_with_aria, stitch_files
15 | from dotenv import load_dotenv
16 | 
17 | load_dotenv()
18 | 
19 | #%%
20 | # Create file directory
21 | data_path = os.environ["WSB_DATA_PATH"]
22 | directory = 'sdwis'
23 | 
24 | # Set output directory
25 | create_dir(data_path, directory)
26 | 
27 | sdwis_data_path = os.path.join(data_path, "sdwis")
28 | 
29 | #%% Download smaller files to sdwis directory
30 | 
31 | # SERVICE_AREA, GEOGRAPHIC_AREA
32 | 
33 | filenames = ['SERVICE_AREA', 'GEOGRAPHIC_AREA']
34 | 
35 | 
36 | for filename in filenames:
37 |     if os.path.exists(os.path.join(sdwis_data_path, filename + ".csv")):
38 |         print(f"{filename}.csv exists, skipping download.")
39 | 
40 | 
41 |     else:
42 |         print(f'Downloading {filename}')
43 | 
44 |         base_url = f'https://data.epa.gov/efservice/{filename}/ROWS/0:100000000/csv'
45 | 
46 |         os.system(f'aria2c --out={filename}.csv --dir={sdwis_data_path} {base_url} --auto-file-renaming=false')
47 | 
48 |     # Print row count
49 |     row_count = get_row_count(sdwis_data_path, f'{filename}.csv')
50 |     print(f'Row count of {filename}.csv: {row_count}')
51 | 
52 | 
53 | #%% Download larger files
54 | # While the smaller files above work without timing out, SDWIS has a 10K query limit
55 | # on tables; the following script could be used for the above tables as well, but is
56 | # currently limited to the largest of the 4 files to avoid time outs
57 | 
58 | # Current working assumption is that there are no more than 2MM rows for
59 | # water_system and water_system_facility; this could theoretically change over time
60 | # and the analyst would need to adjust the default value
61 | 
62 | 
63 | #%% Download WATER_SYSTEM
64 | filename = 'WATER_SYSTEM'
65 | 
66 | 
67 | if os.path.exists(os.path.join(sdwis_data_path, filename + "/")):
68 |     print(f"{filename} folder exists, skipping download.")
69 | 
70 | 
71 | else:
72 |     download_with_aria(sdwis_data_path, filename, count_end=200)
73 | 
74 | # Stitch and count rows
75 | if not os.path.exists(os.path.join(sdwis_data_path, f'{filename}.csv')):
76 |     stitch_files(filename, sdwis_data_path)
77 |     directory = os.path.join(sdwis_data_path, f'{filename}/')
78 |     row_count = get_row_count(sdwis_data_path, f'{filename}.csv')
79 |     print(f'Row count of {filename}.csv: {row_count}')
80 | 
81 | else:
82 |     print(f'{filename}.csv already exists and will not re-stitch.')
--------------------------------------------------------------------------------
/src/downloaders/download_tigris_ne.R:
--------------------------------------------------------------------------------
1 | # download TIGRIS places and Natural Earth (ne) coastline -----------------
2 | 
3 | library(fs)
4 | library(sf)
5 | library(tigris)
6 | library(rmapshaper)
7 | library(readr)
8 | library(tidyverse)
9 | library(tidycensus)
10 | 
11 | 
12 | # path to save raw data
13 | data_path <- Sys.getenv("WSB_DATA_PATH")
14 | census_api_key <- Sys.getenv("CENSUS_API_KEY")
15 | 
16 | # Tell tidycensus our key (don't forget to activate it first!)
17 | census_api_key(census_api_key)
18 | 
19 | # download large files without timeout error
20 | options(timeout = 100000, tigris_use_cache = TRUE)
21 | states_list <- c(state.abb, "DC")
22 | 
23 | # create dirs
24 | dir_create(path(data_path, "tigris"))
25 | dir_create(path(data_path, "ne/ocean"))
26 | 
27 | # download all TIGRIS places, simplify polygons, save
28 | places <- tigris::places(states_list)
29 | 
30 | places <- places %>%
31 |   rmapshaper::ms_simplify(
32 |     keep_shapes = TRUE,
33 |     # https://github.com/ateucher/rmapshaper/issues/83
34 |     # and https://github.com/ateucher/rmapshaper#using-the-system-mapshaper or
35 |     # https://docs.npmjs.com/resolving-eacces-permissions-errors-when-installing-packages-globally
36 |     sys = TRUE)
37 | 
38 | write_rds(places, path(data_path, "tigris/tigris_places.rds"))
39 | cat("Downloaded and wrote TIGRIS places.\n")
40 | 
41 | # download and write population data for TIGRIS places
42 | pop <- get_decennial(
43 |   geography = "place", # census-designated places
44 |   state = states_list,
45 |   year = 2020,
46 |   variables = "P1_001N", # total population, 2020 census
47 |   geometry = FALSE,
48 |   cb = FALSE
49 | ) %>%
50 |   select(
51 |     geoid = GEOID,
52 |     name = NAME,
53 |     population = value
54 |   ) %>%
55 |   write_csv(., path(data_path, "tigris/tigris_pop.csv"))
56 | 
57 | # download and unzip Natural Earth oceans polygons, used to
58 | # remove water bodies from TIGRIS places in the transformer
59 | url_ne <- paste0("https://www.naturalearthdata.com/",
60 |                  "http//www.naturalearthdata.com/",
61 |                  "download/10m/physical/ne_10m_ocean.zip")
62 | download.file(url_ne,
63 |               destfile = path(data_path, "ne/ocean/ocean.zip"))
64 | 
65 | unzip(zipfile = path(data_path, "ne/ocean/ocean.zip"),
66 |       exdir = path(data_path, "ne/ocean/ne-ocean-10m"))
67 | cat("Downloaded and wrote Natural Earth Oceans.\n")
68 | 
--------------------------------------------------------------------------------
/src/downloaders/download_ucmr.R:
--------------------------------------------------------------------------------
1 | # download UCMR occurrence data -----------------------------------------------
2 | 
3 | library(glue)
4 | library(fs)
5 | 
6 | # Allow for longer timeout for download file
7 | options(timeout = 10000)
8 | 
9 | # path to save raw data
10 | data_path <- Sys.getenv("WSB_DATA_PATH")
11 | 
12 | # Data Source: UCMR Program, which records zip codes served
13 | ucmr3_url <- paste0("https://www.epa.gov/sites/default/files/2017-02/",
14 |                     "ucmr-3-occurrence-data.zip")
15 | 
16 | ucmr4_url <- paste0("https://www.epa.gov/sites/default/files/2020-04/",
17 |                     "ucmr_4_occurrence_data.zip?VersionId=",
18 |                     "m3C_dKBtBPyz35yDVL_1uZLjGjHiZtwf")
19 | 
20 | # create dir to store downloaded files
21 | dir_create(path(data_path, "ucmr"))
22 | 
23 | # local paths to download files
24 | file_ucmr3 <- path(data_path, "ucmr/ucmr3.zip")
25 | file_ucmr4 <- path(data_path, "ucmr/ucmr4.zip")
26 | 
27 | # download and unzip
28 | download.file(ucmr3_url, file_ucmr3)
29 | download.file(ucmr4_url, file_ucmr4, mode="wb")
30 | 
31 | unzip(file_ucmr3, exdir = path(data_path, "ucmr"))
32 | unzip(file_ucmr4, exdir = path(data_path, "ucmr"))
33 | 
34 | cat("Downloaded and unzipped UCMR3 and UCMR4 data.\n")
--------------------------------------------------------------------------------
/src/downloaders/states/download_ar_wsb.R:
--------------------------------------------------------------------------------
1 | # Download AR water system data -------------------------------------------
2 | 
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 | 
5 | # Data Source: Arkansas ArcGIS shapefile water system boundary
6 | url <- paste0("https://geostor-vectors.s3.amazonaws.com/Utilities/SHP/",
7 |               "PUBLIC_WATER_SYSTEMS.zip")
8 | 
9 | download_wsb(url, "ar")
10 | 
--------------------------------------------------------------------------------
/src/downloaders/states/download_az_wsb.R:
--------------------------------------------------------------------------------
1 | # Download AZ water system data -------------------------------------------
2 | 
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 | 
5 | # Data Source: Arizona ArcGIS geojson water system boundary
6 | url <- paste0("https://opendata.arcgis.com/datasets/",
7 |               "9992e59e46bb466584f9694f897f350a_0.geojson")
8 | 
9 | download_wsb(url, "az")
10 | 
--------------------------------------------------------------------------------
/src/downloaders/states/download_ct_wsb.R:
--------------------------------------------------------------------------------
1 | # Download CT water system data -------------------------------------------
2 | 
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 | 
5 | # Data Source: Connecticut ArcGIS shapefile water system boundary
6 | url <- paste0("https://portal.ct.gov/-/media/Departments-and-Agencies/",
7 |               "DPH/dph/drinking_water/GIS/",
8 |               "Buffered_Community_PWS_Service_Areas.zip")
9 | 
10 | download_wsb(url, "ct")
11 | 
--------------------------------------------------------------------------------
/src/downloaders/states/download_il_wsb.R:
--------------------------------------------------------------------------------
1 | # Download IL water system data -------------------------------------------
2 | 
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 | 
5 | # Data Source: IL Geospatial Data Clearinghouse
6 | url <- paste0("https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISWS/Hydrology/zips/",
paste0("https://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISWS/Hydrology/zips/", 7 | "Illinois_Municipal_Water_Use_2012.zip") 8 | 9 | download_wsb(url, "il") 10 | 11 | -------------------------------------------------------------------------------- /src/downloaders/states/download_ks_wsb.R: -------------------------------------------------------------------------------- 1 | # Download KS water system data ------------------------------------------- 2 | 3 | source(here::here("src/downloaders/states/download_state_helpers.R")) 4 | 5 | # Data Source: Kansas ArcGIS shapefile water system boundary 6 | url <- paste0("https://data.kansasgis.org/catalog/", 7 | "administrative_boundaries/shp/pws/PWS_bnd_2021_0430.zip") 8 | 9 | download_wsb(url, "ks") 10 | -------------------------------------------------------------------------------- /src/downloaders/states/download_mo_wsb.R: -------------------------------------------------------------------------------- 1 | # Download MO water service boundaries ------------------------------------ 2 | 3 | source(here::here("src/downloaders/states/download_state_helpers.R")) 4 | 5 | # Data Source: Missouri ArcGIS geojson water system boundary 6 | url <- paste0("https://opendata.arcgis.com/datasets/", 7 | "c3bee75a86e04856b28d7f1ce2a24e6f_0.geojson") 8 | 9 | download_wsb(url, "mo") 10 | -------------------------------------------------------------------------------- /src/downloaders/states/download_nc_wsb.R: -------------------------------------------------------------------------------- 1 | # Download NC water service boundaries ------------------------------------ 2 | 3 | source(here::here("src/downloaders/states/download_state_helpers.R")) 4 | 5 | # Data Source: North Carolina ArcGIS geojson water system boundary 6 | url <- paste0("https://opendata.arcgis.com/datasets/", 7 | "58548b90bdfd4148829103ac7f4db9ce_4.geojson") 8 | 9 | download_wsb(url, "nc") 10 | -------------------------------------------------------------------------------- /src/downloaders/states/download_nj_wsb.R: -------------------------------------------------------------------------------- 1 | # Download NJ water system data ------------------------------------------- 2 | 3 | source(here::here("src/downloaders/states/download_state_helpers.R")) 4 | 5 | # Data Source: New Jersey ArcGIS geojson water system boundary 6 | url <- paste0("https://opendata.arcgis.com/datasets/", 7 | "00e7ff046ddb4302abe7b49b2ddee07e_13.geojson") 8 | 9 | download_wsb(url, "nj") 10 | -------------------------------------------------------------------------------- /src/downloaders/states/download_nm_wsb.R: -------------------------------------------------------------------------------- 1 | # Download NM water system data ------------------------------------------- 2 | 3 | source(here::here("src/downloaders/states/download_state_helpers.R")) 4 | 5 | # Data Source: New Mexico ArcGIS geojson water system boundary 6 | url <- paste0("https://catalog.newmexicowaterdata.org/dataset/", 7 | "5d069bbb-1bfe-4c83-bbf7-3582a42fce6e/resource/", 8 | "ccb9f5ce-aed4-4896-a2f1-aba39953e7bb/download/pws_nm.geojson") 9 | 10 | download_wsb(url, "nm") 11 | -------------------------------------------------------------------------------- /src/downloaders/states/download_ok_wsb.R: -------------------------------------------------------------------------------- 1 | # Download OK water system data ------------------------------------------- 2 | 3 | 
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 | 
5 | # Data Source: Oklahoma ArcGIS geojson water system boundary
6 | url <- paste0("https://opendata.arcgis.com/datasets/",
7 |               "d015bc14d3b84b8985ff3a4fd55c0844_0.geojson")
8 | 
9 | download_wsb(url, "ok")
10 | 
--------------------------------------------------------------------------------
/src/downloaders/states/download_pa_wsb.R:
--------------------------------------------------------------------------------
1 | # Download PA water system data -------------------------------------------
2 | 
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 | 
5 | # Data Source: Pennsylvania ArcGIS geojson water system boundary
6 | url <- "https://www.pasda.psu.edu/json/PublicWaterSupply2022_01.geojson"
7 | 
8 | download_wsb(url, "pa")
9 | 
--------------------------------------------------------------------------------
/src/downloaders/states/download_ri_wsb.R:
--------------------------------------------------------------------------------
1 | # Download RI water system data -------------------------------------------
2 | library(geojsonsf)
3 | library(fs)
4 | library(sf)
5 | library(urltools)
6 | 
7 | # Path to save raw data
8 | data_path <- Sys.getenv("WSB_DATA_PATH")
9 | 
10 | # Data Source: Rhode Island (RIDEM) ArcGIS REST geojson water system boundary
11 | url <- paste0("https://risegis.ri.gov/hosting/rest/services/RIDEM/RI_DrinkingWater_ServiceAreas/",
12 |               "FeatureServer/1/query?where=1%3D1&outFields=*&f=geojson")
13 | 
14 | # Use geojson reader for ESRI REST endpoints to get data
15 | ri <- geojson_sf(url)
16 | 
17 | # Create output file directory
18 | dir_path <- paste0(data_path, "/boundary/ri")
19 | dir_create(dir_path)
20 | 
21 | # Write RI geojson
22 | path_out <- paste0(dir_path, "/ri.geojson")
23 | if(file_exists(path_out)) file_delete(path_out)
24 | 
25 | st_write(ri, path_out)
26 | 
27 | 
--------------------------------------------------------------------------------
/src/downloaders/states/download_state_helpers.R:
--------------------------------------------------------------------------------
1 | # State WSB downloader helper functions -----------------------------------
2 | 
3 | # suppress warning: package ‘fs’ was built under R version ...
4 | suppressWarnings(suppressMessages(library(fs)))
5 | 
6 | # path to save raw data
7 | data_path <- Sys.getenv("WSB_DATA_PATH")
8 | 
9 | # function to download url
10 | # file_ext argument is optional; if not provided, it is derived from the url ending
11 | download_wsb <- function(url, state, file_ext) {
12 | 
13 |   cat("Starting download for", toupper(state), "boundary data...\n\n")
14 | 
15 |   # create output file directory
16 |   dir_path <- path(data_path, paste0("boundary/", state))
17 |   dir_create(dir_path)
18 | 
19 |   # get file extension to create output file name and path
20 |   if (missing(file_ext)) {
21 |     file_ext <- sub(".*\\.", "", url)
22 |   }
23 |   file_name <- paste0(state, ".", file_ext)
24 |   file_path <- path(dir_path, file_name)
25 | 
26 |   # download url
27 |   download.file(url, file_path)
28 | 
29 |   # unzip if the downloaded file has a zip extension
30 |   if (file_ext == "zip") {
31 |     unzip_wsb(file_path, dir_path, state)
32 |   } else {
33 |     cat("Downloaded", toupper(state), "boundary data.\n\n\n")
34 |   }
35 | }
36 | 
37 | # function to unzip file
38 | unzip_wsb <- function(file_path, dir_path, state) {
39 |   # unzip file
40 |   unzip(zipfile = file_path, exdir = dir_path)
41 |   cat("Downloaded and unzipped", toupper(state), "boundary data.\n\n\n")
42 | }
--------------------------------------------------------------------------------
/src/downloaders/states/download_ut_wsb.R:
--------------------------------------------------------------------------------
1 | # Download UT water system data -------------------------------------------
2 | 
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 | 
5 | # Data Source: Utah ArcGIS geojson water system boundary
6 | url <- paste0("https://services.arcgis.com/ZzrwjTRez6FJiOq4/arcgis/rest/",
7 |               "services/CulinaryWaterServiceAreas/FeatureServer/0/",
8 |               "query?outFields=*&where=1%3D1&f=geojson")
9 | 
10 | download_wsb(url, "ut", "geojson")
11 | 
--------------------------------------------------------------------------------
/src/downloaders/states/download_wa_wsb.R:
--------------------------------------------------------------------------------
1 | # Download WA water system data -------------------------------------------
2 | 
3 | source(here::here("src/downloaders/states/download_state_helpers.R"))
4 | 
5 | # Data Source: Washington ArcGIS geojson water system boundary
6 | url <- paste0("https://opendata.arcgis.com/datasets/",
7 |               "b09475f47a5a46ca90fe6a168fb22e6d_0.geojson")
8 | 
9 | download_wsb(url, "wa")
10 | 
--------------------------------------------------------------------------------
/src/functions/f_clean_whitespace_nas.R:
--------------------------------------------------------------------------------
1 | # trim whitespace and replace common NA values with actual NAs
2 | f_clean_whitespace_nas <- function(df){
3 | 
4 |   # if df is spatial, detach geometry before cleaning cols
5 |   if(sum(class(df) == "sf") == 1) {
6 |     geom = df$geometry
7 |     df = st_drop_geometry(df)
8 |   }
9 | 
10 |   # apply whitespace and NA cleaning
11 |   df = dplyr::mutate_all(df, stringr::str_trim, "both") |>
12 |     # all whitespace becomes "", so the next pattern handles all cases
13 |     dplyr::mutate_all(dplyr::na_if, "") |>
14 |     dplyr::mutate_all(dplyr::na_if, "NULL") |>
15 |     dplyr::mutate_all(dplyr::na_if, "NA") |>
16 |     dplyr::mutate_all(dplyr::na_if, "N/A")
17 | 
18 |   # reattach geometry if the object is spatial
19 |   if(exists("geom")) {
20 |     df = st_as_sf(bind_cols(df, geometry = geom))
21 |   }
22 | 
23 |   return(df)
24 | }
25 | 
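# illustrative usage with hypothetical data (not part of the original file):
#   d <- dplyr::tibble(pws_name = c("  ACME WATER ", "NULL"), state = c("CA", "N/A"))
#   f_clean_whitespace_nas(d)
#   #> pws_name = c("ACME WATER", NA), state = c("CA", NA)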
--------------------------------------------------------------------------------
/src/functions/f_drop_imposters.R:
--------------------------------------------------------------------------------
1 | # drops imposters, which are geometries that report being in one state
2 | # but that are actually located in another. This function filters to
3 | # valid rows where input geom falls within state geoms (non-imposters),
4 | # and sinks a log file of invalid geoms (imposters) to review.
5 | # See GH Issue #45: https://github.com/SimpleLab-Inc/wsb/issues/45
6 | 
7 | f_drop_imposters <- function(d, path_log){
8 | 
9 |   # error if the supplied input is not sf
10 |   if(!"sf" %in% class(d)){
11 |     stop("Input object is not of type `sf`.", call. = FALSE)
12 |   }
13 | 
14 |   # error if the supplied object does not have a `state` column
15 |   if(!"state" %in% colnames(d)){
16 |     stop("Column `state` missing from input object.", call. = FALSE)
17 |   }
18 | 
19 |   # if the log path doesn't exist, create it
20 |   if(!dir_exists(here::here("log"))) dir_create(here::here("log"))
21 | 
22 |   # reported state name
23 |   d = mutate(d, state_reported = state)
24 | 
25 |   # create state name to abbreviation key with built-in R objects
26 |   key = tibble(name = state.name, state_intersection = state.abb)
27 | 
28 |   # pull usa state geometries, project to input data CRS
29 |   usa = USAboundaries::us_states(resolution = "high") %>%
30 |     st_transform(st_crs(d)$epsg) %>%
31 |     select(state_intersection = state_abbr, geometry) %>%
32 |     suppressMessages()
33 | 
34 |   # spatial join input data to usa state polygons.
35 |   # filter to valid and invalid geometries for returned objects
36 |   cat("Joining input object geometries to USA state polygons...")
37 |   d_joined = st_join(d, usa)
38 |   cat("done.\n\n")
39 | 
40 |   # valid geometries: reported state == intersected state
41 |   d_valid = d_joined %>%
42 |     filter(
43 |       state_reported == state_intersection |
44 |         # also return when an input geometry doesn't intersect the USA geom
45 |         is.na(state_intersection)
46 |     ) %>%
47 |     select(-state_reported, -state_intersection)
48 | 
49 |   # imposters: reported state != intersected state
50 |   d_imposter = d_joined %>%
51 |     filter(
52 |       state_reported != state_intersection |
53 |         # also return when a state isn't reported
54 |         is.na(state_reported)
55 |     ) %>%
56 |     select(state_reported, state_intersection, everything()) %>%
57 |     st_drop_geometry()
58 | 
59 |   # print stats on valid/invalid geometries
60 |   nrow_d   = nrow(d_joined) %>% formatC(big.mark = ",")
61 |   nrow_dv  = nrow(d_valid) %>% formatC(big.mark = ",")
62 |   nrow_imp = nrow(d_imposter) %>% formatC(big.mark = ",")
63 |   p_valid  = round(((nrow(d_valid)/nrow(d_joined))*100), 2)
64 | 
65 |   cat(nrow_dv, "/", nrow_d, "rows are valid", "(",
66 |       p_valid, "% of input data).\n\n")
67 | 
68 |   # sink invalid pwsids (even with dupes, e.g. FRS) to a log file
69 |   write_csv(d_imposter, path_log)
70 |   cat("Wrote", nrow_imp, "imposters for review to", path_log, "\n")
71 | 
72 |   # return valid geometries as an object
73 |   cat("Returning valid geometries.\n\n")
74 |   return(d_valid)
75 | }
--------------------------------------------------------------------------------
/src/match/0-init.py:
--------------------------------------------------------------------------------
1 | """
2 | This script simply sets up the database.
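It drops and recreates the pws_contributors table by executing init_model.sql.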
3 | """
4 | #%%
5 | 
6 | import os
7 | from dotenv import load_dotenv
8 | import sqlalchemy as sa
9 | 
10 | load_dotenv()
11 | 
12 | # Connect to local PostGIS instance
13 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
14 | 
15 | #%%
16 | 
17 | # Read in the SQL and execute against the database
18 | this_folder = os.path.dirname(__file__)
19 | 
20 | with open(this_folder + "/init_model.sql") as file:
21 |     sql = file.read()
22 | 
23 | conn.execute(sql)
--------------------------------------------------------------------------------
/src/match/2-cleansing.py:
--------------------------------------------------------------------------------
1 | #%%
2 | import os
3 | import pandas as pd
4 | import geopandas as gpd
5 | from dotenv import load_dotenv
6 | import sqlalchemy as sa
7 | 
8 | load_dotenv()
9 | 
10 | pd.options.display.max_columns = None
11 | 
12 | EPSG = os.environ["WSB_EPSG"]
13 | PROJ = os.environ["WSB_EPSG_AW"]
14 | 
15 | # Connect to local PostGIS instance
16 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
17 | 
18 | 
19 | def _run_cleanse_rule(conn, rule_name: str, sql: str):
20 |     result = conn.execute(sql)
21 |     print(f"Ran cleanse rule '{rule_name}': {result.rowcount} rows affected")
22 | 
23 | #%%
24 | # First apply a bunch of SQL cleanses
25 | 
26 | PO_BOX_REGEX = r'^P[\. ]?O\M\.? *BOX +\d+$'
27 | 
28 | # Upper-case columns
29 | for col in [
30 |     "name", "address_line_1", "address_line_2", "city", "state",
31 |     "county", "city_served", "centroid_quality"
32 | ]:
33 |     _run_cleanse_rule(conn,
34 |         f"Upper-case {col}",
35 |         f"""
36 |         UPDATE pws_contributors
37 |         SET {col} = UPPER({col})
38 |         WHERE
39 |             {col} ~ '[a-z]';
40 |         """)
41 | 
42 | _run_cleanse_rule(conn,
43 |     "NULL out nonexistent zip code '99999'",
44 |     """
45 |     UPDATE pws_contributors
46 |     SET zip = NULL
47 |     WHERE
48 |         zip = '99999';
49 |     """)
50 | 
51 | _run_cleanse_rule(conn,
52 |     "Remove PO BOX from address_line_1",
53 |     f"""
54 |     UPDATE pws_contributors
55 |     SET
56 |         address_quality = 'PO BOX',
57 |         address_line_1 = NULL
58 |     WHERE
59 |         address_line_1 ~ '{PO_BOX_REGEX}';
60 |     """)
61 | 
62 | _run_cleanse_rule(conn,
63 |     "Remove PO BOX from address_line_2",
64 |     f"""
65 |     UPDATE pws_contributors
66 |     SET
67 |         address_quality = 'PO BOX',
68 |         address_line_2 = NULL
69 |     WHERE
70 |         address_line_2 ~ '{PO_BOX_REGEX}';
71 |     """)
72 | 
73 | _run_cleanse_rule(conn,
74 |     "If there's an address in line 2 but not line 1, move it",
75 |     """
76 |     UPDATE pws_contributors
77 |     SET
78 |         address_line_1 = address_line_2,
79 |         address_line_2 = NULL
80 |     WHERE
81 |         (address_line_1 IS NULL OR address_line_1 = '') AND
82 |         address_line_2 IS NOT NULL;
83 |     """)
84 | 
85 | _run_cleanse_rule(conn,
86 |     "Standardize geometry quality",
87 |     """
88 |     UPDATE pws_contributors
89 |     SET centroid_quality = 'ZIP CODE CENTROID'
90 |     WHERE
91 |         centroid_quality = 'ZIP CODE-CENTROID';
92 |     """)
93 | 
94 | #%%
95 | #####################
96 | # Handle Impostors
97 | #####################
98 | 
99 | print("Checking for impostors...")
100 | 
101 | # Pull data from the DB
102 | df = gpd.GeoDataFrame.from_postgis("""
103 |     SELECT
104 |         contributor_id,
105 |         source_system,
106 |         state,
107 |         primacy_agency_code,
108 |         geometry
109 |     FROM pws_contributors
110 |     WHERE
111 |         source_system IN ('echo', 'frs') AND
112 |         geometry IS NOT NULL AND
113 |         NOT st_isempty(geometry)
114 |     """, conn, geom_col="geometry"
115 | ).set_index("contributor_id")
116 | 
117 | # Convert to projected
118 | df = df.to_crs(PROJ)
119 | 
120 | # How many entries where primacy_agency_code differs from primacy_agency? 738
121 | # How many entries where primacy_agency_code is numeric? 379
122 | # Entries where state is numeric? 0
123 | # Entries where state is null? 0
124 | 
125 | # In cases where primacy_agency_code is numeric, sub in the state
126 | mask = df["primacy_agency_code"].str.contains(r"\d\d", regex=True)
127 | df.loc[mask, "primacy_agency_code"] = df.loc[mask]["state"]
128 | 
129 | #%%
130 | 
131 | # Read in state boundaries and convert to projected CRS
132 | states = (gpd
133 |     .read_file("../layers/us_states.geojson")
134 |     [["stusps", "geometry"]]
135 |     .rename(columns={"stusps": "state"})
136 |     .set_index("state")
137 |     .to_crs(PROJ))
138 | 
139 | #%%
140 | 
141 | # Series 1 is pwsid + geometry
142 | s1 = df["geometry"]
143 | 
144 | # Series 2 is generic state bounds joined to each pwsid on primacy_agency_code
145 | s2 = (df
146 |     .drop(columns="geometry")
147 |     .join(states, on="primacy_agency_code")
148 |     ["geometry"])
149 | 
150 | # Calculate the distance between the supplied boundary and the expected state
151 | distances = s1.distance(s2, align=True)
152 | 
153 | # Any that are >50 kilometers away are impostors
154 | impostors = (df
155 |     .loc[distances[(distances > 50_000)].index]
156 |     .to_crs("epsg:" + EPSG)
157 |     .reset_index())
158 | 
159 | print(f"Found {len(impostors)} impostors.")
160 | 
161 | #%%
162 | # Save to the database
163 | impostors.to_postgis("impostors", conn, if_exists="replace")
164 | 
165 | #%%
166 | # Remove the address, lat/lon, and geometry when it's an "impostor"
167 | conn.execute("""
168 |     UPDATE pws_contributors
169 |     SET
170 |         address_line_1 = NULL,
171 |         address_line_2 = NULL,
172 |         city = NULL,
173 |         state = NULL,
174 |         zip = NULL,
175 |         geometry = 'GEOMETRYCOLLECTION EMPTY',
176 |         centroid_lat = NULL,
177 |         centroid_lon = NULL
178 |     WHERE
179 |         contributor_id IN (SELECT contributor_id FROM impostors);
180 | """)
181 | 
182 | print("Null'd out impostor addresses and lat/lon.")
--------------------------------------------------------------------------------
/src/match/4-rank_boundary_matches.py:
--------------------------------------------------------------------------------
1 | #%%
2 | 
3 | import os
4 | import numpy as np
5 | import pandas as pd
6 | import sqlalchemy as sa
7 | from dotenv import load_dotenv
8 | 
9 | from match.match_scorer import MatchScorer
10 | 
11 | load_dotenv()
12 | 
13 | STAGING_PATH = os.environ["WSB_STAGING_PATH"]
14 | EPSG = os.environ["WSB_EPSG"]
15 | PROJ = os.environ["WSB_EPSG_AW"]
16 | 
17 | # Connect to local PostGIS instance
18 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
19 | 
20 | #%%
21 | matches = pd.read_sql("""
22 |     SELECT
23 |         m.master_key,
24 |         m.candidate_contributor_id,
25 |         m.match_rule,
26 |         s.name AS sdwis_name,
27 |         s.population_served_count AS sdwis_pop,
28 |         c.name AS tiger_name,
29 |         c.population_served_count AS tiger_pop
30 |     FROM matches m
31 |     JOIN pws_contributors c ON m.candidate_contributor_id = c.contributor_id AND c.source_system = 'tiger'
32 |     JOIN pws_contributors s ON s.master_key = m.master_key AND s.source_system = 'sdwis';
33 |     """, conn)
34 | 
35 | print("Read matches from database.")
36 | 
37 | 
38 | #%% ##########################
39 | # Generate some TIGER match stats
40 | ##############################
41 | 
42 | # How often do we match to multiple tigers?
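# (group sizes per master_key; a count > 1 means one PWS matched several TIGER places)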
43 | pws_to_tiger_match_counts = (matches
44 |     .groupby("master_key")
45 |     .size())
46 | 
47 | pws_to_tiger_match_counts.name = "pws_to_tiger_match_count"
48 | 
49 | # Let's also do it the other direction
50 | tiger_to_pws_match_counts = (matches
51 |     .groupby("candidate_contributor_id")
52 |     .size())
53 | 
54 | tiger_to_pws_match_counts.name = "tiger_to_pws_match_count"
55 | 
56 | # 1850 situations with > 1 match
57 | print(f"{(pws_to_tiger_match_counts > 1).sum()} PWS's matched to multiple TIGERs")
58 | 
59 | # 3631 TIGERs matched to multiple PWS's
60 | print(f"{(tiger_to_pws_match_counts > 1).sum()} TIGER's matched to multiple PWS's")
61 | 
62 | #%% #########################
63 | # Figure out our strongest match rules
64 | #############################
65 | scorer = MatchScorer()
66 | scored_matches = scorer.score_tiger_matches(matches)
67 | 
68 | #%%
69 | """
70 | Use the "scored" data to determine which rules (and combos of rules)
71 | are most effective.
72 | """
73 | 
74 | # Assign a "rank" to each match rule and combo of match rules
75 | match_rule_ranks = (matches
76 |     .join(scored_matches, on=["master_key", "candidate_contributor_id"])
77 |     .groupby(["match_rule"])
78 |     .agg(
79 |         points = ("score", "sum"),
80 |         total = ("score", "size")
81 |     )) #type:ignore
82 | 
83 | match_rule_ranks["score"] = match_rule_ranks["points"] / match_rule_ranks["total"]
84 | match_rule_ranks = match_rule_ranks.sort_values("score", ascending=False)
85 | match_rule_ranks["match_rule_rank"] = np.arange(len(match_rule_ranks))
86 | 
87 | print("Identified best match rules based on labeled data.")
88 | 
89 | #%% ###########################
90 | # Rank all PWS<->TIGER matches
91 | ###############################
92 | 
93 | # Assign the match rule ranks back to the matches
94 | matches_ranked = matches.join(
95 |     match_rule_ranks[["match_rule_rank"]], on="match_rule", how="left")
96 | 
97 | # Flag any that have name matches
98 | matches_ranked["name_match"] = matches.apply(lambda x: x["tiger_name"] in x["sdwis_name"], axis=1)
99 | 
100 | # Flag the best population within each TIGER match set
101 | # (Note this should be done AFTER removing the best PWS->TIGER, if we're doing that)
102 | matches_ranked["pop_diff"] = abs(matches["tiger_pop"] - matches["sdwis_pop"])
103 | 
104 | # To get PWS<->TIGER to be 1:1, we'll rank on different metrics
105 | # and then select the top one. We need to do this twice:
106 | # once to make PWS->Tiger N:1 and then to make Tiger->PWS 1:1
107 | 
108 | #%%
109 | # Through experimentation, this seemed to be the best ranking:
110 | # name_match, match_rule_rank, pop_diff,
111 | # selecting within the candidate_contributor groups first and
112 | # the master_key groups second.
113 | 
114 | # Assign numeric ranks to every match
115 | matches_ranked = (matches_ranked
116 |     .sort_values(
117 |         ["name_match", "match_rule_rank", "pop_diff"],
118 |         ascending=[False, True, True])
119 |     # Re-number and bring that index into the df
120 |     # This gives us a simple column to rank on
121 |     .reset_index(drop=True)
122 |     .reset_index(drop=False)
123 |     .rename(columns={"index": "overall_rank"}))
124 | 
125 | # This is technically unnecessary, since it's equivalent to sorting on overall_rank,
126 | # but maybe it makes things a little clearer:
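# (a dense rank within each master_key, so 1 marks that PWS's best-ranked TIGER candidate)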
127 | matches_ranked["master_group_ranking"] = \
128 |     (matches_ranked
129 |     .groupby("master_key")
130 |     ["overall_rank"]
131 |     .rank("dense")
132 |     .astype("int"))
133 | 
134 | #%%
135 | # Identify the 1-1 matches using the overall_rank
136 | best_matches = (matches_ranked
137 |     .sort_values(["overall_rank"])
138 |     .drop_duplicates(subset="candidate_contributor_id", keep="first")
139 |     .drop_duplicates(subset="master_key", keep="first")).index
140 | 
141 | matches_ranked["best_match"] = matches_ranked.index.isin(best_matches)
142 | 
143 | #%%
144 | 
145 | print("Scoring 1:1 matches...")
146 | 
147 | # Score it. How'd we do?
148 | scored_best_matches = scorer.score_tiger_matches(
149 |     matches_ranked
150 |     .loc[matches_ranked["best_match"]]
151 |     [["master_key", "candidate_contributor_id"]])
152 | 
153 | # ~ 96%
154 | score = scored_best_matches["score"].sum() * 100 / len(scored_best_matches)
155 | 
156 | print(f"Boundary match score: {score:.2f}")
157 | 
158 | #%%
159 | matches_ranked.to_sql("matches_ranked", conn, if_exists="replace", index=False)
--------------------------------------------------------------------------------
/src/match/5-select_modeled_centroids.py:
--------------------------------------------------------------------------------
1 | """
2 | This script takes centroids from ECHO, FRS, UCMR, and MHP
3 | and tries to select the best one to feed into the model
4 | for each PWSID.
5 | """
6 | 
7 | #%%
8 | 
9 | import os
10 | import numpy as np
11 | import pandas as pd
12 | import geopandas as gpd
13 | import sqlalchemy as sa
14 | from shapely.geometry import Polygon
15 | from dotenv import load_dotenv
16 | 
17 | import match.helpers as helpers
18 | 
19 | load_dotenv()
20 | 
21 | STAGING_PATH = os.environ["WSB_STAGING_PATH"]
22 | EPSG = os.environ["WSB_EPSG"]
23 | PROJ = os.environ["WSB_EPSG_AW"]
24 | 
25 | # Connect to local PostGIS instance
26 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"])
27 | 
28 | 
29 | #%%
30 | # Load up the data sources
31 | 
32 | print("Pulling in data from database...", end="")
33 | 
34 | sdwis = gpd.GeoDataFrame.from_postgis("""
35 |     SELECT *
36 |     FROM pws_contributors
37 |     WHERE source_system = 'sdwis';""",
38 |     conn, geom_col="geometry")
39 | 
40 | stack = pd.read_sql("""
41 | 
42 |     -- ECHO, FRS, and UCMR are all already labeled with a PWS
43 |     SELECT
44 |         c.contributor_id, c.source_system, c.master_key,
45 |         c.centroid_lat, c.centroid_lon, c.centroid_quality,
46 |         1 as master_group_ranking
47 |     FROM pws_contributors c
48 |     WHERE source_system IN ('echo', 'frs', 'ucmr')
49 | 
50 |     UNION ALL
51 | 
52 |     -- Since we don't know PWSIDs for MHP and TIGER, we need
53 |     -- to join to matches to sub in their matched master keys
54 | 
55 |     -- Join MHP to matches
56 |     SELECT
57 |         c.contributor_id, c.source_system, m.master_key,
58 |         c.centroid_lat, c.centroid_lon, c.centroid_quality,
59 |         1 as master_group_ranking
60 |     FROM pws_contributors c
61 |     JOIN matches m ON m.candidate_contributor_id = c.contributor_id
62 |     WHERE source_system = 'mhp'
63 | 
64 |     UNION ALL
65 | 
66 |     -- Join Tiger to matches
67 |     SELECT
68 |         c.contributor_id, c.source_system, m.master_key,
69 |         c.centroid_lat, c.centroid_lon, c.centroid_quality,
70 |         -- This helps us decide the best tiger match
71 |         m.master_group_ranking
72 |     FROM pws_contributors c
73 |     JOIN matches_ranked m ON m.candidate_contributor_id = c.contributor_id
74 |     WHERE source_system = 'tiger'
75 | 
76 |     ORDER BY master_key;""",
77 |     conn)
78 | 
79 | print("done.")
80 | 
81 | # Add sourcing notes to the geometries
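# (e.g., an echo centroid with quality 'COUNTY CENTROID' becomes 'ECHO: COUNTY CENTROID')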
stack["centroid_quality"] = stack["source_system"].str.upper() + ": " + stack["centroid_quality"] 83 | 84 | 85 | #%% ########################### 86 | # Find the best centroid from the candidate contributors 87 | ############################### 88 | 89 | # Ranking: 90 | # Best MHP > 91 | # Echo (if not state or county centroid) > 92 | # UCMR > 93 | # Boundary > 94 | # Echo (if state or county centroid) 95 | 96 | # We want the best centroid from all contributors. 97 | # Assign a ranking: 98 | # MHP = 1 99 | # Echo = 2 if not state/county centroid 100 | # FRS = 3 101 | # UCMR = 4 102 | # Boundary = 5 103 | # Echo = 6 if state/county centroid 104 | 105 | stack["system_rank"] = stack["source_system"].map({ 106 | "mhp": 1, 107 | "echo": 2, 108 | "frs": 3, 109 | "ucmr": 4, 110 | "tiger": 5 111 | }) 112 | 113 | # Change Echo to 6 if state/county centroid 114 | mask = ( 115 | (stack["source_system"] == "echo") & 116 | (stack["centroid_quality"].isin(["ECHO: STATE CENTROID", "ECHO: COUNTY CENTROID"]))) 117 | 118 | stack.loc[mask, "system_rank"] = 6 119 | 120 | #%% 121 | # In case there are multiple matches from the same system, 122 | # we need tiebreakers. 123 | # Go by: 124 | # 1) System Ranking 125 | # 2) match_rank 126 | # 3) contributor_id (tiebreaker - to ensure consistency) 127 | 128 | # Note that only MHP and Tiger could potentially have multiple matches 129 | 130 | # Keep only the first entry in each subset 131 | best_centroid = (stack 132 | .sort_values([ 133 | "master_key", 134 | "system_rank", 135 | "master_group_ranking", 136 | "contributor_id"]) 137 | .drop_duplicates(subset="master_key", keep="first") 138 | .set_index("master_key")) 139 | 140 | 141 | #%% ########################## 142 | # Generate the final table 143 | ############################## 144 | 145 | # Start with SDWIS as the base, but drop/override a few columns 146 | output = (sdwis 147 | .drop(columns=["centroid_lat", "centroid_lon", "centroid_quality"]) 148 | .assign( 149 | contributor_id = "modeled." 
+ sdwis["pwsid"], 150 | source_system = "modeled", 151 | source_system_id = sdwis["pwsid"], 152 | master_key = sdwis["pwsid"], 153 | tier = 3, 154 | geometry_source_detail = "Modeled" 155 | )) 156 | 157 | 158 | # Supplement with best centroid 159 | output = (output 160 | .merge(best_centroid[[ 161 | "centroid_lat", 162 | "centroid_lon", 163 | "centroid_quality", 164 | ]], on="master_key", how="left")) 165 | 166 | # Verify: We should still have exactly the number of pwsid's as we started with 167 | if not (len(output) == len(sdwis)): 168 | raise Exception("Output was filtered or denormalized") 169 | 170 | print("Joined several data sources into final output.") 171 | 172 | #%% 173 | output = gpd.GeoDataFrame(output) 174 | output["geometry"] = Polygon([]) 175 | output = output.set_crs(epsg=EPSG, allow_override=True) 176 | 177 | #%% ######################## 178 | # Save back to the DB 179 | ############################ 180 | 181 | helpers.load_to_postgis("modeled", output) -------------------------------------------------------------------------------- /src/match/helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | import sqlalchemy as sa 5 | import pandas as pd 6 | from dotenv import load_dotenv 7 | 8 | load_dotenv() 9 | 10 | DATA_PATH = os.environ["WSB_STAGING_PATH"] 11 | 12 | 13 | def load_to_postgis(source_system: str, df: pd.DataFrame): 14 | 15 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"]) 16 | TARGET_TABLE = "pws_contributors" 17 | 18 | print(f"Removing existing {source_system} data from database...", end="") 19 | conn.execute(f"DELETE FROM {TARGET_TABLE} WHERE source_system = '{source_system}';") 20 | print("done") 21 | 22 | print(f"Loading {source_system} to database...", end="") 23 | df.to_postgis(TARGET_TABLE, conn, if_exists="append") 24 | print("done.") 25 | 26 | 27 | def get_pwsids_of_interest(): 28 | 29 | sdwis = pd.read_csv( 30 | DATA_PATH + "/sdwis_water_system.csv", 31 | usecols=["pwsid", "pws_activity_code", "pws_type_code"], 32 | dtype="string") 33 | 34 | # Filter to only active community water systems 35 | # Starts as 400k, drops to ~50k after this filter 36 | # Keep only "A" for active 37 | return sdwis.loc[ 38 | (sdwis["pws_activity_code"].isin(["A"])) & 39 | (sdwis["pws_type_code"] == "CWS") 40 | ]["pwsid"] 41 | -------------------------------------------------------------------------------- /src/match/init_model.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS pws_contributors; 2 | 3 | CREATE TABLE pws_contributors ( 4 | contributor_id TEXT NOT NULL PRIMARY KEY, 5 | source_system TEXT NOT NULL, 6 | source_system_id TEXT NOT NULL, 7 | master_key TEXT NOT NULL, 8 | tier INT, 9 | pwsid TEXT, 10 | name TEXT, 11 | address_line_1 TEXT, 12 | address_line_2 TEXT, 13 | city TEXT, 14 | state CHAR(2), 15 | zip CHAR(5), 16 | county TEXT, 17 | address_quality TEXT, 18 | city_served TEXT, 19 | primacy_agency_code TEXT, 20 | primacy_type TEXT, 21 | population_served_count INT, 22 | service_connections_count INT, 23 | owner_type_code CHAR(1), 24 | service_area_type_code TEXT, 25 | is_wholesaler_ind BOOLEAN, 26 | primary_source_code TEXT, 27 | centroid_lat DECIMAL(10, 8), 28 | centroid_lon DECIMAL(11, 8), 29 | centroid_quality TEXT, 30 | geometry_source_detail TEXT, 31 | geometry GEOMETRY(GEOMETRY, 4326) 32 | ); 33 | 34 | CREATE INDEX ix__pws_contributors__source_system ON pws_contributors (source_system); 35 | CREATE 
INDEX ix__pws_contributors__source_system_id ON pws_contributors (source_system_id); 36 | CREATE INDEX ix__pws_contributors__master_key ON pws_contributors (master_key); -------------------------------------------------------------------------------- /src/match/map_contributed.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | import os 4 | import geopandas as gpd 5 | import match.helpers as helpers 6 | from dotenv import load_dotenv 7 | 8 | load_dotenv() 9 | 10 | DATA_PATH = os.environ["WSB_STAGING_PATH"] 11 | 12 | #%% 13 | 14 | contrib = gpd.read_file(os.path.join(DATA_PATH, "contributed_pws.gpkg")) 15 | 16 | #%% 17 | 18 | # Remove GeometryCollections -- they cause problems later. 19 | # (Polygons and MultiPolygons are OK) 20 | 21 | before = len(contrib) 22 | contrib = contrib[~(contrib.geom_type == "GeometryCollection")] 23 | 24 | if len(contrib) < before: 25 | print(f"Removed {before - len(contrib)} GeometryCollection type geometries.") 26 | 27 | #%% 28 | # Check assumptions 29 | assert contrib["pwsid"].is_unique 30 | 31 | #%% 32 | 33 | df = gpd.GeoDataFrame().assign( 34 | source_system_id = contrib["pwsid"], 35 | source_system = "contributed", 36 | contributor_id = "contributed." + contrib["pwsid"], 37 | master_key = contrib["pwsid"], 38 | pwsid = contrib["pwsid"], 39 | state = contrib["state"], 40 | name = contrib["pws_name"], 41 | geometry = contrib["geometry"], 42 | centroid_lat = contrib["centroid_lat"], 43 | centroid_lon = contrib["centroid_long"], 44 | centroid_quality = "CALCULATED FROM GEOMETRY", 45 | geometry_source_detail = contrib["geometry_source_detail"] 46 | ) 47 | 48 | #%% 49 | 50 | helpers.load_to_postgis("contributed", df) -------------------------------------------------------------------------------- /src/match/map_echo.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | import os 4 | import pandas as pd 5 | import geopandas as gpd 6 | import match.helpers as helpers 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | DATA_PATH = os.environ["WSB_STAGING_PATH"] 12 | EPSG = os.environ["WSB_EPSG"] 13 | 14 | #%% 15 | 16 | usecols=[ 17 | "pwsid", "fac_lat", "fac_long", "fac_name", 18 | "fac_street", "fac_city", "fac_state", "fac_zip", "fac_county", 19 | "fac_collection_method", "fac_reference_point", "fac_accuracy_meters", 20 | "fac_indian_cntry_flg", "fac_percent_minority", "fac_pop_den", "ejscreen_flag_us"] 21 | 22 | echo_df = pd.read_csv( 23 | os.path.join(DATA_PATH, "echo.csv"), 24 | usecols=usecols, dtype="str") 25 | 26 | #%% 27 | 28 | pwsids = helpers.get_pwsids_of_interest() 29 | 30 | # Filter to only those in our SDWIS list and with lat/long 31 | # 47,951 SDWIS match to ECHO, 1494 don't match 32 | echo_df = echo_df.loc[ 33 | echo_df["pwsid"].isin(pwsids) & 34 | echo_df["fac_lat"].notna()].copy() 35 | 36 | # If fac_state is NA, copy from pwsid 37 | mask = echo_df["fac_state"].isna() 38 | echo_df.loc[mask, "fac_state"] = echo_df.loc[mask, "pwsid"].str[0:2] 39 | 40 | # Convert to geopandas 41 | echo: gpd.GeoDataFrame = gpd.GeoDataFrame( 42 | echo_df, 43 | geometry=gpd.points_from_xy(echo_df["fac_long"], echo_df["fac_lat"]), 44 | crs="EPSG:4326") 45 | 46 | # Cleanse out "UNK" 47 | echo = echo.replace({"UNK": pd.NA}) 48 | 49 | echo.head() 50 | 51 | #%% 52 | 53 | df = gpd.GeoDataFrame().assign( 54 | source_system_id = echo["pwsid"], 55 | source_system = "echo", 56 | contributor_id = "echo." 
+ echo["pwsid"], 57 | master_key = echo["pwsid"], 58 | pwsid = echo["pwsid"], 59 | state = echo["fac_state"], 60 | name = echo["fac_name"], 61 | address_line_1 = echo["fac_street"], 62 | city = echo["fac_city"], 63 | county = echo["fac_county"], 64 | zip = echo["fac_zip"], 65 | primacy_agency_code = echo["pwsid"].str[0:2], 66 | centroid_lat = echo["fac_lat"], 67 | centroid_lon = echo["fac_long"], 68 | geometry = echo["geometry"], 69 | centroid_quality = echo["fac_collection_method"], 70 | ) 71 | 72 | #%% 73 | 74 | helpers.load_to_postgis("echo", df) -------------------------------------------------------------------------------- /src/match/map_frs.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | import os 4 | import pandas as pd 5 | import geopandas as gpd 6 | from dotenv import load_dotenv 7 | 8 | import match.helpers as helpers 9 | 10 | load_dotenv() 11 | 12 | pd.options.display.max_columns = None 13 | 14 | DATA_PATH = os.environ["WSB_STAGING_PATH"] 15 | EPSG = os.environ["WSB_EPSG"] 16 | 17 | 18 | #%% 19 | 20 | frs = gpd.read_file(os.path.join(DATA_PATH, "frs.gpkg")) 21 | print("Read FRS file.") 22 | 23 | pwsids = helpers.get_pwsids_of_interest() 24 | print("Retrieved PWSID's of interest.") 25 | 26 | # Bring in echo so that we can compare FRS and avoid duplication 27 | echo = pd.read_csv(DATA_PATH + "/echo.csv", dtype="str", 28 | usecols=["pwsid", "fac_name", "fac_lat", "fac_long"]) 29 | 30 | print("Read ECHO (to avoid duplication)") 31 | 32 | # Filter to those in SDWIS 33 | # And only those with interest_type "WATER TREATMENT PLANT". Other interest types are already in Echo. 34 | frs = frs[ 35 | frs["pwsid"].isin(pwsids) & 36 | (frs["interest_type"] == "WATER TREATMENT PLANT")] 37 | 38 | # We only need a subset of the columns 39 | keep_columns = [ 40 | "registry_id", "pwsid", "state_code", "primary_name", "location_address", 41 | "city_name", "postal_code", "county_name", 42 | "latitude83", "longitude83", "geometry", "ref_point_desc", 43 | "collect_mth_desc"] 44 | 45 | frs = frs[keep_columns] 46 | 47 | # Exclude FRS that are identical to echo on name and lat/long. 48 | # Maybe later, we also want to allow them through if they have different addresses. 49 | frs = frs.loc[frs 50 | # Find matches to echo, then only include those from FRS that _didn't_ match 51 | .reset_index() 52 | .merge(echo, 53 | left_on=["pwsid", "primary_name", "latitude83", "longitude83"], 54 | right_on=["pwsid", "fac_name", "fac_lat", "fac_long"], 55 | how="outer", indicator=True) 56 | .query("_merge == 'left_only'") 57 | ["index"] 58 | ] 59 | print("Filtered FRS") 60 | 61 | # Furthermore, drop entries where all the columns of interest are duplicated 62 | frs = frs.drop_duplicates(subset=list(set(frs.columns) - set("registry_id")), keep="first") 63 | 64 | print(f"{len(frs)} FRS entries remain after removing various duplicates") 65 | 66 | #%% 67 | 68 | df = gpd.GeoDataFrame().assign( 69 | source_system_id = frs["pwsid"], 70 | source_system = "frs", 71 | contributor_id = "frs." + frs["registry_id"] + "." 
+ frs["pwsid"], # Apparently neither registry_id nor pwsid is fully unique, but together they are 72 | master_key = frs["pwsid"], 73 | pwsid = frs["pwsid"], 74 | state = frs["state_code"], 75 | name = frs["primary_name"], 76 | address_line_1 = frs["location_address"], 77 | city = frs["city_name"], 78 | zip = frs["postal_code"], 79 | county = frs["county_name"], 80 | primacy_agency_code = frs["pwsid"].str[0:2], 81 | centroid_lat = frs["latitude83"], 82 | centroid_lon = frs["longitude83"], 83 | geometry = frs["geometry"], 84 | centroid_quality = frs["collect_mth_desc"] 85 | ) 86 | 87 | # Some light cleansing 88 | df["zip"] = df["zip"].str[0:5] 89 | 90 | # %% 91 | helpers.load_to_postgis("frs", df) 92 | -------------------------------------------------------------------------------- /src/match/map_labeled.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | import os 4 | import pandas as pd 5 | import geopandas as gpd 6 | from dotenv import load_dotenv 7 | 8 | import match.helpers as helpers 9 | 10 | load_dotenv() 11 | 12 | pd.options.display.max_columns = None 13 | 14 | DATA_PATH = os.environ["WSB_STAGING_PATH"] 15 | EPSG = os.environ["WSB_EPSG"] 16 | 17 | 18 | #%% 19 | 20 | labeled = gpd.read_file(os.path.join(DATA_PATH, "wsb_labeled_clean.gpkg")) 21 | print("Read Labeled WSB file.") 22 | 23 | pwsids = helpers.get_pwsids_of_interest() 24 | print("Retrieved PWSID's of interest.") 25 | 26 | #%% 27 | # Filter to those in SDWIS 28 | labeled = labeled[labeled["pwsid"].isin(pwsids)] 29 | 30 | #%% 31 | # Null out a few bad lat/long 32 | mask = ( 33 | (labeled["centroid_lat"] < -90) | (labeled["centroid_lat"] > 90) | 34 | (labeled["centroid_long"] < -180) | (labeled["centroid_long"] > 180)) 35 | 36 | labeled.loc[mask, "centroid_lat"] = pd.NA 37 | labeled.loc[mask, "centroid_long"] = pd.NA 38 | 39 | print(f"Nulled out {mask.sum()} bad lat/long.") 40 | 41 | #%% 42 | 43 | df = gpd.GeoDataFrame().assign( 44 | source_system_id = labeled["pwsid"], 45 | source_system = "labeled", 46 | contributor_id = "labeled." 
+ labeled["pwsid"], 47 | master_key = labeled["pwsid"], 48 | pwsid = labeled["pwsid"], 49 | state = labeled["state"], 50 | primacy_agency_code = labeled["pwsid"].str[0:2], 51 | name = labeled["pws_name"], 52 | # address_line_1 = labeled["location_address"], 53 | city = labeled["city"], 54 | # zip = labeled["postal_code"], 55 | county = labeled["county"], 56 | # Need to convert these to EPSG:4326 before we can save them 57 | centroid_lat = labeled["centroid_lat"], 58 | centroid_lon = labeled["centroid_long"], 59 | centroid_quality = "CALCULATED FROM GEOMETRY", 60 | geometry = labeled["geometry"], 61 | geometry_source_detail = labeled["geometry_source_detail"] 62 | ) 63 | 64 | #%% 65 | 66 | print("Labeled record counts:") 67 | print(df 68 | .groupby("primacy_agency_code") 69 | .size() 70 | .sort_index()) 71 | 72 | # %% 73 | helpers.load_to_postgis("labeled", df) -------------------------------------------------------------------------------- /src/match/map_mhp.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | import os 4 | import pandas as pd 5 | import geopandas as gpd 6 | from dotenv import load_dotenv 7 | import match.helpers as helpers 8 | 9 | load_dotenv() 10 | 11 | DATA_PATH = os.environ["WSB_STAGING_PATH"] 12 | EPSG = os.environ["WSB_EPSG"] 13 | 14 | #%% 15 | mhp = gpd.read_file(os.path.join(DATA_PATH, "mhp_clean.gpkg")) 16 | 17 | # A little cleansing 18 | mhp = mhp.replace({"NOT AVAILABLE": pd.NA}) 19 | 20 | #%% 21 | 22 | df = gpd.GeoDataFrame().assign( 23 | source_system_id = mhp["mhp_id"], 24 | source_system = "mhp", 25 | contributor_id = "mhp." + mhp["mhp_id"], 26 | master_key = "UNK-mhp." + mhp["mhp_id"], 27 | name = mhp["mhp_name"], 28 | address_line_1 = mhp["address"], 29 | city = mhp["city"], 30 | state = mhp["state"], 31 | zip = mhp["zipcode"], 32 | county = mhp["county"], 33 | centroid_lat = mhp["latitude"], 34 | centroid_lon = mhp["longitude"], 35 | geometry = mhp["geometry"], 36 | centroid_quality = mhp["val_method"], 37 | geometry_source_detail = mhp["source"] 38 | ) 39 | 40 | #%% 41 | 42 | helpers.load_to_postgis("mhp", df) -------------------------------------------------------------------------------- /src/match/map_sdwis.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | import os 4 | from shapely.geometry import Polygon 5 | import pandas as pd 6 | import geopandas as gpd 7 | from dotenv import load_dotenv 8 | import match.helpers as helpers 9 | 10 | load_dotenv() 11 | 12 | DATA_PATH = os.environ["WSB_STAGING_PATH"] 13 | EPSG = os.environ["WSB_EPSG"] 14 | 15 | #%% ########################################## 16 | # SDWIS 17 | ############################################## 18 | 19 | """ 20 | # SDWIS Schema 21 | 22 | Table relationships: 23 | - water_system 24 | - water_system : water_system_facility is 1 : 0/N (~300 pwsid's missing, N=9.8 mean (wow!)) 25 | - water_system : service_area is 1 : 0/N, but almost 1:N (~1k pwsid's missing, N=1.2 mean) 26 | - water_system : geographic_area is 1 : 0/1, but almost 1:1 (~1k pwsid's missing) 27 | 28 | Here are the useful columns we want from SDWIS and supplemental tables 29 | ws.pwsid - the PK 30 | ws.pws_name - name 31 | ws.pws_activity_code - active or not 32 | ws.pws_type_code - Filtered to "cws" only so maybe we don't need it 33 | ws.address_line1 - "The address applicable to the legal entity", whatever that means 34 | ws.address_line2 35 | ws.city_name 36 | ws.zip_code 37 | ws.primacy_agency_code 38 | wsf.facility_id - 
Optional. This denormalizes the data substantially. 39 | sa.service_area_type_code - for municipal vs mobile home park 40 | ga.city_served - this column is not populated in ws unfortunately 41 | ga.county_served - Maybe this will be helpful? 42 | """ 43 | 44 | ######### 45 | # 1) SDWIS water_systems - PWSID is unique 46 | keep_columns = ["pwsid", "pws_name", "primacy_agency_code", 47 | "address_line1", "address_line2", "city_name", "zip_code", "state_code", 48 | "population_served_count", "service_connections_count", "owner_type_code", 49 | "primacy_type", "is_wholesaler_ind", "primary_source_code"] 50 | 51 | sdwis = pd.read_csv( 52 | os.path.join(DATA_PATH, "sdwis_water_system.csv"), 53 | usecols=keep_columns, 54 | dtype="string") 55 | 56 | pwsids = helpers.get_pwsids_of_interest() 57 | 58 | sdwis = sdwis.loc[sdwis["pwsid"].isin(pwsids)] 59 | 60 | # If state_code is NA, copy from primacy_agency_code 61 | mask = sdwis["state_code"].isna() 62 | sdwis.loc[mask, "state_code"] = sdwis.loc[mask, "primacy_agency_code"] 63 | 64 | 65 | ######### 66 | # Supplement with geographic_area 67 | 68 | # geographic_area - PWSID is unique, very nearly 1:1 with water_system 69 | # ~1k PWSID's appear in water_system but not geographic_area 70 | # We're trying to get city_served and county_served, but these columns aren't always populated 71 | sdwis_ga = pd.read_csv( 72 | os.path.join(DATA_PATH, "sdwis_geographic_area.csv"), 73 | usecols=["pwsid", "city_served", "county_served"], 74 | dtype="string") 75 | 76 | # Verify: pwsid is unique 77 | if not sdwis_ga["pwsid"].is_unique: 78 | raise Exception("Failed assumption: pwsid in geographic_area is assumed to be unique") 79 | 80 | sdwis = sdwis.merge(sdwis_ga, on="pwsid", how="left") 81 | 82 | ######### 83 | # Supplement with service_area 84 | 85 | # This is N:1 with sdwis, which is annoying 86 | # (each pws has on average 1.2 service_area_type_codes) 87 | 88 | # service_area - PWSID + service_area_type_code is unique 89 | # ~1k PWSID's appear in water_system but not service_area 90 | sdwis_sa = pd.read_csv( 91 | os.path.join(DATA_PATH, "sdwis_service_area.csv"), 92 | usecols=["pwsid", "service_area_type_code"]) 93 | 94 | # Filter to the pws's we're interested in 95 | sdwis_sa = sdwis_sa.loc[sdwis_sa["pwsid"].isin(sdwis["pwsid"])] 96 | 97 | # Supplement sdwis. I'll group it into a python list to avoid denormalizing the data. 98 | # Could also do a comma-delimited string. We'll see what seems more useful in practice. 99 | sdwis_sa = sdwis_sa.groupby("pwsid")["service_area_type_code"].apply(list) 100 | 101 | sdwis = sdwis.merge(sdwis_sa, on="pwsid", how="left") 102 | 103 | # Verification 104 | if not sdwis["pwsid"].is_unique: 105 | raise Exception("Expected sdwis pwsid to be unique") 106 | 107 | sdwis.head() 108 | 109 | #%% 110 | 111 | df = gpd.GeoDataFrame().assign( 112 | source_system_id = sdwis["pwsid"], 113 | source_system = "sdwis", 114 | contributor_id = "sdwis." + sdwis["pwsid"], 115 | master_key = sdwis["pwsid"], 116 | pwsid = sdwis["pwsid"], 117 | state = sdwis["state_code"], 118 | name = sdwis["pws_name"], 119 | address_line_1 = sdwis["address_line1"], 120 | address_line_2 = sdwis["address_line2"], 121 | city = sdwis["city_name"], 122 | zip = sdwis["zip_code"], 123 | county = sdwis["county_served"], 124 | city_served = sdwis["city_served"], 125 | geometry = Polygon([]), # Empty geometry.
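# (Note added for clarity: SDWIS is an administrative source with no spatial data,
# so every record gets an empty Polygon here; centroids and geometries are layered
# on later from the matched contributors, e.g. in 5-select_modeled_centroids.py.)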
126 | primacy_agency_code = sdwis["primacy_agency_code"], 127 | primacy_type = sdwis["primacy_type"], 128 | population_served_count = sdwis["population_served_count"], 129 | service_connections_count = sdwis["service_connections_count"].astype("float").astype("int"), 130 | owner_type_code = sdwis["owner_type_code"], 131 | service_area_type_code = sdwis["service_area_type_code"].astype("str"), 132 | is_wholesaler_ind = sdwis["is_wholesaler_ind"], 133 | primary_source_code = sdwis["primary_source_code"], 134 | ) 135 | 136 | df = df.set_crs(epsg=EPSG, allow_override=True) 137 | 138 | #%% 139 | helpers.load_to_postgis("sdwis", df) -------------------------------------------------------------------------------- /src/match/map_tiger.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | import os 4 | import pandas as pd 5 | import geopandas as gpd 6 | from dotenv import load_dotenv 7 | import match.helpers as helpers 8 | 9 | load_dotenv() 10 | 11 | DATA_PATH = os.environ["WSB_STAGING_PATH"] 12 | EPSG = os.environ["WSB_EPSG"] 13 | 14 | # Bring in the FIPS -> State Abbr crosswalk 15 | state_cw = (pd 16 | .read_csv("../crosswalks/state_fips_to_abbr.csv", dtype="str") 17 | .set_index("code")) 18 | 19 | #%% 20 | 21 | tiger = gpd.read_file(os.path.join(DATA_PATH, "tiger_places_clean.gpkg")) 22 | 23 | # Ensure strings with leading zeros 24 | tiger["statefp"] = tiger["statefp"].astype("int").astype("str").str.zfill(2) 25 | 26 | # Augment with state code 27 | tiger = (tiger 28 | .join(state_cw, on="statefp", how="left")) 29 | 30 | # TODO - It would be nice to also know county, zip code, etc., 31 | # but it doesn't seem like we can get this from the data as it stands. 32 | # Might need a lookup table. 33 | 34 | #%% 35 | 36 | df = gpd.GeoDataFrame().assign( 37 | source_system_id = tiger["geoid"], 38 | source_system = "tiger", 39 | contributor_id = "tiger." + tiger["geoid"], 40 | master_key = "UNK-tiger." + tiger["geoid"], 41 | name = tiger["name"], 42 | state = tiger["state"], 43 | population_served_count = tiger["population"].astype(pd.Int64Dtype()), 44 | geometry = tiger["geometry"], 45 | centroid_lat = tiger["intptlat"], 46 | centroid_lon = tiger["intptlon"], 47 | centroid_quality = "CALCULATED FROM GEOMETRY", 48 | geometry_source_detail = "2020 Census" 49 | ) 50 | 51 | #%% 52 | 53 | helpers.load_to_postgis("tiger", df) -------------------------------------------------------------------------------- /src/match/map_ucmr.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | import os 4 | import geopandas as gpd 5 | import pandas as pd 6 | from dotenv import load_dotenv 7 | import match.helpers as helpers 8 | 9 | load_dotenv() 10 | 11 | DATA_PATH = os.environ["WSB_STAGING_PATH"] 12 | EPSG = os.environ["WSB_EPSG"] 13 | 14 | #%% 15 | 16 | ucmr = pd.read_csv(os.path.join(DATA_PATH, "ucmr.csv")) 17 | 18 | ucmr = gpd.GeoDataFrame( 19 | ucmr, 20 | geometry=gpd.points_from_xy(ucmr["centroid_long"], ucmr["centroid_lat"]), 21 | crs="EPSG:4326") 22 | 23 | print("Loaded UCMR") 24 | 25 | pwsids = helpers.get_pwsids_of_interest() 26 | ucmr = ucmr[ucmr["pwsid"].isin(pwsids)] 27 | print("Filtered to PWSID's of interest.") 28 | 29 | #%% 30 | 31 | df = gpd.GeoDataFrame().assign( 32 | source_system_id = ucmr["pwsid"], 33 | source_system = "ucmr", 34 | contributor_id = "ucmr." 
+ ucmr["pwsid"], 35 | master_key = ucmr["pwsid"], 36 | pwsid = ucmr["pwsid"], 37 | zip = ucmr["zipcode"].str[0:5], 38 | centroid_lat = ucmr["centroid_lat"], 39 | centroid_lon = ucmr["centroid_long"], 40 | geometry = ucmr["geometry"], 41 | centroid_quality = "ZIP CODE CENTROID" 42 | ) 43 | 44 | #%% 45 | 46 | helpers.load_to_postgis("ucmr", df) -------------------------------------------------------------------------------- /src/match/match_scorer.py: -------------------------------------------------------------------------------- 1 | #%% 2 | 3 | import os 4 | from typing import List, Optional 5 | import numpy as np 6 | import pandas as pd 7 | import geopandas as gpd 8 | import sqlalchemy as sa 9 | from dotenv import load_dotenv 10 | 11 | load_dotenv() 12 | 13 | STAGING_PATH = os.environ["WSB_STAGING_PATH"] 14 | EPSG = os.environ["WSB_EPSG"] 15 | PROJ = os.environ["WSB_EPSG_AW"] 16 | 17 | # Connect to local PostGIS instance 18 | conn = sa.create_engine(os.environ["POSTGIS_CONN_STR"]) 19 | 20 | class MatchScorer: 21 | 22 | def __init__(self): 23 | self.boundary_df = (self.get_data("tiger", ["contributor_id", "geometry"]) 24 | .set_index("contributor_id")) 25 | 26 | self.labeled_df = self.get_data("labeled", ["pwsid", "master_key", "geometry"]) 27 | 28 | def score_tiger_matches(self, matches: pd.DataFrame, proximity_buffer: int = 1000) -> pd.DataFrame: 29 | 30 | """ 31 | Given a set of matches to boundary data, compare it to known geometries 32 | (labeled data) to evaluate whether each match is good or bad. This can 33 | be used to evaluate the effectiveness of our matching. 34 | 35 | The match DF should have columns: master_key, candidate_contributor_id 36 | """ 37 | 38 | # Extract a series of "known geometries" from the labeled geometry data 39 | known_geometries = gpd.GeoSeries( 40 | self.labeled_df[["pwsid", "geometry"]] 41 | .merge(matches[["master_key", "candidate_contributor_id"]], left_on="pwsid", right_on="master_key") 42 | .set_index(["pwsid", "candidate_contributor_id"]) 43 | ["geometry"]) 44 | 45 | # Extract a series of "potential geometries" from the matched boundary data 46 | candidate_matches = gpd.GeoDataFrame(matches 47 | .join(self.boundary_df["geometry"], on="candidate_contributor_id") 48 | .rename(columns={"master_key": "pwsid"}) 49 | .set_index(["pwsid", "candidate_contributor_id"]) 50 | [["geometry"]]) 51 | 52 | # Filter to only the PWS's that appear in both series 53 | # 7,423 match 54 | known_geometries = (known_geometries 55 | .loc[known_geometries.index.isin(candidate_matches.index)] 56 | .sort_index()) 57 | 58 | candidate_matches = (candidate_matches 59 | .loc[candidate_matches.index.isin(known_geometries.index)] 60 | .sort_index()) 61 | 62 | print("Retrieved and aligned data.") 63 | 64 | # Switch to a projected CRS 65 | known_geometries = known_geometries.to_crs(PROJ) 66 | candidate_matches = candidate_matches.to_crs(PROJ) 67 | 68 | print("Converted to a projected CRS.") 69 | 70 | distances = known_geometries.distance(candidate_matches, align=True) 71 | print("Calculated distances.") 72 | 73 | # A few empty labeled geometries cause NA distances. 
Filter only non-NA 74 | distances = distances[distances.notna()] 75 | distances.name = "distance" 76 | 77 | # re-join to the match table 78 | candidate_matches = candidate_matches.join(distances, on=["pwsid", "candidate_contributor_id"], how="inner") 79 | 80 | # Assign a score - 1 if a good match, 0 if not a good match 81 | candidate_matches["score"] = candidate_matches["distance"] <= proximity_buffer 82 | 83 | print("Assigned scores.") 84 | 85 | return candidate_matches 86 | 87 | def get_data(self, system: str, columns: List[str] = ["*"]) -> pd.DataFrame: 88 | print(f"Pulling {system} data from database...", end="") 89 | 90 | df = gpd.GeoDataFrame.from_postgis(f""" 91 | SELECT {", ".join(columns)} 92 | FROM pws_contributors 93 | WHERE source_system = '{system}';""", 94 | conn, geom_col="geometry") 95 | 96 | print("done.") 97 | 98 | return df -------------------------------------------------------------------------------- /src/model/02_linear.R: -------------------------------------------------------------------------------- 1 | # linear model ------------------------------------------------------------ 2 | 3 | library(tidyverse) 4 | library(tidymodels) 5 | library(sf) 6 | library(fs) 7 | 8 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 9 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 10 | 11 | # read dataset and log transform the response - only for linear model 12 | d <- read_csv(path(staging_path, "model_input_clean.csv")) %>% 13 | mutate(radius = log10(radius), 14 | # multiply correlated predictors 15 | density = population_served_count * service_connections_count) 16 | 17 | # Stash lat/long 18 | lat_long <- d %>% 19 | select(pwsid, centroid_lon, centroid_lat) %>% 20 | st_as_sf(coords = c("centroid_lon", "centroid_lat"), crs = epsg) %>% 21 | suppressMessages() 22 | 23 | cat("\n\nRead `model_input_clean.csv` from preprocess script.\n") 24 | 25 | # unlabeled data (du) and labeled data (dl) 26 | du <- d %>% filter(is.na(radius)) 27 | dl <- d %>% filter(!is.na(radius)) 28 | 29 | # split labeled data (dl) into train and test with stratified random sampling 30 | # in each of the radius quartiles to account for the lognormal distribution 31 | # of the response variable (radius) and avoid overfitting to small radius obs 32 | set.seed(55) 33 | dl_split <- initial_split(dl, prop = 0.8, strata = radius) 34 | train <- training(dl_split) 35 | test <- testing(dl_split) 36 | 37 | cat("Split data into train and test sets.\n") 38 | 39 | # lm recipe 40 | lm_recipe <- 41 | # specify the model - interaction terms come later 42 | recipe( 43 | radius ~ 44 | service_connections_count + 45 | # use the cleaned owner type code from preprocess.R, which converts 46 | # 2 "N" owner type codes to "M" so that models can evaluate 47 | owner_type_code_clean + 48 | satc + 49 | is_wholesaler_ind, 50 | data = train 51 | ) %>% 52 | # convert predictors to log10 53 | step_log(service_connections_count, base = 10) %>% 54 | # encode categorical variables 55 | step_dummy(all_nominal_predictors()) %>% 56 | # specify interaction effects 57 | step_interact(~service_connections_count:starts_with("owner_type_code")) %>% 58 | step_interact(~service_connections_count:starts_with("satc")) %>% 59 | step_interact(~service_connections_count:starts_with("is_wholesaler_ind")) 60 | 61 | # specify model and engine for linear model and rf 62 | lm_mod <- linear_reg() %>% set_engine("lm") 63 | 64 | # lm workflow 65 | lm_wflow <- 66 | workflow() %>% 67 | add_model(lm_mod) %>% 68 | add_recipe(lm_recipe) 69 | 70 | # fit the linear model on the 
training set 71 | lm_fit <- fit(lm_wflow, train) 72 | cat("Fit model on training set.\n") 73 | 74 | # predict on the test set and bind mean predictions and CIs 75 | # lm_test_res <- test %>% 76 | # select(radius) %>% 77 | # bind_cols(predict(lm_fit, test)) %>% 78 | # bind_cols(predict(lm_fit, test, type = "conf_int")) 79 | 80 | # plot residuals 81 | # lm_test_res %>% 82 | # ggplot(aes(radius, .pred)) + 83 | # geom_point(alpha = 0.4) + 84 | # geom_abline(lty = 2, color = "red") + 85 | # labs(y = "Predicted radius (log10)", x = "Radius (log10)") + 86 | # # scale and size the x- and y-axis uniformly 87 | # coord_obs_pred() 88 | 89 | # RMSE 90 | # lm_metrics <- metric_set(rmse, rsq, mae) 91 | # lm_metrics(lm_test_res, truth = radius, estimate = .pred) 92 | 93 | 94 | # apply modeled radii to centroids for all data and write ----------------- 95 | 96 | # fit the model on all data, apply the spatial buffer, and write 97 | t3m <- d %>% 98 | select(pwsid, radius, centroid_lat, centroid_lon, centroid_quality, geometry_source_detail) %>% 99 | bind_cols(predict(lm_fit, d)) %>% 100 | bind_cols(predict(lm_fit, d, type = "conf_int", level = 0.95)) %>% 101 | # exponentiate results back to median (unbiased), and 5/95 CIs 102 | mutate(across(c("radius", starts_with(".")), ~10^(.x))) %>% 103 | # add matched output lat/lng centroids and make spatial 104 | left_join(lat_long, by = "pwsid") %>% 105 | st_as_sf() %>% 106 | # convert to projected metric CRS for accurate, efficient buffer. 107 | # The project CRS (4326) is inappropriate because units are degrees. 108 | st_transform(3310) 109 | cat("Fit model on all data and added 5/95 CIs.\n") 110 | 111 | # create buffers for median, CI lower, and CI upper (5/95) predictions 112 | # (in units meters) and then transform back into project CRS 113 | t3m_med <- st_buffer(t3m, t3m$.pred ) %>% st_transform(epsg) 114 | t3m_cil <- st_buffer(t3m, t3m$.pred_lower) %>% st_transform(epsg) 115 | t3m_ciu <- st_buffer(t3m, t3m$.pred_upper) %>% st_transform(epsg) 116 | cat("Created median and 5/95 CI buffers.\n") 117 | 118 | # paths to write modeled data 119 | path_t3m_med <- path(staging_path, "tier3_median.gpkg") 120 | path_t3m_cil <- path(staging_path, "tier3_ci_lower_05.gpkg") 121 | path_t3m_ciu <- path(staging_path, "tier3_ci_upper_95.gpkg") 122 | 123 | # write and delete layer if it already exists 124 | st_write(t3m_med, path_t3m_med, delete_dsn = TRUE, quiet = TRUE) 125 | st_write(t3m_cil, path_t3m_cil, delete_dsn = TRUE, quiet = TRUE) 126 | st_write(t3m_ciu, path_t3m_ciu, delete_dsn = TRUE, quiet = TRUE) 127 | cat("Wrote Tier 3 model output to `WSB_STAGING_PATH`.\n") 128 | -------------------------------------------------------------------------------- /src/model/README.md: -------------------------------------------------------------------------------- 1 | # Tier 3 model 2 | 3 | _Last updated 2022-03-28_ 4 | 5 | This subdirectory depends on the PostGIS database, in particular the records created by `5-select_modeled_centroids.py`. It contains two scripts: one preprocesses data for the Tier 3 model, and the other generates predictions and writes them to the staging path for `src/combine_tiers.py`, which compiles the final TEMM spatial layer. 6 | 7 | In order, run `01_preprocess.R` followed by `02_linear.R` (see the sketch below). 8 | 9 | For preprocessing and modeling documentation, see: `src/analysis/sandbox/model_explore/model_march.html`.
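A minimal sketch of that run order (an illustrative snippet, not a script that ships with the repo; it assumes the repo root as the working directory and that the `WSB_STAGING_PATH` and PostGIS environment variables are already set, as elsewhere in this pipeline):

```r
# Hypothetical driver for the Tier 3 model scripts.
# 01_preprocess.R pulls the modeled records from the database and writes
# model_input_clean.csv to the staging path; 02_linear.R fits the linear
# model and writes the tier3_*.gpkg buffer layers for src/combine_tiers.py.
source("src/model/01_preprocess.R")
source("src/model/02_linear.R")
```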
10 | 11 | The code herein was originally prototyped in the "model_explore" sandbox, which contains additional models (random forest, xgboost) and superseded code (archived preprocess and linear model scripts). 12 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_ar.R: -------------------------------------------------------------------------------- 1 | # transform AR water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform AR polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Lookup for AR pwsids where name does not automatically match to shapefile 19 | pwsid_supp <- read_csv("crosswalks/ar_pwsid_lookup.csv") 20 | 21 | # Read layer for AR water service boundaries, clean, transform CRS 22 | ar_wsb <- st_read(path(data_path, "boundary/ar/PUBLIC_WATER_SYSTEMS.shp")) %>% 23 | # clean whitespace 24 | f_clean_whitespace_nas() %>% 25 | # transform to area weighted CRS 26 | st_transform(epsg_aw) %>% 27 | # correct invalid geometries 28 | st_make_valid() %>% 29 | janitor::clean_names() 30 | 31 | cat("Read AR boundary layer; cleaned whitespace; corrected geometries.\n ") 32 | 33 | # Match water system names to sdwis to get pwsid 34 | # Note that comparing to this csv requires having downloaded and cleaned the SDWIS data 35 | # TODO: identify best strategy for this 36 | 37 | # Get active cws in AR 38 | ar_sdwis <- read_csv(path(staging_path, "sdwis_water_system.csv")) %>% 39 | filter(primacy_agency_code == "AR", 40 | pws_activity_code == "A") 41 | 42 | # Select names and object ids from spatial dataset 43 | ar_names <- ar_wsb %>% select(objectid, pws_name) %>% 44 | st_drop_geometry() 45 | 46 | # Join spatial dataset system names with sdwis 47 | ar_pwsids <- ar_names %>% left_join(ar_sdwis, by = c("pws_name")) %>% 48 | select(objectid, pws_name, pwsid) 49 | 50 | # Pull out the rows with missing ids 51 | # From this list, pwsids were manually assigned to create the lookup 52 | na_pwsids <- ar_pwsids %>% filter(is.na(pwsid)) %>% 53 | left_join(pwsid_supp, by = c("pws_name")) %>% 54 | select(objectid, pws_name, pwsid.y) %>% 55 | rename(pwsid = pwsid.y) 56 | 57 | # Concatenate pwsid dataframes 58 | ar_pwsids <- ar_pwsids %>% 59 | rbind(na_pwsids) %>% 60 | distinct() %>% 61 | filter(!is.na(pwsid)) 62 | 63 | # Rejoin pwsid with shapefiles 64 | ar_wsb <- ar_wsb %>% 65 | left_join(ar_pwsids, by = c("objectid", "pws_name")) %>% 66 | # drop 12 geometries with no matching pwsid 67 | filter(!is.na(pwsid)) 68 | 69 | # Compute centroids, convex hulls, and radius assuming circular 70 | ar_wsb <- ar_wsb %>% 71 | bind_rows() %>% 72 | mutate( 73 | state = "AR", 74 | # importantly, area calculations occur in area weighted epsg 75 | st_areashape = st_area(geometry), 76 | convex_hull = st_geometry(st_convex_hull(geometry)), 77 | area_hull = st_area(convex_hull), 78 | radius = sqrt(area_hull/pi) 79 | ) %>% 80 | # transform back to standard epsg 81 | st_transform(epsg) %>% 82 | # compute centroids 83 | mutate( 84 | centroid = st_geometry(st_centroid(geometry)), 85 | centroid_long = st_coordinates(centroid)[, 1], 86 |
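# (clarifying note: st_coordinates() returns an X/Y coordinate matrix, so column 1
# is the X coordinate (longitude) and column 2 the Y (latitude), assuming the
# standard epsg is a geographic CRS such as 4326)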
centroid_lat = st_coordinates(centroid)[, 2], 87 | ) %>% 88 | # select columns and rename for staging 89 | select( 90 | # data source columns 91 | pwsid, 92 | pws_name, 93 | state, 94 | # county, 95 | # city, 96 | # owner, 97 | # geospatial columns 98 | st_areashape, 99 | centroid_long, 100 | centroid_lat, 101 | radius, 102 | geometry 103 | ) 104 | cat("Computed area, centroids, and radii from convex hulls.\n") 105 | cat("Combined into one layer; added geospatial columns.\n") 106 | 107 | # delete layer if it exists, then write to geopackage 108 | path_out <- path(staging_path, "wsb_labeled_ar.gpkg") 109 | if(file_exists(path_out)) file_delete(path_out) 110 | 111 | st_write(ar_wsb, path_out) 112 | cat("Wrote clean, labeled data to file.\n\n\n") -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_az.R: -------------------------------------------------------------------------------- 1 | # transform AZ water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform AZ polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for AZ water service boundaries, clean, transform CRS 19 | az_wsb <- st_read(path(data_path, "boundary/az/az.geojson")) %>% 20 | # clean whitespace 21 | f_clean_whitespace_nas() %>% 22 | # transform to area weighted CRS 23 | st_transform(epsg_aw) %>% 24 | # correct invalid geometries 25 | st_make_valid() 26 | 27 | cat("Read AZ boundary layer; cleaned whitespace; corrected geometries.\n") 28 | 29 | # Compute centroids, convex hulls, and radius assuming circular 30 | az_wsb <- az_wsb %>% 31 | bind_rows() %>% 32 | mutate( 33 | state = "AZ", 34 | # importantly, area calculations occur in area weighted epsg 35 | st_areashape = st_area(geometry), 36 | convex_hull = st_geometry(st_convex_hull(geometry)), 37 | area_hull = st_area(convex_hull), 38 | radius = sqrt(area_hull/pi) 39 | ) %>% 40 | # transform back to standard epsg 41 | st_transform(epsg) %>% 42 | # compute centroids 43 | mutate( 44 | centroid = st_geometry(st_centroid(geometry)), 45 | centroid_long = st_coordinates(centroid)[, 1], 46 | centroid_lat = st_coordinates(centroid)[, 2], 47 | ) %>% 48 | # select columns and rename for staging 49 | select( 50 | # data source columns 51 | pwsid = ADEQ_ID, 52 | pws_name = CWS_NAME, 53 | state, 54 | county = COUNTY, 55 | city = CITY_SRVD, 56 | owner = OWNER_NAME, 57 | # geospatial columns 58 | st_areashape, 59 | centroid_long, 60 | centroid_lat, 61 | radius, 62 | geometry 63 | ) 64 | cat("Computed area, centroids, and radii from convex hulls.\n") 65 | cat("Combined into one layer; added geospatial columns.\n") 66 | 67 | # delete layer if it exists, then write to geopackage 68 | path_out <- path(staging_path, "wsb_labeled_az.gpkg") 69 | if(file_exists(path_out)) file_delete(path_out) 70 | 71 | st_write(az_wsb, path_out) 72 | cat("Wrote clean, labeled data to file.\n\n\n") 73 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_ca.R: -------------------------------------------------------------------------------- 1 | # 
transform CA water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform CA polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for CA water service boundaries, clean, transform CRS 19 | ca_wsb <- st_read( 20 | dsn = path(data_path, "boundary/ca/SABL_Public_220207/", 21 | "SABL_Public_220207.shp")) %>% 22 | # clean whitespace 23 | f_clean_whitespace_nas() %>% 24 | # transform to area weighted CRS 25 | st_transform(epsg_aw) %>% 26 | # correct invalid geometries 27 | st_make_valid() 28 | 29 | cat("Read CA boundary layer; cleaned whitespace; corrected geometries.\n ") 30 | 31 | ca_wsb <- ca_wsb %>% 32 | bind_rows() %>% 33 | # compute area, convex hulls, and radius assuming circular 34 | mutate( 35 | state = "CA", 36 | # importantly, area calculations occur in area weighted epsg 37 | st_areashape = st_area(geometry), 38 | convex_hull = st_geometry(st_convex_hull(geometry)), 39 | area_hull = st_area(convex_hull), 40 | radius = sqrt(area_hull/pi) 41 | ) %>% 42 | # transform back to standard epsg 43 | st_transform(epsg) %>% 44 | # compute centroid 45 | mutate ( 46 | centroid = st_geometry(st_centroid(geometry)), 47 | centroid_long = st_coordinates(centroid)[, 1], 48 | centroid_lat = st_coordinates(centroid)[, 2] 49 | ) %>% 50 | # select columns and rename for staging 51 | select( 52 | # data source columns 53 | pwsid = WATER_SYST, 54 | pws_name = WATER_SY_1, 55 | state, 56 | county = COUNTY, 57 | # city, 58 | # owner, 59 | # geospatial columns 60 | st_areashape, 61 | centroid_long, 62 | centroid_lat, 63 | radius, 64 | geometry 65 | ) 66 | cat("Computed area, centroids, and radii from convex hulls.\n") 67 | cat("Combined into one layer; added geospatial columns.\n") 68 | 69 | # delete layer if it exists, then write to geopackage 70 | path_out <- path(staging_path, "wsb_labeled_ca.gpkg") 71 | if(file_exists(path_out)) file_delete(path_out) 72 | 73 | st_write(ca_wsb, path_out) 74 | cat("Wrote clean, labeled data to file.\n\n\n") 75 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_ct.R: -------------------------------------------------------------------------------- 1 | # transform CT water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform CT polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for CT water service boundaries, clean, transform CRS 19 | ct_wsb <- st_read( 20 | path(data_path, "boundary/ct/Buffered_Community_PWS_Service_Areas.shp")) %>% 21 | # clean whitespace 22 | f_clean_whitespace_nas() %>% 23 | # transform to area weighted CRS 24 | st_transform(epsg_aw) %>% 25 | # correct invalid geometries 26 | st_make_valid() 27 | 28 | cat("Read CT 
boundary layer; cleaned whitespace; corrected geometries.\n ") 29 | 30 | # Compute centroids, convex hulls, and radius assuming circular 31 | ct_wsb <- ct_wsb %>% 32 | bind_rows() %>% 33 | mutate( 34 | state = "CT", 35 | # importantly, area calculations occur in area weighted epsg 36 | st_areashape = st_area(geometry), 37 | convex_hull = st_geometry(st_convex_hull(geometry)), 38 | area_hull = st_area(convex_hull), 39 | radius = sqrt(area_hull/pi) 40 | ) %>% 41 | # transform back to standard epsg 42 | st_transform(epsg) %>% 43 | # compute centroids 44 | mutate( 45 | centroid = st_geometry(st_centroid(geometry)), 46 | centroid_long = st_coordinates(centroid)[, 1], 47 | centroid_lat = st_coordinates(centroid)[, 2], 48 | ) %>% 49 | # select columns and rename for staging 50 | select( 51 | # data source columns 52 | pwsid = pwsid, 53 | pws_name = pws_name, 54 | state, 55 | # county, 56 | # city, 57 | # owner, 58 | # geospatial columns 59 | st_areashape, 60 | centroid_long, 61 | centroid_lat, 62 | radius, 63 | geometry 64 | ) 65 | cat("Computed area, centroids, and radii from convex hulls.\n") 66 | cat("Combined into one layer; added geospatial columns.\n") 67 | 68 | # delete layer if it exists, then write to geopackage 69 | path_out <- path(staging_path, "wsb_labeled_ct.gpkg") 70 | if(file_exists(path_out)) file_delete(path_out) 71 | 72 | st_write(ct_wsb, path_out) 73 | cat("Wrote clean, labeled data to file.\n\n\n") 74 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_il.R: -------------------------------------------------------------------------------- 1 | # transform IL water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform IL polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for IL water service boundaries, clean, transform CRS 19 | il_wsb <- st_read( 20 | dsn = path(data_path, "boundary/il/Illinois_Municipal_Water_use_2012/Municipal_Water_Use_Statewide.gdb"), 21 | layer = "Municipal_Use_2012") %>% 22 | rename(geometry = "Shape") %>% 23 | # clean whitespace 24 | f_clean_whitespace_nas() %>% 25 | # transform to area weighted CRS 26 | st_transform(epsg_aw) %>% 27 | # correct invalid geometries 28 | st_make_valid() %>% 29 | janitor::clean_names() 30 | 31 | cat("Read IL boundary layer; cleaned whitespace; corrected geometries.\n ") 32 | 33 | # Compute centroids, convex hulls, and radius assuming circular 34 | # Combine data and merge geometries for rows with duplicate pwsids 35 | il_wsb <- il_wsb %>% 36 | mutate( 37 | state = "IL", 38 | # 161 cities have no pwsid listed 39 | # Of these, 123 have a seller pwsid listed 40 | pwsid = if_else(is.na(fac_id), seller_fac_id, fac_id), 41 | # Facility id blank = seller system, with name in "buys from" 42 | # Spot checking on name_1 for cases where fac_id is listed looks consistent 43 | pws_name = if_else(!is.na(fac_id), buys_from, name_1), 44 | # Preliminary geometry calculations 45 | # Calculate area sums and convex hulls 46 | st_areashape = st_area(geometry), 47 | convex_hull = st_geometry(st_convex_hull(geometry)), 48 | area_hull = 
st_area(convex_hull), 49 | ) %>% 50 | group_by(pwsid) %>% 51 | # mutate these new columns, knowing full well that duplicate rows 52 | # will be created, but that they will be dropped in the next step 53 | mutate( 54 | # combine all fragmented geometries 55 | geometry = st_union(geometry), 56 | # new area is the sum of the area of all polygons 57 | st_areashape = sum(st_areashape), 58 | area_hull = sum(area_hull), 59 | # new radius is calculated from the new area 60 | radius = sqrt(area_hull/pi), 61 | # combine data into list-formatted strings for character columns 62 | across(where(is.character), ~toString(unique(.))) 63 | ) %>% 64 | # only take the first result from each group 65 | slice(1) %>% 66 | ungroup() %>% 67 | # convert back to the project standard epsg 68 | st_transform(epsg) %>% 69 | # compute new centroids and note that when multipolygons are separated 70 | # by space, these are suspect and should not be used. Importantly, this 71 | # calculation occurs in the EPSG consistent with other staged data! 72 | mutate( 73 | centroid = st_geometry(st_centroid(geometry)), 74 | centroid_long = st_coordinates(centroid)[, 1], 75 | centroid_lat = st_coordinates(centroid)[, 2] 76 | ) %>% 77 | # select columns and rename for staging 78 | select( 79 | # data source columns 80 | pwsid, 81 | pws_name, 82 | state, 83 | # county, 84 | city = name_1, 85 | # owner, 86 | # geospatial columns 87 | st_areashape, 88 | centroid_long, 89 | centroid_lat, 90 | radius, 91 | geometry 92 | ) 93 | cat("Computed area, centroids, and radii from convex hulls.\n") 94 | cat("Combined into one layer; added geospatial columns.\n") 95 | 96 | # verify that there is only one pwsid per geometry 97 | n <- il_wsb %>% 98 | count(pwsid) %>% 99 | filter(n > 1) %>% 100 | nrow() 101 | cat(n, "duplicate pwsids in labeled data following fix.\n") 102 | 103 | # delete layer if it exists, then write to geopackage 104 | path_out <- path(staging_path, "wsb_labeled_il.gpkg") 105 | if(file_exists(path_out)) file_delete(path_out) 106 | 107 | st_write(il_wsb, path_out) 108 | cat("Wrote clean, labeled data to file.\n\n\n") 109 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_ks.R: -------------------------------------------------------------------------------- 1 | # transform KS water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform KS polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for KS water service boundaries, clean, transform CRS 19 | ks_wsb <- st_read(path(data_path, "boundary/ks/PWS_bnd_2021_0430.shp")) %>% 20 | # clean whitespace 21 | f_clean_whitespace_nas() %>% 22 | # transform to area weighted CRS 23 | st_transform(epsg_aw) %>% 24 | # correct invalid geometries 25 | st_make_valid() 26 | 27 | cat("Read KS boundary layer; cleaned whitespace; corrected geometries.\n ") 28 | 29 | # Compute centroids, convex hulls, and radius assuming circular 30 | ks_wsb <- ks_wsb %>% 31 | bind_rows() %>% 32 | mutate( 33 | state = "KS", 34 | # importantly, area calculations occur in area weighted epsg 35 | st_areashape = 
st_area(geometry), 36 | convex_hull = st_geometry(st_convex_hull(geometry)), 37 | area_hull = st_area(convex_hull), 38 | radius = sqrt(area_hull/pi) 39 | ) %>% 40 | # transform back to standard epsg 41 | st_transform(epsg) %>% 42 | # compute centroids 43 | mutate( 44 | centroid = st_geometry(st_centroid(geometry)), 45 | centroid_long = st_coordinates(centroid)[, 1], 46 | centroid_lat = st_coordinates(centroid)[, 2], 47 | ) %>% 48 | # select columns and rename for staging 49 | select( 50 | # data source columns 51 | pwsid = FED_ID, 52 | pws_name = NAMEWCPSTA, 53 | state, 54 | # county, 55 | # city, 56 | # owner, 57 | # geospatial columns 58 | st_areashape, 59 | centroid_long, 60 | centroid_lat, 61 | radius, 62 | geometry 63 | ) 64 | cat("Computed area, centroids, and radii from convex hulls.\n") 65 | cat("Combined into one layer; added geospatial columns.\n") 66 | 67 | # delete layer if it exists, then write to geopackage 68 | path_out <- path(staging_path, "wsb_labeled_ks.gpkg") 69 | if(file_exists(path_out)) file_delete(path_out) 70 | 71 | st_write(ks_wsb, path_out) 72 | cat("Wrote clean, labeled data to file.\n\n\n") 73 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_mo.R: -------------------------------------------------------------------------------- 1 | # transform MO water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform MO polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for MO water service boundaries, clean, transform CRS 19 | mo_wsb <- st_read(dsn = path(data_path, "boundary/mo/mo.geojson")) %>% 20 | # clean whitespace 21 | f_clean_whitespace_nas() %>% 22 | # drop multiple systems in one boundary, for now 23 | filter(str_detect(IPWS, "^MO\\d{5}")) %>% 24 | # transform to area weighted CRS 25 | st_transform(epsg_aw) %>% 26 | # correct invalid geometries 27 | st_make_valid() 28 | 29 | cat("Read MO boundary layer; cleaned whitespace; corrected geometries.\n ") 30 | 31 | # Compute centroids, convex hulls, and radius assuming circular 32 | mo_wsb <- mo_wsb %>% 33 | bind_rows() %>% 34 | mutate( 35 | state = "MO", 36 | # importantly, area calculations occur in area weighted epsg 37 | st_areashape = st_area(geometry), 38 | convex_hull = st_geometry(st_convex_hull(geometry)), 39 | area_hull = st_area(convex_hull), 40 | radius = sqrt(area_hull/pi) 41 | ) %>% 42 | # transform back to standard epsg 43 | st_transform(epsg) %>% 44 | # compute centroid 45 | mutate ( 46 | centroid = st_geometry(st_centroid(geometry)), 47 | centroid_long = st_coordinates(centroid)[, 1], 48 | centroid_lat = st_coordinates(centroid)[, 2] 49 | ) %>% 50 | # select columns and rename for staging 51 | select( 52 | # data source columns 53 | pwsid = IPWS, 54 | pws_name = PWSSNAME, 55 | state, 56 | county = COUNTY, 57 | # city, 58 | # owner, 59 | # geospatial columns 60 | st_areashape, 61 | centroid_long, 62 | centroid_lat, 63 | radius, 64 | geometry 65 | ) 66 | cat("Computed area, centroids, and radii from convex hulls.\n") 67 | cat("Combined into one layer; added geospatial columns.\n") 68 | 69 | 70 | # 
delete layer if it exists, then write to geopackage 71 | path_out <- path(staging_path, "wsb_labeled_mo.gpkg") 72 | if(file_exists(path_out)) file_delete(path_out) 73 | 74 | st_write(mo_wsb, path_out) 75 | cat("Wrote clean, labeled data to file.\n\n\n") 76 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_nc.R: -------------------------------------------------------------------------------- 1 | # transform NC water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform NC polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for NC water service boundaries, clean, transform CRS 19 | nc_wsb <- st_read(dsn = path(data_path, "boundary/nc/nc.geojson")) %>% 20 | # clean whitespace 21 | f_clean_whitespace_nas() %>% 22 | # transform to area weighted CRS 23 | st_transform(epsg_aw) %>% 24 | # correct invalid geometries 25 | st_make_valid() 26 | 27 | cat("Read NC boundary layer; cleaned whitespace; corrected geometries.\n ") 28 | 29 | # Compute centroids, convex hulls, and radius assuming circular 30 | nc_wsb <- nc_wsb %>% 31 | bind_rows() %>% 32 | mutate( 33 | state = "NC", 34 | wasyid = paste0("NC", wasyid), 35 | # importantly, area calculations occur in area weighted epsg 36 | st_areashape = st_area(geometry), 37 | convex_hull = st_geometry(st_convex_hull(geometry)), 38 | area_hull = st_area(convex_hull), 39 | radius = sqrt(area_hull/pi) 40 | ) %>% 41 | # transform back to standard epsg 42 | st_transform(epsg) %>% 43 | # compute centroids 44 | mutate( 45 | centroid = st_geometry(st_centroid(geometry)), 46 | centroid_long = st_coordinates(centroid)[, 1], 47 | centroid_lat = st_coordinates(centroid)[, 2], 48 | ) %>% 49 | # select columns and rename for staging 50 | select( 51 | # data source columns 52 | pwsid = wasyid, 53 | pws_name = wasyname, 54 | state, 55 | county = wapcs, 56 | # city, 57 | # owner, 58 | # geospatial columns 59 | st_areashape, 60 | centroid_long, 61 | centroid_lat, 62 | radius, 63 | geometry 64 | ) 65 | cat("Computed area, centroids, and radii from convex hulls.\n") 66 | cat("Combined into one layer; added geospatial columns.\n") 67 | 68 | 69 | # delete layer if it exists, then write to geopackage 70 | path_out <- path(staging_path, "wsb_labeled_nc.gpkg") 71 | if(file_exists(path_out)) file_delete(path_out) 72 | 73 | st_write(nc_wsb, path_out) 74 | cat("Wrote clean, labeled data to file.\n\n\n") 75 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_nj.R: -------------------------------------------------------------------------------- 1 | # transform NJ water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform NJ polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- 
as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for NJ water service boundaries, clean, transform CRS 19 | nj_wsb <- st_read(dsn = path(data_path, "boundary/nj/nj.geojson")) %>% 20 | # clean whitespace 21 | f_clean_whitespace_nas() %>% 22 | # transform to area weighted CRS 23 | st_transform(epsg_aw) %>% 24 | # correct invalid geometries 25 | st_make_valid() 26 | 27 | cat("Read NJ boundary layer; cleaned whitespace; corrected geometries.\n ") 28 | 29 | # Compute centroids, convex hulls, and radius assuming circular 30 | nj_wsb <- nj_wsb %>% 31 | bind_rows() %>% 32 | mutate( 33 | state = "NJ", 34 | # importantly, area calculations occur in area weighted epsg 35 | st_areashape = st_area(geometry), 36 | convex_hull = st_geometry(st_convex_hull(geometry)), 37 | area_hull = st_area(convex_hull), 38 | radius = sqrt(area_hull/pi) 39 | ) %>% 40 | # transform back to standard epsg 41 | st_transform(epsg) %>% 42 | mutate( 43 | centroid = st_geometry(st_centroid(geometry)), 44 | centroid_long = st_coordinates(centroid)[, 1], 45 | centroid_lat = st_coordinates(centroid)[, 2], 46 | ) %>% 47 | # select columns and rename for staging 48 | select( 49 | # data source columns 50 | pwsid = PWID, 51 | pws_name = SYS_NAME, 52 | state, 53 | # county, # county code is first 2 digits of PWID 54 | # city, 55 | # owner, 56 | # geospatial columns 57 | st_areashape, 58 | centroid_long, 59 | centroid_lat, 60 | radius, 61 | geometry 62 | ) 63 | cat("Computed area, centroids, and radii from convex hulls.\n") 64 | cat("Combined into one layer; added geospatial columns.\n") 65 | 66 | 67 | # delete layer if it exists, then write to geopackage 68 | path_out <- path(staging_path, "wsb_labeled_nj.gpkg") 69 | if(file_exists(path_out)) file_delete(path_out) 70 | 71 | st_write(nj_wsb, path_out) 72 | cat("Wrote clean, labeled data to file.\n\n\n") 73 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_nm.R: -------------------------------------------------------------------------------- 1 | # transform NM water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform NM polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for NM water service boundaries, clean, transform CRS 19 | nm_wsb <- st_read(dsn = path(data_path, "boundary/nm/nm.geojson")) %>% 20 | # clean whitespace 21 | f_clean_whitespace_nas() %>% 22 | # drop rows where WaterSystem_ID is NA 23 | drop_na(Water_System_ID) %>% 24 | # filter for Water_System_ID matching pattern 25 | filter(str_detect(Water_System_ID, "^NM\\d{7}")) %>% 26 | # select first 9 characters of Water_System_ID 27 | mutate(Water_System_ID = substr(Water_System_ID, 1, 9)) %>% 28 | # transform to area weighted CRS 29 | st_transform(epsg_aw) %>% 30 | # correct invalid geometries 31 | st_make_valid() 32 | 33 | cat("Read NM boundary layer; cleaned whitespace; corrected geometries.\n ") 34 | 35 | # Compute centroids, convex hulls, and radius assuming circular 36 | nm_wsb <- nm_wsb %>% 37 | bind_rows() %>% 38 | mutate( 39 | state = "NM", 40 | 
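# geometry_source_detail (assigned below) preserves the provider's stated basis for each polygon; here it comes from NM's Polygon_Basis field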
geometry_source_detail = Polygon_Basis, 41 | # importantly, area calculations occur in area weighted epsg 42 | st_areashape = st_area(geometry), 43 | convex_hull = st_geometry(st_convex_hull(geometry)), 44 | area_hull = st_area(convex_hull), 45 | radius = sqrt(area_hull/pi) 46 | ) %>% 47 | # transform back to standard epsg 48 | st_transform(epsg) %>% 49 | # compute centroids 50 | mutate( 51 | centroid = st_geometry(st_centroid(geometry)), 52 | centroid_long = st_coordinates(centroid)[, 1], 53 | centroid_lat = st_coordinates(centroid)[, 2], 54 | ) %>% 55 | # select columns and rename for staging 56 | select( 57 | # data source columns 58 | pwsid = Water_System_ID, 59 | pws_name = PublicSystemName, 60 | state, 61 | county = CN, 62 | city = City, 63 | # owner, 64 | # geospatial columns 65 | st_areashape, 66 | centroid_long, 67 | centroid_lat, 68 | radius, 69 | geometry, 70 | geometry_source_detail 71 | ) 72 | cat("Computed area, centroids, and radii from convex hulls.\n") 73 | cat("Combined into one layer; added geospatial columns.\n") 74 | 75 | 76 | # delete layer if it exists, then write to geopackage 77 | path_out <- path(staging_path, "wsb_labeled_nm.gpkg") 78 | if(file_exists(path_out)) file_delete(path_out) 79 | 80 | st_write(nm_wsb, path_out) 81 | cat("Wrote clean, labeled data to file.\n\n\n") 82 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_ok.R: -------------------------------------------------------------------------------- 1 | # transform OK water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform OK polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for OK water service boundaries, clean, transform CRS 19 | ok_wsb <- st_read(path(data_path, "boundary/ok/ok.geojson")) %>% 20 | # clean whitespace 21 | f_clean_whitespace_nas() %>% 22 | # transform to area weighted CRS 23 | st_transform(epsg_aw) %>% 24 | # correct invalid geometries 25 | st_make_valid() 26 | 27 | cat("Read OK boundary layer; cleaned whitespace; corrected geometries.\n ") 28 | 29 | # Compute centroids, convex hulls, and radius assuming circular 30 | ok_wsb <- ok_wsb %>% 31 | bind_rows() %>% 32 | mutate( 33 | state = "OK", 34 | geometry_source_detail = source, 35 | # importantly, area calculations occur in area weighted epsg 36 | st_areashape = st_area(geometry), 37 | convex_hull = st_geometry(st_convex_hull(geometry)), 38 | area_hull = st_area(convex_hull), 39 | radius = sqrt(area_hull/pi) 40 | ) %>% 41 | # transform back to standard epsg 42 | st_transform(epsg) %>% 43 | # compute centroids 44 | mutate( 45 | centroid = st_geometry(st_centroid(geometry)), 46 | centroid_long = st_coordinates(centroid)[, 1], 47 | centroid_lat = st_coordinates(centroid)[, 2], 48 | ) %>% 49 | # select columns and rename for staging 50 | select( 51 | # data source columns 52 | pwsid, 53 | pws_name = name, 54 | state, 55 | county, 56 | # city, 57 | # owner, 58 | # geospatial columns 59 | st_areashape, 60 | centroid_long, 61 | centroid_lat, 62 | radius, 63 | geometry, 64 | geometry_source_detail 65 | ) 66 | cat("Computed area, 
centroids, and radii from convex hulls.\n") 67 | cat("Combined into one layer; added geospatial columns.\n") 68 | 69 | 70 | # delete layer if it exists, then write to geopackage 71 | path_out <- path(staging_path, "wsb_labeled_ok.gpkg") 72 | if(file_exists(path_out)) file_delete(path_out) 73 | 74 | st_write(ok_wsb, path_out) 75 | cat("Wrote clean, labeled data to file.\n\n\n") 76 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_pa.R: -------------------------------------------------------------------------------- 1 | # transform PA water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform PA polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for PA water service boundaries, clean, transform CRS 19 | pa_wsb <- st_read(dsn = path(data_path, "boundary/pa/pa.geojson")) %>% 20 | # clean whitespace 21 | f_clean_whitespace_nas() %>% 22 | # transform to area weighted CRS 23 | st_transform(epsg_aw) %>% 24 | # correct invalid geometries 25 | st_make_valid() 26 | 27 | cat("Read PA boundary layer; cleaned whitespace; corrected geometries.\n ") 28 | 29 | # Compute centroids, convex hulls, and radius assuming circular 30 | pa_wsb <- pa_wsb %>% 31 | bind_rows() %>% 32 | mutate( 33 | state = "PA", 34 | PWS_ID = paste0("PA", PWS_ID), 35 | # importantly, area calculations occur in area weighted epsg 36 | st_areashape = st_area(geometry), 37 | convex_hull = st_geometry(st_convex_hull(geometry)), 38 | area_hull = st_area(convex_hull), 39 | radius = sqrt(area_hull/pi) 40 | ) %>% 41 | # transform back to standard epsg 42 | st_transform(epsg) %>% 43 | mutate( 44 | centroid = st_geometry(st_centroid(geometry)), 45 | centroid_long = st_coordinates(centroid)[, 1], 46 | centroid_lat = st_coordinates(centroid)[, 2], 47 | ) %>% 48 | # select columns and rename for staging 49 | select( 50 | # data source columns 51 | pwsid = PWS_ID, 52 | pws_name = NAME, 53 | state, 54 | county = CNTY_NAME, 55 | # city, 56 | # owner, 57 | # geospatial columns 58 | st_areashape, 59 | centroid_long, 60 | centroid_lat, 61 | radius, 62 | geometry 63 | ) 64 | cat("Computed area, centroids, and radii from convex hulls.\n") 65 | cat("Combined into one layer; added geospatial columns.\n") 66 | 67 | 68 | # delete layer if it exists, then write to geopackage 69 | path_out <- path(staging_path, "wsb_labeled_pa.gpkg") 70 | if(file_exists(path_out)) file_delete(path_out) 71 | 72 | st_write(pa_wsb, path_out) 73 | cat("Wrote clean, labeled data to file.\n\n\n") 74 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_ri.R: -------------------------------------------------------------------------------- 1 | # Transform RI water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform RI polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # Helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # Path to save raw data, staging data, and standard projection 13 | 
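# A hedged sketch of the .Renviron entries these transformers expect.
# The paths and CRS codes below are illustrative assumptions, not the
# project's actual values:
#   WSB_DATA_PATH=~/data/wsb/raw
#   WSB_STAGING_PATH=~/data/wsb/staging
#   WSB_EPSG=4326       # e.g. a standard lat/long CRS
#   WSB_EPSG_AW=5070    # e.g. an equal-area ("area weighted") CRS such as CONUS Albers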
data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read manually curated list of pwsids developed by EPIC to link shapefiles for 19 | # water districts with water systems (see here: https://docs.google.com/spreadsheets/d/13aVFXj9Ty5EsRNFuHczpX04HoCzc689LVGKcJIREXY4/edit#gid=0) 20 | pwsid_lookup <- read.csv(here::here("crosswalks/ri_pwsid_lookup.csv")) %>% 21 | select(PWSID, H20_DISTRI, NAME, pws_name) 22 | 23 | # Read layer for RI water service boundaries, clean, transform CRS 24 | ri_wsb <- st_read(path(data_path, "boundary/ri/ri.geojson")) %>% 25 | # clean whitespace 26 | f_clean_whitespace_nas() %>% 27 | # transform to area weighted CRS 28 | st_transform(epsg_aw) %>% 29 | # calculate geometries and areas of individual polygons 30 | mutate( 31 | state = "RI", 32 | # area calculations occur in area weighted epsg 33 | st_areashape = st_area(geometry), 34 | convex_hull = st_geometry(st_convex_hull(geometry)), 35 | area_hull = st_area(convex_hull) 36 | ) 37 | 38 | cat("Read RI boundary layer; cleaned whitespace; corrected geometries.\n") 39 | 40 | ri_wsb <- ri_wsb %>% 41 | # join to pwsids 42 | left_join(pwsid_lookup, by = c("H20_DISTRI", "NAME")) %>% 43 | # clean up names 44 | janitor::clean_names() %>% 45 | # only keep boundaries with a pwsid (the rest appear to be GW/SW source polygons) 46 | filter(!is.na(pwsid)) %>% 47 | # group by pwsid to calculate total area across multipolygons 48 | group_by(pwsid) %>% 49 | # mutate these new columns, knowing full well that duplicate rows 50 | # will be created, but that they will be dropped in the next step 51 | mutate( 52 | # combine all fragmented geometries 53 | geometry = st_union(geometry), 54 | # new area is the sum of the area of all polygons 55 | st_areashape = sum(st_areashape), 56 | area_hull = sum(area_hull), 57 | # new radius is calculated from the new area 58 | radius = sqrt(area_hull/pi), 59 | # combine data into list-formatted strings for character columns 60 | across(where(is.character), ~toString(unique(.))) 61 | ) %>% 62 | # only take the first result from each group 63 | slice(1) %>% 64 | ungroup() %>% 65 | # convert back to the project standard epsg 66 | st_transform(epsg) %>% 67 | # correct invalid geometries 68 | st_make_valid() %>% 69 | # compute new centroids and note that when multipolygons are separated 70 | # by space, these are suspect and should not be used. Importantly, this 71 | # calculation occurs in the EPSG consistent with other staged data! 72 | 73 | 74 | # Note: this step fails when run from run_pipeline in an ipykernel. 75 | # The error is "Found 1 feature with invalid spherical geometry," 76 | # which st_make_valid() does not appear to resolve. 77 | # The workaround is to run this step manually from R. 
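# A possible in-script workaround (untested sketch, assuming the error
# stems from s2 spherical geometry checks on geographic coordinates):
# temporarily fall back to planar GEOS operations for this step, e.g.
#   sf::sf_use_s2(FALSE)   # disable s2 before the centroid step
#   ...                    # compute centroids
#   sf::sf_use_s2(TRUE)    # re-enable s2 afterwards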
78 | mutate( 79 | centroid = st_geometry(st_centroid(geometry)), 80 | centroid_long = st_coordinates(centroid)[, 1], 81 | centroid_lat = st_coordinates(centroid)[, 2] 82 | ) %>% 83 | # select columns and rename for staging 84 | select( 85 | # data source columns 86 | pwsid, 87 | pws_name, 88 | state, 89 | county, 90 | # geospatial columns 91 | st_areashape, 92 | centroid_long, 93 | centroid_lat, 94 | radius, 95 | geometry 96 | ) 97 | 98 | cat("Recalculated area, radius, centroids for multipolygon pwsids.\n") 99 | cat("Combined string values for multipolygon pwsids.\n") 100 | 101 | # verify that there is only one pwsid per geometry 102 | n <- ri_wsb %>% 103 | count(pwsid) %>% 104 | filter(n > 1) %>% 105 | nrow() 106 | cat(n, "duplicate pwsids in labeled data following fix.\n") 107 | 108 | 109 | # delete layer if it exists, then write to geopackage 110 | path_out <- path(staging_path, "wsb_labeled_ri.gpkg") 111 | if(file_exists(path_out)) file_delete(path_out) 112 | 113 | st_write(ri_wsb, path_out) -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_tx.R: -------------------------------------------------------------------------------- 1 | # transform TX water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform TX polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for TX water service boundaries, clean, transform CRS 19 | tx_wsb <- st_read(path(data_path, 20 | "boundary/tx/PWS_shapefile/PWS_Export.shp")) %>% 21 | # clean whitespace 22 | f_clean_whitespace_nas() %>% 23 | # transform to area weighted CRS 24 | st_transform(epsg_aw) %>% 25 | # correct invalid geometries 26 | st_make_valid() 27 | 28 | cat("Read TX boundary layer; cleaned whitespace; corrected geometries.\n ") 29 | 30 | # Compute centroids, convex hulls, and radius assuming circular 31 | tx_wsb <- tx_wsb %>% 32 | bind_rows() %>% 33 | mutate( 34 | state = "TX", 35 | geometry_source_detail = Source, 36 | # importantly, area calculations occur in area weighted epsg 37 | st_areashape = st_area(geometry), 38 | convex_hull = st_geometry(st_convex_hull(geometry)), 39 | area_hull = st_area(convex_hull), 40 | radius = sqrt(area_hull/pi) 41 | ) %>% 42 | # transform back to standard epsg 43 | st_transform(epsg) %>% 44 | # compute centroids 45 | mutate( 46 | centroid = st_geometry(st_centroid(geometry)), 47 | centroid_long = st_coordinates(centroid)[, 1], 48 | centroid_lat = st_coordinates(centroid)[, 2], 49 | ) %>% 50 | # select columns and rename for staging 51 | select( 52 | # data source columns 53 | pwsid = PWSId, 54 | pws_name = pwsName, 55 | state, 56 | # county, 57 | # city, 58 | # owner, 59 | # geospatial columns 60 | st_areashape, 61 | centroid_long, 62 | centroid_lat, 63 | radius, 64 | geometry, 65 | geometry_source_detail 66 | ) 67 | cat("Computed area, centroids, and radii from convex hulls.\n") 68 | cat("Combined into one layer; added geospatial columns.\n") 69 | 70 | 71 | # delete layer if it exists, then write to geopackage 72 | path_out <- path(staging_path, "wsb_labeled_tx.gpkg") 73 | if(file_exists(path_out)) 
file_delete(path_out) 74 | 75 | st_write(tx_wsb, path_out) 76 | cat("Wrote clean, labeled data to file.\n\n\n") 77 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_ut.R: -------------------------------------------------------------------------------- 1 | # transform UT water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform UT polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for UT water service boundaries, clean, transform CRS 19 | ut_wsb <- st_read(dsn = path(data_path, "boundary/ut/ut.geojson"), 20 | quiet = TRUE) %>% 21 | # clean whitespace 22 | f_clean_whitespace_nas() %>% 23 | # drop rows where DWSYSNUM is NA 24 | drop_na(DWSYSNUM) %>% 25 | # filter for DWSYSNUM matching pattern 26 | filter(str_detect(DWSYSNUM, "^UTAH\\d{5}$")) %>% 27 | # replace missing DWNAME with WRENAME 28 | mutate(DWNAME = ifelse(is.na(DWNAME), WRENAME, DWNAME)) %>% 29 | # transform to area weighted CRS 30 | st_transform(epsg_aw) %>% 31 | # correct invalid geometries 32 | st_make_valid() 33 | 34 | cat("Read UT boundary layer; cleaned whitespace; corrected geometries.\n ") 35 | 36 | # Compute centroids, convex hulls, and radius assuming circular 37 | ut_wsb <- ut_wsb %>% 38 | bind_rows() %>% 39 | mutate( 40 | state = "UT", 41 | geometry_source_detail = DATASOURCE, 42 | # importantly, area calculations occur in area weighted epsg 43 | st_areashape = st_area(geometry), 44 | convex_hull = st_geometry(st_convex_hull(geometry)), 45 | area_hull = st_area(convex_hull), 46 | radius = sqrt(area_hull/pi) 47 | ) %>% 48 | # transform back to standard epsg 49 | st_transform(epsg) %>% 50 | mutate( 51 | centroid = st_geometry(st_centroid(geometry)), 52 | centroid_long = st_coordinates(centroid)[, 1], 53 | centroid_lat = st_coordinates(centroid)[, 2], 54 | ) %>% 55 | # select columns and rename for staging 56 | select( 57 | # data source columns 58 | pwsid = DWSYSNUM, 59 | pws_name = DWNAME, 60 | state, 61 | county = COUNTY, 62 | # city, 63 | # owner, 64 | # geospatial columns 65 | st_areashape, 66 | centroid_long, 67 | centroid_lat, 68 | radius, 69 | geometry, 70 | geometry_source_detail 71 | ) 72 | cat("Computed area, centroids, and radii from convex hulls.\n") 73 | cat("Combined into one layer; added geospatial columns.\n") 74 | 75 | 76 | # delete layer if it exists, then write to geopackage 77 | path_out <- path(staging_path, "wsb_labeled_ut.gpkg") 78 | if(file_exists(path_out)) file_delete(path_out) 79 | 80 | st_write(ut_wsb, path_out) 81 | cat("Wrote clean, labeled data to file.\n\n\n") 82 | -------------------------------------------------------------------------------- /src/transformers/states/transform_wsb_wa.R: -------------------------------------------------------------------------------- 1 | # transform WA water system data to standard model ------------------- 2 | 3 | cat("Preparing to transform WA polygon boundary data.\n\n") 4 | 5 | library(fs) 6 | library(sf) 7 | library(tidyverse) 8 | 9 | # helper function 10 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 11 | 12 | # path to save 
raw data, staging data, and standard projection 13 | data_path <- Sys.getenv("WSB_DATA_PATH") 14 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 15 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 16 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 17 | 18 | # Read layer for WA water service boundaries, clean, transform CRS 19 | wa_wsb <- st_read(path(data_path, "boundary/wa/wa.geojson")) %>% 20 | # clean whitespace 21 | f_clean_whitespace_nas() %>% 22 | # keep only five-character system ids (prefixed into full pwsids below) 23 | filter(str_detect(WS_ID, "^.{5}$")) %>% 24 | # transform to area weighted CRS 25 | st_transform(epsg_aw) %>% 26 | # correct invalid geometries 27 | st_make_valid() 28 | 29 | cat("Read WA boundary layer; cleaned whitespace; corrected geometries.\n ") 30 | 31 | # Compute centroids, convex hulls, and radius assuming circular 32 | wa_wsb <- wa_wsb %>% 33 | bind_rows() %>% 34 | mutate( 35 | state = "WA", 36 | WS_ID = paste0("WA53", WS_ID), 37 | # importantly, area calculations occur in area weighted epsg 38 | st_areashape = st_area(geometry), 39 | convex_hull = st_geometry(st_convex_hull(geometry)), 40 | area_hull = st_area(convex_hull), 41 | radius = sqrt(area_hull/pi) 42 | ) %>% 43 | # transform back to standard epsg 44 | st_transform(epsg) %>% 45 | # compute centroids 46 | mutate( 47 | centroid = st_geometry(st_centroid(geometry)), 48 | centroid_long = st_coordinates(centroid)[, 1], 49 | centroid_lat = st_coordinates(centroid)[, 2], 50 | ) %>% 51 | # select columns and rename for staging 52 | select( 53 | # data source columns 54 | pwsid = WS_ID, 55 | pws_name = WS_Name, 56 | state, 57 | county = County, 58 | # city, 59 | # owner, 60 | # geospatial columns 61 | st_areashape, 62 | centroid_long, 63 | centroid_lat, 64 | radius, 65 | geometry 66 | ) 67 | cat("Computed area, centroids, and radii from convex hulls.\n") 68 | cat("Combined into one layer; added geospatial columns.\n") 69 | 70 | 71 | # delete layer if it exists, then write to geopackage 72 | path_out <- path(staging_path, "wsb_labeled_wa.gpkg") 73 | if(file_exists(path_out)) file_delete(path_out) 74 | 75 | st_write(wa_wsb, path_out) 76 | cat("Wrote clean, labeled data to file.\n\n\n") 77 | -------------------------------------------------------------------------------- /src/transformers/transform_contributed_pws.R: -------------------------------------------------------------------------------- 1 | # Transform contributed pws shapefiles ------------------------------------ 2 | cat("Preparing to transform individually contributed pws shapefiles.\n\n") 3 | 4 | library(fs) 5 | library(sf) 6 | library(tidyverse) 7 | 8 | # helper function 9 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 10 | 11 | # path to save raw data, staging data, and standard projection 12 | data_path <- Sys.getenv("WSB_DATA_PATH") 13 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 14 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 15 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 16 | 17 | # Read layer of contributed PWS boundaries, clean, transform CRS 18 | pws_wsb <- st_read(path(data_path, "contributed_pws/contributed_pws.gpkg"), 19 | geometry_column = "geom", 20 | stringsAsFactors = FALSE) %>% 21 | rename(geometry = geom) %>% 22 | filter(!is.na(pwsid)) %>% 23 | # clean whitespace 24 | f_clean_whitespace_nas() %>% 25 | # transform to area weighted CRS 26 | st_transform(epsg_aw) %>% 27 | # correct invalid geometries 28 | st_make_valid() %>% 29 | janitor::clean_names() 30 | 31 | cat("Read individual pws shapefiles; cleaned whitespace; corrected geometries.\n ") 32 | 33 | # 
Compute centroids, convex hulls, and radius assuming circular 34 | # add geospatial columns (duplicate pwsids are not merged in this script) 35 | pws_wsb <- pws_wsb %>% 36 | mutate( 37 | state = substr(pwsid, 1, 2), 38 | geometry_source_detail = data_source, 39 | # importantly, area calculations occur in area weighted epsg 40 | st_areashape = st_area(geometry), 41 | convex_hull = st_geometry(st_convex_hull(geometry)), 42 | area_hull = st_area(convex_hull), 43 | radius = sqrt(area_hull/pi) 44 | ) %>% 45 | # transform back to standard epsg 46 | st_transform(epsg) %>% 47 | st_make_valid() %>% 48 | # compute centroid 49 | mutate( 50 | centroid = st_geometry(st_centroid(geometry)), 51 | centroid_long = st_coordinates(centroid)[, 1], 52 | centroid_lat = st_coordinates(centroid)[, 2] 53 | 54 | ) %>% 55 | # select columns and rename for staging 56 | select( 57 | # data source columns 58 | pwsid, 59 | pws_name, 60 | state, 61 | # county, 62 | # city, 63 | # owner, 64 | # geospatial columns 65 | st_areashape, 66 | centroid_long, 67 | centroid_lat, 68 | radius, 69 | geometry, 70 | geometry_source_detail 71 | ) 72 | cat("Computed area, centroids, and radii from convex hulls.\n") 73 | cat("Combined into one layer; added geospatial columns.\n") 74 | 75 | # delete layer if it exists, then write to geopackage 76 | path_out <- path(staging_path, "contributed_pws.gpkg") 77 | if(file_exists(path_out)) file_delete(path_out) 78 | 79 | st_write(pws_wsb, path_out) 80 | cat("Wrote clean, labeled data to geopackage.\n\n\n") 81 | 82 | -------------------------------------------------------------------------------- /src/transformers/transform_echo.R: -------------------------------------------------------------------------------- 1 | # transform ECHO data ----------------------------------- 2 | 3 | library(fs) 4 | library(sf) 5 | library(tidyverse) 6 | 7 | # source functions 8 | dir_ls(here::here("src/functions")) %>% walk(~source(.x)) 9 | 10 | # helper function 11 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 12 | 13 | # path to save raw data and standard projection 14 | data_path <- Sys.getenv("WSB_DATA_PATH") 15 | echo_data_path <- path(data_path, "echo") 16 | echo_file <- path(echo_data_path, "ECHO_EXPORTER.CSV") 17 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 18 | path_out <- path(staging_path, "echo.csv") 19 | 20 | cols <- c('REGISTRY_ID', 'FAC_NAME', 'FAC_STREET', 21 | 'FAC_CITY', 'FAC_STATE', 'FAC_ZIP', 'FAC_COUNTY', 22 | 'FAC_FIPS_CODE', 'FAC_LAT', 'FAC_LONG', 'FAC_INDIAN_CNTRY_FLG', 23 | 'FAC_FEDERAL_FLG', 'FAC_COLLECTION_METHOD', 24 | 'FAC_REFERENCE_POINT', 'FAC_ACCURACY_METERS', 25 | 'FAC_DERIVED_HUC', 'FAC_MAJOR_FLAG', 'FAC_ACTIVE_FLAG', 26 | 'FAC_QTRS_WITH_NC', 'SDWIS_FLAG', 'SDWA_IDS', 27 | 'SDWA_SYSTEM_TYPES', 'SDWA_INFORMAL_COUNT', 28 | 'SDWA_FORMAL_ACTION_COUNT', 'SDWA_COMPLIANCE_STATUS', 29 | 'SDWA_SNC_FLAG', 'FAC_DERIVED_TRIBES', 30 | 'FAC_DERIVED_WBD', 'FAC_DERIVED_STCTY_FIPS', 31 | 'FAC_DERIVED_ZIP', 'FAC_DERIVED_CD113', 'FAC_DERIVED_CB2010', 32 | 'FAC_PERCENT_MINORITY', 'FAC_POP_DEN', 'EJSCREEN_FLAG_US') 33 | 34 | bool_cols = c('fac_major_flag', 'fac_active_flag', 'sdwis_flag', 35 | 'sdwa_snc_flag', 'fac_indian_cntry_flg', 'fac_federal_flg', 36 | 'ejscreen_flag_us') 37 | 38 | # read in ECHO data and clean 39 | echo <- read_csv(echo_file, col_select=cols) %>% 40 | # make column names lowercase 41 | janitor::clean_names() %>% 42 | # clean whitespace and nulls 43 | f_clean_whitespace_nas() %>% 44 | # drop duplicates 45 | unique() %>% 46 | # drop null 
SDWA_IDS 47 | filter(!is.na(sdwa_ids)) %>% 48 | # split space-delimited pwsid's in sdwa_ids into lists 49 | mutate(sdwa_ids = str_split(sdwa_ids, " ")) %>% 50 | # explode rows with multiple pwsid's 51 | unnest(sdwa_ids) %>% 52 | # rename sdwa_ids to pwsid 53 | rename(pwsid = sdwa_ids) %>% 54 | # for bool_cols, map N to 0, Y to 1, and '' to NaN 55 | mutate_at(bool_cols, recode, `N`=0, `Y`=1, .default=NaN) %>% 56 | # convert bool_cols to boolean type 57 | mutate_at(bool_cols, as.logical) 58 | 59 | # Delete output file if exists 60 | if(file_exists(path_out)) file_delete(path_out) 61 | 62 | # Drop geometry and write as a CSV 63 | echo %>% write_csv(path_out) 64 | -------------------------------------------------------------------------------- /src/transformers/transform_frs.R: -------------------------------------------------------------------------------- 1 | # Transform EPA facility registry service data --------------------------------- 2 | 3 | library(fs) 4 | library(sf) 5 | library(tidyverse) 6 | 7 | # source functions 8 | dir_ls(here::here("src/functions")) %>% walk(~source(.x)) 9 | 10 | # path to save raw data and standard projection 11 | data_path <- Sys.getenv("WSB_DATA_PATH") 12 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 13 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 14 | 15 | # Read un-zipped geodatabase (~6GB so querying on water to reduce file size) 16 | # First look at available layers 17 | frs_layers <- st_layers(dsn = path(data_path, "frs/FRS_INTERESTS.gdb")) 18 | 19 | # SQL query to target facilities with water focus 20 | get_water_frs <- " 21 | SELECT * 22 | FROM FACILITY_INTERESTS 23 | WHERE INTEREST_TYPE IN ( 24 | 'COMMUNITY WATER SYSTEM', 25 | 'NON-TRANSIENT NON-COMMUNITY WATER SYSTEM', 26 | 'TRANSIENT NON-COMMUNITY WATER SYSTEM', 27 | 'WATER TREATMENT PLANT', 28 | 'DRINKING WATER PROGRAM', 29 | 'DRINKING WATER SYSTEM' 30 | )" 31 | 32 | # Read layer for FRS_INTERESTS with conditional query on `INTEREST_TYPE`. 33 | # Then, transform to standard epsg. 34 | frs_water <- path(data_path, "frs/FRS_INTERESTS.gdb") %>% 35 | st_read(query = get_water_frs, 36 | layer = "FACILITY_INTERESTS", 37 | stringsAsFactors = FALSE) %>% 38 | st_transform(epsg) 39 | 40 | cat("Read labeled FRS layer and transformed to CRS:", epsg, "\n ") 41 | 42 | # Visualize points 43 | #plot(st_geometry(frs_water), pch = 1, col = 'blue') 44 | 45 | 46 | # General cleaning -------------------------------------------------------- 47 | 48 | # Set column names to lower case, clean names, clean whitespace, 49 | # split PWSID and Facility ID from pgm_sys_id, add reported state name 50 | frs_water <- frs_water %>% 51 | rename(geometry = Shape) %>% 52 | janitor::clean_names() %>% 53 | f_clean_whitespace_nas() %>% 54 | mutate(pwsid = word(pgm_sys_id, 1), 55 | state = substr(pwsid, 1, 2), 56 | facility_id = word(pgm_sys_id, 2), 57 | facility_id = ifelse(pwsid == facility_id, NA, facility_id)) 58 | 59 | # Write to geopackage ---------------------------------------------------- 60 | path_out <- path(staging_path, "frs.gpkg") 61 | if(file_exists(path_out)) file_delete(path_out) 62 | 63 | st_write(frs_water, path_out) 64 | cat("Wrote FRS data to geopackage. 
\n") 65 | -------------------------------------------------------------------------------- /src/transformers/transform_labeled.R: -------------------------------------------------------------------------------- 1 | # combine transformed state water system data ---------------------------- 2 | 3 | library(fs) 4 | library(sf) 5 | library(tidyverse) 6 | library(mapview) 7 | 8 | # path to save staging data and standard projection 9 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 10 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 11 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 12 | 13 | # list, read, and combine all staged state wsb files 14 | wsb_labeled <- dir_ls(staging_path, 15 | regex = "wsb_labeled_[a-z][a-z].gpkg$") %>% 16 | map_df(~st_read(., quiet = TRUE)) %>% 17 | rename(geometry = geom) %>% 18 | # remove NA pwsid 19 | filter(!is.na(pwsid)) %>% 20 | suppressMessages() 21 | 22 | # combine data and merge geometries for rows with duplicate pwsids -------- 23 | 24 | # show there are rows with duplicate pwsids 25 | multi <- st_drop_geometry(wsb_labeled) %>% 26 | count(pwsid, sort = TRUE) %>% 27 | filter(n > 1) 28 | cat("Detected", nrow(multi), "groups of rows with duplicate pwsids.\n") 29 | 30 | # add column indicating if row has a duplicated pwsid 31 | wsb_labeled <- wsb_labeled %>% 32 | # label duplicated pwsid geometries 33 | mutate(is_multi = ifelse(pwsid %in% multi$pwsid, TRUE, FALSE)) 34 | cat("Added `is_multi` field to wsb labeled data.\n") 35 | 36 | # separate rows without duplicated pwsids 37 | wsb_labeled_no_multi <- wsb_labeled %>% 38 | filter(is_multi == FALSE) 39 | 40 | # for rows with duplicated pwsids: 41 | # union geometries, recalculate area, centroids, radius 42 | wsb_labeled_multi <- wsb_labeled %>% 43 | # filter for rows with duplicated pwsid's 44 | filter(is_multi == TRUE) %>% 45 | st_make_valid() %>% 46 | # importantly, all calculations take place in AW epsg 47 | st_transform(epsg_aw) %>% 48 | group_by(pwsid) %>% 49 | # mutate these new columns, knowing full well that duplicate rows 50 | # will be created, but that they will be dropped in the next step 51 | mutate( 52 | # combine all fragmented geometries 53 | geometry = st_union(geometry), 54 | # new area is the sum of the area of all polygons 55 | st_areashape = st_area(geometry), 56 | convex_hull = st_geometry(st_convex_hull(geometry)), 57 | area_hull = st_area(convex_hull), 58 | # new radius is calculated from the new area 59 | radius = sqrt(area_hull/pi), 60 | # combine data into list-formatted strings for character columns 61 | across(where(is.character), ~toString(unique(.))) 62 | ) %>% 63 | # only take the first result from each group 64 | slice(1) %>% 65 | ungroup() %>% 66 | # convert back to the project standard epsg 67 | st_transform(epsg) %>% 68 | st_make_valid() %>% 69 | # compute new centroids and note that when multipolygons are separated 70 | # by space, these are suspect and should not be used. Importantly, this 71 | # calculation occurs in the EPSG consistent with other staged data! 
72 | mutate( 73 | centroid = st_geometry(st_centroid(geometry)), 74 | centroid_long = st_coordinates(centroid)[, 1], 75 | centroid_lat = st_coordinates(centroid)[, 2] 76 | ) %>% 77 | # remove centroid, convex_hull, and area_hull columns 78 | select(-c(centroid, convex_hull, area_hull)) %>% 79 | # replace empty or string "NA" cells with NA 80 | mutate(across(where(is.character), ~ gsub("^$|^ $|^NA$", NA, .))) %>% 81 | # convert columns with class units to numeric 82 | # before this, cols st_areashape and radius are 83 | # numeric, but have the class "units" 84 | mutate(across(where(is.numeric), as.numeric)) 85 | 86 | cat("Recalculated area, radius, centroids for multipolygon pwsids.\n") 87 | cat("Combined string values for multipolygon pwsids.\n") 88 | 89 | # view 90 | # mapview::mapview(wsb_labeled_multi, zcol = "pwsid", burst = TRUE) 91 | 92 | # combine wsb labeled data with corrected rows 93 | wsb_labeled_clean <- bind_rows(wsb_labeled_no_multi, wsb_labeled_multi) %>% 94 | # remove is_multi column 95 | select(-is_multi) 96 | 97 | # verify that there is only one pwsid per geometry 98 | n <- wsb_labeled_clean %>% 99 | st_drop_geometry() %>% 100 | count(pwsid) %>% 101 | filter(n > 1) %>% 102 | nrow() 103 | cat(n, "duplicate pwsids in labeled data following fix.\n") 104 | 105 | # delete layer if it exists, then write to geopackage 106 | path_out <- path(staging_path, "wsb_labeled_clean.gpkg") 107 | if(file_exists(path_out)) file_delete(path_out) 108 | 109 | st_write(wsb_labeled_clean, path_out) 110 | cat("Wrote clean, labeled data to file.\n") 111 | -------------------------------------------------------------------------------- /src/transformers/transform_mhp.R: -------------------------------------------------------------------------------- 1 | # Transform mobile home park point data ---------------------------------- 2 | 3 | library(fs) 4 | library(sf) 5 | library(tidyverse) 6 | 7 | # helper functions 8 | dir_ls(here::here("src/functions")) %>% walk(~source(.x)) 9 | 10 | # path to save raw data and standard projection 11 | data_path <- Sys.getenv("WSB_DATA_PATH") 12 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 13 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 14 | 15 | # Read un-zipped geodatabase, clean names, transform to standard epsg 16 | mhp_sp <- st_read(dsn = path(data_path, "mhp/mhp.geojson")) %>% 17 | janitor::clean_names() %>% 18 | st_transform(crs = epsg) 19 | 20 | cat("Read MHP layer, cleaned names, & transformed to CRS:", epsg, "\n ") 21 | 22 | # Visualize points 23 | #plot(st_geometry(mhp_sp), pch = 1, col = 'blue') 24 | 25 | 26 | # Clean attribute data ---------------------------------------------------- 27 | 28 | mhp_sp <- mhp_sp %>% 29 | # clean size column and replace -999 missing units with NA 30 | mutate(size = as.factor(tolower(size)), 31 | units = na_if(units, -999)) %>% 32 | # clean column names 33 | rename( 34 | object_id = objectid, 35 | mhp_id = mhpid, 36 | mhp_name = name, 37 | zipcode = zip, 38 | county_fips = countyfips, 39 | source_date = sourcedate, 40 | rev_geo_flag = revgeoflag 41 | ) %>% 42 | f_clean_whitespace_nas() 43 | 44 | # Write clean mobile home park centroids 45 | path_out <- path(staging_path, "mhp_clean.gpkg") 46 | if(file_exists(path_out)) file_delete(path_out) 47 | 48 | st_write(mhp_sp, path_out) 49 | -------------------------------------------------------------------------------- /src/transformers/transform_sdwis_geo_areas.py: -------------------------------------------------------------------------------- 1 | #%% 2 | #!/usr/bin/env python3 
3 | # -*- coding: utf-8 -*- 4 | """ 5 | Created on Fri Feb 4 11:32:27 2022 6 | 7 | @author: jjg 8 | """ 9 | 10 | 11 | # Libraries 12 | import pandas as pd 13 | import numpy as np 14 | import os, sys 15 | 16 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 17 | 18 | from transformers.transform_sdwis_helpers import clean_up_columns, trim_whitespace, date_type 19 | 20 | from dotenv import load_dotenv 21 | 22 | # %% File path and data import 23 | load_dotenv() 24 | 25 | data_path = os.environ["WSB_DATA_PATH"] 26 | staging_path = os.environ["WSB_STAGING_PATH"] 27 | sdwis_data_path = os.path.join(data_path, "sdwis") 28 | 29 | file = "GEOGRAPHIC_AREA.CSV" 30 | 31 | # We only use a few columns from this data. Most other columns 32 | # are better sourced from the primary SDWIS file. 33 | 34 | # These columns are potentially valuable but currently unused: 35 | # area_type_code 36 | # tribal_code 37 | 38 | geo_area = pd.read_csv(os.path.join(sdwis_data_path, file)) 39 | 40 | # %% Basic cleaning 41 | 42 | # Remove table name from column headers 43 | geo_area = clean_up_columns(geo_area) 44 | 45 | # Trim whitespace 46 | geo_area = trim_whitespace(geo_area) 47 | 48 | # Drop duplicates 49 | geo_area = geo_area.drop_duplicates() 50 | 51 | # Narrow to columns of interest 52 | geo_area = geo_area[["pwsid", "city_served", "county_served"]] 53 | 54 | 55 | # %% Clean city_served column 56 | 57 | geo_area["city_served"] = (geo_area["city_served"] 58 | .str.replace(r"\.?-\.?\s*\d{4}", "", regex=True) # Remove ZIP+4-style suffixes: optional ".", "-", optional ".", spaces, four digits 59 | .str.replace("’", "'", regex=False) # Normalize curly apostrophes to straight apostrophes 60 | .str.replace(r"\(\s*[A-Z]\s*\)", "", regex=True) # Remove single-letter parentheticals (plus any spaces), e.g. (V) or (T) 61 | .str.replace(r"\s\s+", " ", regex=True)) # Collapse runs of internal whitespace to a single space 62 | 63 | # Trim whitespace again 64 | geo_area = trim_whitespace(geo_area) 65 | 66 | #%% Deduplicate 67 | 68 | # In a previous SDWIS download, the records with area_type_code = "TR" were 69 | # excluded. Now they're included. 70 | 71 | # But records with area_type_code = "TR" contribute duplicates; 72 | # there is often another record with a different area_type_code. 73 | 74 | # Some notes about these duplicates: 75 | # The ones with area_type_code = "TR" also have the tribal_code attribute populated. 76 | # city_served and county_served are only populated when area_type_code != "TR". 77 | 78 | # How to eliminate these duplicates? 79 | # Since we specifically need the city_served and county_served data 80 | # downstream, we can eliminate records that have NA's in both fields. 81 | # This also eliminates the duplicates. 
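# Illustrative (hypothetical) example of the dedup below: two rows
#   ("XX0000001", city_served=NA, county_served=NA)       <- area_type_code "TR"
#   ("XX0000001", city_served=NA, county_served="Adams")
# collapse to just the second row once all-NA rows are filtered out.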
82 | 83 | geo_area = geo_area[ 84 | geo_area["city_served"].notna() | 85 | geo_area["county_served"].notna()] 86 | 87 | 88 | # %% Raise duplication issue on key fields 89 | 90 | if not geo_area["pwsid"].is_unique: 91 | raise Exception("pwsid is not unique.") 92 | #%% 93 | # Save csv in staging 94 | 95 | geo_area.to_csv(os.path.join(staging_path, "sdwis_geographic_area.csv"), index = False) 96 | -------------------------------------------------------------------------------- /src/transformers/transform_sdwis_helpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Feb 3 15:19:40 2022 5 | 6 | @author: jjg 7 | """ 8 | 9 | # Libraries 10 | from typing import List 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | # Clean up columns 16 | def clean_up_columns(df: pd.DataFrame): 17 | """ 18 | Remove table names from column headers and set to lower case. 19 | 20 | Args: 21 | df : data frame for transformation 22 | 23 | Returns: 24 | df : cleaned data frame 25 | 26 | """ 27 | # strip "TABLE." prefixes (regex removes everything through the last ".") 28 | df.columns = (df.columns.str.replace('.*\\.', '', regex = True)) 29 | 30 | # set all names to lowercase 31 | df.columns = df.columns.str.lower() 32 | 33 | # drop fully-empty columns 34 | df = df.dropna(axis = 1, how = "all") 35 | 36 | return df 37 | 38 | 39 | # Standardize date columns 40 | def date_type(df: pd.DataFrame, date_columns: List[str]): 41 | """ 42 | Clean up date columns using pandas datetime 43 | 44 | Args: 45 | df : data frame for transformation 46 | date_columns : list of columns to parse as dates 47 | 48 | Returns: 49 | None; date columns are converted in place 50 | """ 51 | # set date columns to date 52 | for x in date_columns: 53 | df[x] = ( 54 | pd.to_datetime(df[x], format="%d-%b-%y") 55 | .dt.normalize()) 56 | 57 | 58 | # Trim leading/trailing whitespace in string columns 59 | def trim_whitespace(df: pd.DataFrame): 60 | """Strip whitespace from all object (string) columns; returns a copy.""" 61 | df = df.copy() 62 | 63 | for col in df.select_dtypes(include=[object]): 64 | df[col] = df[col].str.strip() 65 | 66 | return df -------------------------------------------------------------------------------- /src/transformers/transform_sdwis_service.py: -------------------------------------------------------------------------------- 1 | #%% 2 | #!/usr/bin/env python3 3 | # -*- coding: utf-8 -*- 4 | """ 5 | Created on Fri Feb 11 16:54:19 2022 6 | 7 | @author: jjg 8 | """ 9 | 10 | 11 | # Libraries 12 | import pandas as pd 13 | import numpy as np 14 | import os, sys 15 | 16 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 17 | 18 | from transformers.transform_sdwis_helpers import clean_up_columns, trim_whitespace, date_type 19 | 20 | from dotenv import load_dotenv 21 | 22 | # %% File path and data import 23 | load_dotenv() 24 | 25 | data_path = os.environ["WSB_DATA_PATH"] 26 | staging_path = os.environ["WSB_STAGING_PATH"] 27 | sdwis_data_path = os.path.join(data_path, "sdwis") 28 | 29 | file = "SERVICE_AREA.CSV" 30 | service_area = pd.read_csv(os.path.join(sdwis_data_path, file)) 31 | 32 | # %% Basic cleaning 33 | 34 | # Remove table name from column headers 35 | service_area = clean_up_columns(service_area) 36 | 37 | # Trim whitespace 38 | service_area = trim_whitespace(service_area) 39 | 40 | # Drop duplicates 41 | service_area = service_area.drop_duplicates() 42 | 43 | # Drop fully empty columns (cities_served, counties_served -- get from other tables) 44 | service_area = service_area.dropna(how='all', axis=1) 45 | 46 | 47 | # %% Sanitize booleans 48 | bool_cols = ["is_primary_service_area_code"] 49 | 
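# Note: mapping to {0, 1, NaN} and then astype('boolean') yields pandas'
# nullable BooleanDtype, which preserves missing values as <NA>;
# a plain astype(bool) would coerce NaN to True.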
50 | for i in bool_cols: 51 | service_area[i] = service_area[i].map({'N': 0, 'Y': 1, '': np.NaN, np.NaN : np.NaN}) 52 | service_area[i] = service_area[i].astype('boolean') 53 | 54 | 55 | # %% Raise duplication issue on key fields 56 | 57 | if service_area[["pwsid", "service_area_type_code"]].duplicated().any(): 58 | raise Exception("pwsid + service_area_type_code is not unique.") 59 | 60 | # %% Save csv in staging 61 | 62 | service_area.to_csv(os.path.join(staging_path, "sdwis_service_area.csv"), index = False) 63 | -------------------------------------------------------------------------------- /src/transformers/transform_sdwis_ws.py: -------------------------------------------------------------------------------- 1 | #%% 2 | #!/usr/bin/env python3 3 | # -*- coding: utf-8 -*- 4 | """ 5 | Created on Thu Feb 3 15:11:24 2022 6 | 7 | @author: jjg 8 | """ 9 | 10 | # Libraries 11 | import pandas as pd 12 | import numpy as np 13 | import os, sys 14 | 15 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 16 | 17 | from transformers.transform_sdwis_helpers import clean_up_columns, trim_whitespace, date_type 18 | 19 | from dotenv import load_dotenv 20 | 21 | # %% File path and data import 22 | load_dotenv() 23 | 24 | data_path = os.environ["WSB_DATA_PATH"] 25 | staging_path = os.environ["WSB_STAGING_PATH"] 26 | sdwis_data_path = os.path.join(data_path, "sdwis") 27 | 28 | file = "WATER_SYSTEM.CSV" 29 | water_system = pd.read_csv(os.path.join(sdwis_data_path, file)) 30 | 31 | # %% Basic cleaning 32 | 33 | # Remove table name from column headers 34 | water_system = clean_up_columns(water_system) 35 | 36 | # Trim whitespace 37 | water_system = trim_whitespace(water_system) 38 | 39 | # Drop duplicates 40 | water_system = water_system.drop_duplicates() 41 | 42 | # Drop fully empty columns (cities_served, counties_served -- get from other tables) 43 | water_system = water_system.dropna(how='all', axis=1) 44 | 45 | 46 | # %% Sanitize booleans 47 | bool_cols = ["npm_candidate", "is_wholesaler_ind", \ 48 | "is_school_or_daycare_ind", "source_water_protection_code"] 49 | 50 | for i in bool_cols: 51 | water_system[i] = water_system[i].map({'N': 0, 'Y': 1, '': np.NaN, np.NaN : np.NaN}) 52 | water_system[i] = water_system[i].astype('boolean') 53 | 54 | # %% Standardize dates 55 | 56 | date_cols = ['outstanding_perform_begin_date','pws_deactivation_date', \ 57 | 'source_protection_begin_date'] 58 | 59 | date_type(water_system, date_cols) 60 | 61 | # %% Simplify zip-code column to 5 digit 62 | 63 | water_system["zip_code"] = water_system["zip_code"].str[0:5] 64 | 65 | 66 | # %% Raise duplication issue on key fields 67 | 68 | if not water_system["pwsid"].is_unique: 69 | raise Exception("pwsid is not unique.") 70 | 71 | # %% Save csv in staging 72 | 73 | water_system.to_csv(os.path.join(staging_path, "sdwis_water_system.csv"), index = False) 74 | -------------------------------------------------------------------------------- /src/transformers/transform_tigris_ne.R: -------------------------------------------------------------------------------- 1 | # transform TIGER places and crop out ocean areas --------------------- 2 | 3 | library(fs) 4 | library(sf) 5 | library(tidyverse) 6 | library(tigris) 7 | library(rmapshaper) 8 | 9 | 10 | # path to save raw data, staging data, and standard projection 11 | data_path <- Sys.getenv("WSB_DATA_PATH") 12 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 13 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 14 | 15 | # download large files without timeout error 16 | options(timeout = 100000) 
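# (base R's default download timeout is 60 seconds, which large Census
# and Natural Earth downloads routinely exceed; hence the generous override)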
17 | 18 | # read Natural Earth ocean geometry 19 | ocean <- st_read(path(data_path, "ne/ocean/ne-ocean-10m/ne_10m_ocean.shp")) %>% 20 | select(geometry) %>% 21 | st_make_valid() 22 | 23 | # transform places to ocean crs, make valid 24 | places <- read_rds(path(data_path, "tigris/tigris_places.rds")) %>% 25 | st_transform(st_crs(ocean)$epsg) %>% 26 | st_make_valid() 27 | 28 | # erase ocean area from places (crop places to the coastline) and write 29 | places_clean <- places %>% 30 | st_difference(ocean) %>% 31 | st_make_valid() %>% 32 | janitor::clean_names() 33 | 34 | # sanity check that oceans are removed 35 | # mapview::mapview(places_clean) 36 | 37 | # download tigris population data 38 | pop <- read_csv(path(data_path, "tigris/tigris_pop.csv")) %>% 39 | select(geoid, population) 40 | 41 | # join population data to places_clean 42 | places_clean <- places_clean %>% 43 | left_join(pop, by = "geoid") 44 | 45 | # write clean TIGER places 46 | path_out <- path(staging_path, "tiger_places_clean.gpkg") 47 | if(file_exists(path_out)) file_delete(path_out) 48 | 49 | st_write(places_clean, path_out) 50 | cat("Wrote clean TIGER places.\n") 51 | -------------------------------------------------------------------------------- /src/transformers/transform_ucmr.R: -------------------------------------------------------------------------------- 1 | # transform UCMR3 and UCMR4 zip codes and add centroids --------------------- 2 | 3 | library(fs) 4 | library(sf) 5 | library(tidyverse) 6 | library(tigris) 7 | 8 | # tell tigris to cache Census shapefile downloads for faster subsequent runs 9 | options(tigris_use_cache = TRUE) 10 | 11 | # helper function 12 | source(here::here("src/functions/f_clean_whitespace_nas.R")) 13 | 14 | # path to save raw data, staging data, and standard projection 15 | data_path <- Sys.getenv("WSB_DATA_PATH") 16 | staging_path <- Sys.getenv("WSB_STAGING_PATH") 17 | epsg <- as.numeric(Sys.getenv("WSB_EPSG")) 18 | epsg_aw <- Sys.getenv("WSB_EPSG_AW") 19 | 20 | # read ucmr3 and ucmr4 zip code data, combine, clean names, clean zipcodes 21 | ucmr <- dir_ls(path(data_path, "ucmr"), regexp = "ZipCodes.txt") %>% 22 | read_tsv(col_types = "c") %>% 23 | distinct() %>% 24 | janitor::clean_names() %>% 25 | # a number of zipcodes end in "-" and should be cleaned 26 | mutate(zipcode = str_replace_all(zipcode, "-", "")) %>% 27 | # clean whitespace and NAs, and drop NA zipcodes 28 | f_clean_whitespace_nas() %>% 29 | drop_na(zipcode) 30 | 31 | # print nonsense zipcodes for review because they're few in number. 
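# (hypothetical offenders might look like "9435", "123456", or "O2134")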
32 | # zip codes should have exactly 5 digits and no alphabetical chars 33 | zip_rm <- filter(ucmr, 34 | nchar(zipcode) != 5 | 35 | str_detect(zipcode, "[:alpha:]")) 36 | 37 | cat("Detected", nrow(zip_rm), "nonsense zipcodes:\n"); print(zip_rm) 38 | 39 | # remove nonsense zipcodes 40 | ucmr <- anti_join(ucmr, zip_rm) 41 | 42 | cat("Removed", nrow(zip_rm), "nonsense zipcodes from ucmr data.\n") 43 | 44 | 45 | # merge zip codes to spatial zip code polygon ----------------------------- 46 | 47 | # zip code columns to keep 48 | cols_keep <- c("zipcode", "geoid20", "aland20", "awater20", "st_areashape", 49 | "area_hull") 50 | 51 | # pull USA zip code tabulation area (ZCTA) geometries, project to area weighted CRS 52 | zipcode_areas <- tigris::zctas() 53 | zipcodes <- zipcode_areas %>% 54 | janitor::clean_names() %>% 55 | # use area weighted crs because we calculate polygon areas 56 | st_transform(st_crs(epsg_aw)) %>% 57 | mutate( 58 | # area calculations occur in area weighted epsg 59 | zipcode = zcta5ce20, 60 | st_areashape = st_area(geometry), 61 | convex_hull = st_geometry(st_convex_hull(geometry)), 62 | area_hull = st_area(convex_hull) 63 | ) %>% 64 | select(all_of(cols_keep)) 65 | 66 | 67 | # join zipcode polygon geometries to ucmr master list and 68 | # combine data and merge geometries for rows with duplicate pwsids -------- 69 | 70 | ucmr <- ucmr %>% 71 | left_join(zipcodes, by = "zipcode") %>% 72 | # convert object back to spatial 73 | st_as_sf(crs = epsg_aw) %>% 74 | # ensure valid geometries 75 | st_make_valid() %>% 76 | group_by(pwsid) %>% 77 | # mutate these new columns, knowing full well that duplicate rows 78 | # will be created, but that they will be dropped in the next step 79 | mutate( 80 | # combine all fragmented geometries 81 | geometry = st_union(geometry), 82 | # new area is the sum of the area of all polygons 83 | st_areashape = sum(st_areashape), 84 | area_hull = sum(area_hull), 85 | # new radius is calculated from the new area 86 | radius = sqrt(area_hull/pi), 87 | # combine data into list-formatted strings for character columns 88 | across(where(is.character), ~toString(unique(.))) 89 | ) %>% 90 | # only take the first result from each group 91 | slice(1) %>% 92 | ungroup() %>% 93 | # convert back to the project standard epsg 94 | st_transform(epsg) %>% 95 | # compute new centroids and note that when multipolygons are separated 96 | # by space, these are suspect and should not be used. Importantly, this 97 | # calculation occurs in the EPSG consistent with other staged data! 98 | mutate( 99 | centroid = st_geometry(st_centroid(geometry)), 100 | centroid_long = st_coordinates(centroid)[, 1], 101 | centroid_lat = st_coordinates(centroid)[, 2] 102 | ) %>% 103 | # remove columns. 
Note: future iteration may include other values downstream 104 | select(c(pwsid, zipcode, st_areashape, radius, centroid_long, centroid_lat)) %>% 105 | st_drop_geometry() 106 | 107 | cat("Recalculated area, radius, centroids for multipolygon pwsids.\n") 108 | cat("Combined string values for multipolygon pwsids.\n") 109 | 110 | # verify that there is only one pwsid per geometry 111 | n <- ucmr %>% 112 | count(pwsid) %>% 113 | filter(n > 1) %>% 114 | nrow() 115 | cat(n, "duplicate pwsids in labeled data following fix.\n") 116 | 117 | 118 | # Write clean ucmr data to CSV 119 | path_out <- path(staging_path, "ucmr.csv") 120 | if(file_exists(path_out)) file_delete(path_out) 121 | 122 | write_csv(ucmr, path_out) 123 | -------------------------------------------------------------------------------- /wsb.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | --------------------------------------------------------------------------------