├── data ├── processed │ ├── .gitignore │ ├── README.md │ ├── Makefile │ ├── pre-process-mv.py │ ├── pre-process-fit.py │ ├── pre-process-osm.py │ └── pre-process-repd.py ├── raw │ ├── README │ ├── machine_vision.geojson │ ├── Makefile │ ├── convert_fit_excel_to_csv.py │ └── compile_osm_solar.py ├── as_received │ └── README ├── exported │ ├── Makefile │ ├── export_geometries.py │ └── analyse_exported.py └── README.md ├── requirements.txt ├── doc ├── repd-definitions.pdf ├── preprocessing.md ├── matching.md └── database.md ├── .gitignore ├── db ├── neighbour-finding.sql ├── data-matching-rules │ ├── rule-7.sql │ ├── rule-6.sql │ ├── rule-1.sql │ ├── rule-3.sql │ ├── rule-5.sql │ ├── rule-3a.sql │ ├── rule-5a.sql │ ├── rule-4.sql │ ├── rule-4a.sql │ ├── rule-25.sql │ ├── rule-25a.sql │ ├── rule-1a.sql │ ├── rule-2.sql │ └── rule-2a.sql ├── README.md ├── map-osm-repd.sql ├── neighbour-finding │ ├── mv-repd.sql │ ├── mv-osm.sql │ └── osm-repd.sql ├── area-adaptive-threshold.sql ├── mv.sql ├── make-database.sql ├── data-matching.sql ├── fit.sql ├── osm.sql ├── dedup-repd.sql ├── repd.sql ├── dedup-osm.sql └── export.sql └── README.md /data/processed/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bng_latlon 2 | matplotlib 3 | numpy 4 | pandas 5 | scipy 6 | geopandas 7 | openpyxl 8 | sklearn 9 | -------------------------------------------------------------------------------- /doc/repd-definitions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/solar-power-mapping-data/HEAD/doc/repd-definitions.pdf -------------------------------------------------------------------------------- /data/raw/README: -------------------------------------------------------------------------------- 1 | This file is a placeholder, to ensure the folder appears in git. See readme in parent folder for info. 2 | 3 | -------------------------------------------------------------------------------- /data/as_received/README: -------------------------------------------------------------------------------- 1 | This file is a placeholder, to ensure the folder appears in git. See readme in parent folder for info. 2 | -------------------------------------------------------------------------------- /data/processed/README.md: -------------------------------------------------------------------------------- 1 | Processed datasets, create programmatically. 2 | 3 | (This file is a placeholder to make sure that git creates its parent directory.) 4 | 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | data/raw/fit.csv 3 | data/raw/osm.csv 4 | data/raw/repd.csv 5 | data/as_received/*.pbf 6 | data/as_received/*.xlsx 7 | data/as_received/*.geojson 8 | data/as_received/*.xml 9 | -------------------------------------------------------------------------------- /db/neighbour-finding.sql: -------------------------------------------------------------------------------- 1 | \echo -n Finding neighbouring objects across datasets ... 
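-- Each included script below builds a nearest-neighbour lookup table via a LATERAL join:
--   neighbour-finding/osm-repd.sql -> osm_repd_neighbours (plus osm_with_existing_repd_neighbours)
--   neighbour-finding/mv-osm.sql   -> osm_mv_neighbours
--   neighbour-finding/mv-repd.sql  -> mv_repd_neighbours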
2 | 3 | \include neighbour-finding/osm-repd.sql 4 | \include neighbour-finding/mv-osm.sql 5 | \include neighbour-finding/mv-repd.sql 6 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-7.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 7 (see doc/matching.md) 2 | \echo -n Performing match rule 7... 3 | 4 | insert into matches 5 | select '7', NULL, osm.master_osm_id, osm_mv_neighbours.mv_id 6 | from osm_mv_neighbours, osm 7 | where osm.osm_id = osm_mv_neighbours.osm_id 8 | and osm_mv_neighbours.distance_meters < 1000; 9 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-6.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 6 (see doc/matching.md) 2 | \echo -n Performing match rule 6... 3 | 4 | insert into matches 5 | select '6', repd.master_repd_id, NULL, mv_repd_neighbours.mv_id 6 | from mv_repd_neighbours, repd 7 | where repd.repd_id = mv_repd_neighbours.repd_id 8 | and mv_repd_neighbours.distance_meters < 1000; 9 | -------------------------------------------------------------------------------- /data/exported/Makefile: -------------------------------------------------------------------------------- 1 | # export data. You can export AND analyse by invoking "make all". 2 | 3 | ukpvgeo_geometries.geojson: ukpvgeo_points.csv ../raw/osm-gb-solaronly.geojson 4 | python3 export_geometries.py 5 | 6 | all: ukpvgeo_geometries.geojson plot_analyse_exported.pdf 7 | 8 | plot_analyse_exported.pdf: ukpvgeo_points.csv 9 | python3 analyse_exported.py 10 | 11 | clean: 12 | rm ukpvgeo_geometries.geojson plot_analyse_exported.pdf 13 | 14 | -------------------------------------------------------------------------------- /db/README.md: -------------------------------------------------------------------------------- 1 | # Notes on setting up the database 2 | 3 | For more details, see `doc/database.md`. 
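Once the database has been populated (see the steps below), a quick sanity check is to count the matches found by each rule and compare against the table in `doc/matching.md`, for example:

```sql
-- Number of cross-dataset matches found by each rule
select match_rule, count(*)
from matches
group by match_rule
order by match_rule;
```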
4 | 5 | On macOS, with Postgres installed via Homebrew, to run postgres as a background 6 | service and restart at login: 7 | 8 | ```bash 9 | brew services start postgresql 10 | ``` 11 | 12 | To start postgres now: 13 | 14 | ```bash 15 | pg_ctl -D /usr/local/var/postgres start 16 | ``` 17 | 18 | To create the database for first use: 19 | 20 | ```bash 21 | createdb hut23-425 "Solar PV database matching" 22 | ``` 23 | 24 | To populate the database: 25 | 26 | ```bash 27 | psql -f make-database.sql hut23-425 28 | ``` 29 | 30 | 31 | -------------------------------------------------------------------------------- /data/processed/Makefile: -------------------------------------------------------------------------------- 1 | # Create "processed" datasets from "raw" datasets 2 | 3 | all: repd.csv fit.csv osm.csv machine_vision.csv pre-process-repd.py pre-process-fit.py pre-process-osm.py pre-process-mv.py 4 | 5 | repd.csv: ../raw/repd.csv pre-process-repd.py 6 | ./pre-process-repd.py < $< > $@ 7 | 8 | fit.csv: ../raw/fit.csv pre-process-fit.py 9 | ./pre-process-fit.py < $< > $@ 10 | 11 | osm.csv: ../raw/osm.csv pre-process-osm.py 12 | ./pre-process-osm.py < $< > $@ 13 | 14 | machine_vision.csv: ../raw/machine_vision.geojson pre-process-mv.py 15 | ./pre-process-mv.py < $< > $@ 16 | 17 | clean: 18 | rm repd.csv fit.csv osm.csv machine_vision.csv 19 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-1.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 1 (see doc/matching.md) 2 | \echo -n Performing match rule 1... 3 | 4 | drop table if exists repd_copy; 5 | select * into repd_copy from repd_operational; 6 | insert into matches 7 | select '1', repd_copy.master_repd_id, osm.master_osm_id 8 | from osm_with_existing_repd_neighbours, osm, repd_operational, repd_copy 9 | -- Table linking: 10 | where osm_with_existing_repd_neighbours.osm_id = osm.osm_id 11 | and osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_operational.repd_id 12 | and osm_with_existing_repd_neighbours.repd_id_in_osm = repd_copy.repd_id 13 | -- Matching relevant: 14 | and repd_operational.location::geography <-> repd_copy.location::geography < 500; 15 | -------------------------------------------------------------------------------- /db/map-osm-repd.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Preliminary matching: OSM to REPD 3 | ** 4 | ** In raw.osm, the field repd_id_str contains a semicolon-separated list of REPD 5 | ** ids, possibly empty, whereas we would like this mapping in first normal form.
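** For example, a raw.osm row whose repd_id_str is '1234;5678' (illustrative ids) becomes
** two rows in osm_repd_id_mapping, (osm_id, 1234) and (osm_id, 5678); rows with an
** empty repd_id_str contribute no rows.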
6 | */ 7 | 8 | \echo Breaking out REPD id tags from the OSM dataset 9 | 10 | -- osm_repd_id_mapping(osm_id, repd_id) 11 | 12 | drop table if exists osm_repd_id_mapping; 13 | 14 | select raw.osm.osm_id, x.repd_id into osm_repd_id_mapping 15 | from raw.osm cross join unnest(string_to_array(repd_id_str, ';')) with ordinality as x(repd_id) 16 | order by raw.osm.osm_id, x.repd_id; 17 | 18 | alter table osm_repd_id_mapping 19 | alter column repd_id 20 | set data type int 21 | using cast (repd_id as integer); 22 | 23 | 24 | -------------------------------------------------------------------------------- /db/neighbour-finding/mv-repd.sql: -------------------------------------------------------------------------------- 1 | drop table if exists mv_repd_neighbours; 2 | select closest_pt.repd_id, 3 | closest_pt.co_location_repd_id, 4 | closest_pt.master_repd_id, 5 | machine_vision.install_date as mv_date, 6 | machine_vision.area as mv_area, 7 | closest_pt.distance_meters, 8 | machine_vision.mv_id as mv_id 9 | into mv_repd_neighbours 10 | from machine_vision 11 | CROSS JOIN LATERAL 12 | (SELECT 13 | repd.repd_id, 14 | repd.co_location_repd_id, 15 | repd.master_repd_id, 16 | machine_vision.location::geography <-> repd.location::geography as distance_meters 17 | FROM repd 18 | ORDER BY machine_vision.location::geography <-> repd.location::geography 19 | LIMIT 1) AS closest_pt; 20 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Directory structure 2 | 3 | `as_received` 4 | 5 | : Datasets precisely in the form in which we originally got them, unchanged in 6 | content or format, whether text or binary, open or proprietary. 7 | 8 | `raw` 9 | 10 | : Data that has been manually edited, at least to change the format to one that 11 | we can use programmatically (if required) but also there may a small number of 12 | edits to the data that do not seem to be generalisable, or we can't work out 13 | how to automate, or are not likely to re-occur next time we download the data. 14 | 15 | `processed` 16 | 17 | : Datasets that have been programmatically modified (typically from `raw`). Note 18 | that the `processed` directory is _local_, it is like a “compiled code” 19 | directory: not under version control but neither on the shared drive. 20 | 21 | -------------------------------------------------------------------------------- /db/area-adaptive-threshold.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Area-adaptive threshold function for clustering PV geodata items 3 | ** July 2020 4 | ** Author: Dan Stowell 5 | ** 6 | ** The function returns a distance threshold which is related to the 7 | ** geometry surface areas (sizes) of the two points being compared. 8 | ** 9 | ** If area is unknown, we use a heuristic estimate based on capacity similar to "2 hectares per MW"; 10 | ** The exact ratio is very close to the correlation seen in the paper. 
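** Worked example (illustrative figures): for two items with unknown areas and
** capacities of 5 MW and 1 MW, the larger implied area is 5 * 20000 = 100000 m^2,
** so the returned threshold is 2 * sqrt(100000), roughly 632 m, clamped to the
** range [10, 1500] metres.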
11 | ** 12 | */ 13 | 14 | CREATE FUNCTION area_adaptive_threshold(area1 float, area2 float, capacity1 float, capacity2 float) 15 | RETURNS float AS $$ 16 | BEGIN 17 | 18 | RETURN LEAST(1500, GREATEST(10, 19 | 2 * SQRT(GREATEST( 20 | GREATEST(area1, capacity1 * 20000), 21 | GREATEST(area2, capacity2 * 20000) 22 | )))); 23 | END; $$ 24 | LANGUAGE plpgsql IMMUTABLE LEAKPROOF PARALLEL SAFE; 25 | 26 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-3.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 3 (see doc/matching.md) 2 | \echo -n Performing match rule 3... 3 | 4 | insert into matches 5 | select '3', repd_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_operational.repd_id = repd_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm_repd_neighbours.distance_meters < 700 21 | and osm.objtype != 'node'; 22 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-5.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 5 (see doc/matching.md) 2 | \echo -n Performing match rule 5... 3 | 4 | insert into matches 5 | select '5', repd_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_operational.repd_id = repd_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm.objtype = 'node' 21 | and repd_operational.site_name like '%Scheme%'; 22 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-3a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 3a (see doc/matching.md) 2 | \echo -n Performing match rule 3a... 
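-- As rule 3, but applied to non-operational REPD entries (see data-matching.sql):
-- nearest-neighbour matches to OSM ways/relations within 700 m.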
3 | 4 | insert into matches 5 | select '3a', repd_non_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_non_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_non_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_non_operational.repd_id = repd_non_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_non_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm_repd_neighbours.distance_meters < 700 21 | and osm.objtype != 'node'; 22 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-5a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 5a (see doc/matching.md) 2 | \echo -n Performing match rule 5a... 3 | 4 | insert into matches 5 | select '5a', repd_non_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_non_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_non_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_non_operational.repd_id = repd_non_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_non_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm.objtype = 'node' 21 | and repd_non_operational.site_name like '%Scheme%'; 22 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-4.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 4 (see doc/matching.md) 2 | \echo -n Performing match rule 4... 3 | 4 | insert into matches 5 | select '4', repd_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_operational.repd_id = repd_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm.objtype = 'node' 21 | and repd_operational.site_name like '%Scheme%' 22 | and osm_repd_neighbours.distance_meters < 5000; 23 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-4a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 4a (see doc/matching.md) 2 | \echo -n Performing match rule 4a... 
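-- As rule 4, but applied to non-operational REPD entries: nearest 'Scheme'-named
-- REPD neighbour for OSM nodes, within 5000 m.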
3 | 4 | insert into matches 5 | select '4a', repd_non_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_non_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_non_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_non_operational.repd_id = repd_non_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_non_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm.objtype = 'node' 21 | and repd_non_operational.site_name like '%Scheme%' 22 | and osm_repd_neighbours.distance_meters < 5000; 23 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-25.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 2.5 (see doc/matching.md) 2 | \echo -n Performing match rule 2.5... 3 | 4 | insert into matches 5 | select '25', closest_pt.master_repd_id, osm.master_osm_id 6 | from osm_repd_id_mapping LEFT JOIN osm USING (osm_id) LEFT JOIN repd_operational USING (repd_id) 7 | -- selecting matching repd IDs, and finding their distance: 8 | CROSS JOIN LATERAL 9 | (SELECT 10 | repd.master_repd_id, 11 | osm.location::geography <-> repd.location::geography as distance_meters 12 | FROM repd 13 | WHERE repd.repd_id=osm_repd_id_mapping.repd_id 14 | -- Not matches already found: 15 | AND not exists ( 16 | select 17 | from matches 18 | where matches.master_repd_id = repd_operational.master_repd_id 19 | or matches.master_osm_id = osm.master_osm_id 20 | ) 21 | -- Only the single nearest-neighbour: 22 | ORDER BY osm.location::geography <-> repd.location::geography 23 | LIMIT 1) AS closest_pt; 24 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-25a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 2.5a (see doc/matching.md) 2 | \echo -n Performing match rule 2.5a... 3 | 4 | insert into matches 5 | select '25a', closest_pt.master_repd_id, osm.master_osm_id 6 | from osm_repd_id_mapping LEFT JOIN osm USING (osm_id) LEFT JOIN repd_non_operational USING (repd_id) 7 | -- selecting matching repd IDs, and finding their distance: 8 | CROSS JOIN LATERAL 9 | (SELECT 10 | repd.master_repd_id, 11 | osm.location::geography <-> repd.location::geography as distance_meters 12 | FROM repd 13 | WHERE repd.repd_id=osm_repd_id_mapping.repd_id 14 | -- Not matches already found: 15 | AND not exists ( 16 | select 17 | from matches 18 | where matches.master_repd_id = repd_non_operational.master_repd_id 19 | or matches.master_osm_id = osm.master_osm_id 20 | ) 21 | -- Only the single nearest-neighbour: 22 | ORDER BY osm.location::geography <-> repd.location::geography 23 | LIMIT 1) AS closest_pt; 24 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-1a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 1a (see doc/matching.md) 2 | \echo -n Performing match rule 1a... 
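-- As rule 1, but applied to non-operational REPD entries: accept the REPD id already
-- tagged in OSM where that entry lies within 500 m of the OSM object's nearest REPD
-- neighbour.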
3 | 4 | drop table if exists repd_copy; 5 | select * into repd_copy from repd_non_operational; 6 | insert into matches 7 | select '1a', repd_copy.master_repd_id, osm.master_osm_id 8 | from osm_with_existing_repd_neighbours, osm, repd_non_operational, repd_copy 9 | -- Table linking: 10 | where osm_with_existing_repd_neighbours.osm_id = osm.osm_id 11 | and osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_non_operational.repd_id 12 | and osm_with_existing_repd_neighbours.repd_id_in_osm = repd_copy.repd_id 13 | -- Not matches already found: 14 | and not exists ( 15 | select 16 | from matches 17 | where matches.master_repd_id = repd_non_operational.master_repd_id 18 | or matches.master_osm_id = osm.master_osm_id 19 | ) 20 | -- Matching relevant: 21 | and repd_non_operational.location::geography <-> repd_copy.location::geography < 500; 22 | -------------------------------------------------------------------------------- /db/neighbour-finding/mv-osm.sql: -------------------------------------------------------------------------------- 1 | drop table if exists osm_ways_relations; 2 | select * into osm_ways_relations from osm where osm.objtype != 'node'; 3 | 4 | drop table if exists osm_mv_neighbours; 5 | select closest_pt.osm_id, 6 | closest_pt.tag_start_date as osm_date, 7 | machine_vision.install_date as mv_date, 8 | closest_pt.area as osm_area, 9 | machine_vision.area as mv_area, 10 | closest_pt.distance_meters, 11 | machine_vision.mv_id as mv_id 12 | into osm_mv_neighbours 13 | from machine_vision 14 | CROSS JOIN LATERAL 15 | (SELECT 16 | osm_ways_relations.osm_id, 17 | osm_ways_relations.tag_start_date, 18 | osm_ways_relations.area, 19 | machine_vision.location::geography <-> osm_ways_relations.location::geography as distance_meters 20 | FROM osm_ways_relations 21 | ORDER BY machine_vision.location::geography <-> osm_ways_relations.location::geography 22 | LIMIT 1) AS closest_pt; 23 | 24 | drop table osm_ways_relations; 25 | -------------------------------------------------------------------------------- /db/mv.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Create table containing Machine Vision dataset 3 | */ 4 | 5 | \echo Creating Machine Vision table ... 6 | 7 | drop table if exists machine_vision; 8 | 9 | create table machine_vision ( 10 | mv_id int, 11 | area float, 12 | confidence char(1), 13 | install_date date, -- problem: some don't have proper dates e.g. 
"<2016-06" and some have multiple dates separated by commas 14 | iso_code_short char(2), 15 | iso_code char(6), 16 | attribution varchar(50), 17 | longitude float, 18 | latitude float, 19 | primary key (mv_id) 20 | ); 21 | 22 | \copy machine_vision from '../data/processed/machine_vision.csv' delimiter ',' csv header; 23 | 24 | /* ----------------------------------------------------------------------------- 25 | ** Edit table as necessary 26 | */ 27 | 28 | -- Create geometry columns for geographical comparison/matching 29 | -- NB: Spatial Reference ID 4326 refers to WGS84 30 | 31 | alter table machine_vision 32 | add column location geometry(Point, 4326); 33 | 34 | update machine_vision 35 | set location = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326); 36 | 37 | 38 | -------------------------------------------------------------------------------- /data/raw/machine_vision.geojson: -------------------------------------------------------------------------------- 1 | {"type": "FeatureCollection", "features": [{"type": "Feature", "geometry": {"type": "Polygon", "coordinates": [[[-4.898715334732418, 50.35443341724983], [-4.898697262371517, 50.354495023984384], [-4.898593151707162, 50.354585020048205], [-4.898384858019185, 50.35471130228192], [-4.89819790185845, 50.354781804091964], [-4.898071546583224, 50.35478386702829], [-4.897965117800202, 50.35475862735909], [-4.897809030286477, 50.35465057847919], [-4.897697218033746, 50.35449051503744], [-4.8976718451404055, 50.354383053428755], [-4.897710220354532, 50.35428799656915], [-4.897795887159217, 50.354216473347165], [-4.8980244443516625, 50.35413180492261], [-4.898336992613523, 50.35406630164386], [-4.898375745656702, 50.35404487220399], [-4.898398896214309, 50.35400429274275], [-4.89843125355013, 50.35406253395206], [-4.898571721367052, 50.35410936496741], [-4.8986629634032, 50.35417798947747], [-4.898708533548572, 50.35425982182948], [-4.898715334732418, 50.35443341724983]]]}, "properties": {"area": 4415.68707833541, "confidence": "A", "install_date": "<2016-06", "iso-3166-1": "GB", "iso-3166-2": "GB-CON", "attribution": "University of Oxford & Descartes Labs Inc."}}]} 2 | -------------------------------------------------------------------------------- /db/make-database.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Solar PV database creation and data ingest 3 | ** March 2020 4 | ** Authors: Ed Chalstery and James Geddes 5 | ** 6 | ** Prerequisites: 7 | ** i. A database named "hut23-425" is assumed to exist on the local PostgreSQL server 8 | ** 9 | ** These psql files: 10 | ** 1. Create database tables (deleting them if they exist already) 11 | ** 2. Break out the REPD tags from the OSM dataset 12 | ** 3. Deduplicate certain tables 13 | ** 14 | ** See `doc/database.md` for details 15 | */ 16 | 17 | -- Preliminaries 18 | 19 | \set ON_ERROR_STOP on 20 | alter database "hut23-425" set datestyle to "DMY"; -- to match FIT and REPD data files 21 | create schema if not exists raw; 22 | create extension if not exists postgis; 23 | 24 | \include area-adaptive-threshold.sql 25 | 26 | -- 1. Create tables and load data 27 | 28 | \include osm.sql 29 | \include repd.sql 30 | \include fit.sql 31 | \include mv.sql 32 | 33 | -- 2. Preliminary matching (REPD IDs already present in OSM) 34 | 35 | \include map-osm-repd.sql 36 | 37 | -- 3. Deduplicate 38 | 39 | \include dedup-osm.sql 40 | \include dedup-repd.sql 41 | 42 | -- 4. Find neighbours 43 | 44 | \include neighbour-finding.sql 45 | 46 | -- 5. 
Create matching table 47 | 48 | \include data-matching.sql 49 | -------------------------------------------------------------------------------- /db/neighbour-finding/osm-repd.sql: -------------------------------------------------------------------------------- 1 | -- Find the nearest neighbouring REPD object to each OSM object 2 | drop table if exists osm_repd_neighbours; 3 | select osm.osm_id, 4 | closest_pt.repd_id as closest_geo_match_from_repd_repd_id, 5 | closest_pt.co_location_repd_id as closest_geo_match_from_repd_co_location_repd_id, 6 | closest_pt.distance_meters 7 | into osm_repd_neighbours 8 | from osm 9 | CROSS JOIN LATERAL 10 | (SELECT 11 | repd.repd_id, 12 | repd.co_location_repd_id, 13 | osm.location::geography <-> repd.location::geography as distance_meters 14 | FROM repd 15 | ORDER BY osm.location::geography <-> repd.location::geography 16 | LIMIT 1) AS closest_pt; 17 | 18 | -- Create a table that compares the neighbour REPD ID to those that were found 19 | -- in the OSM data (where present) 20 | drop table if exists osm_with_existing_repd_neighbours; 21 | select osm_repd_neighbours.osm_id, 22 | osm_repd_id_mapping.repd_id as repd_id_in_osm, 23 | osm_repd_neighbours.closest_geo_match_from_repd_repd_id, 24 | osm_repd_neighbours.closest_geo_match_from_repd_co_location_repd_id, 25 | osm_repd_neighbours.distance_meters 26 | into osm_with_existing_repd_neighbours 27 | from osm_repd_neighbours, osm_repd_id_mapping 28 | where osm_repd_neighbours.osm_id = osm_repd_id_mapping.osm_id; 29 | -------------------------------------------------------------------------------- /db/data-matching.sql: -------------------------------------------------------------------------------- 1 | 2 | \echo -n Finding matches across datasets ... 3 | 4 | drop table if exists matches; 5 | create table matches ( 6 | match_rule varchar(3), 7 | master_repd_id integer, 8 | master_osm_id bigint, 9 | mv_id integer, 10 | fit_id integer 11 | ); 12 | 13 | -- OSM-REPD matching for operational installations 14 | drop table if exists repd_operational; 15 | select * into repd_operational from repd where dev_status = 'Operational'; 16 | 17 | \include data-matching-rules/rule-1.sql 18 | \include data-matching-rules/rule-2.sql 19 | \include data-matching-rules/rule-25.sql 20 | \include data-matching-rules/rule-3.sql 21 | \include data-matching-rules/rule-4.sql 22 | \include data-matching-rules/rule-5.sql 23 | 24 | -- OSM-REPD matching for non-operational installations 25 | drop table if exists repd_non_operational; 26 | select * into repd_non_operational from repd where dev_status != 'Operational'; 27 | 28 | \include data-matching-rules/rule-1a.sql 29 | \include data-matching-rules/rule-2a.sql 30 | \include data-matching-rules/rule-25a.sql 31 | \include data-matching-rules/rule-3a.sql 32 | \include data-matching-rules/rule-4a.sql 33 | \include data-matching-rules/rule-5a.sql 34 | 35 | -- REPD-MV matching 36 | \include data-matching-rules/rule-6.sql 37 | 38 | -- OSM-MV matching 39 | \include data-matching-rules/rule-7.sql 40 | -------------------------------------------------------------------------------- /data/processed/pre-process-mv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ### Create a CSV for easy import to PostgreSQL 3 | ### Get the centre of the polygons as point coordinates for the machine vision objects 4 | ### Remove the string "<2016-06" from dates as this can't be loaded as a date in PostgreSQL 5 | ### Reads from stdin, writes to stdout 
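# Output: one row per machine-vision polygon, carrying the original properties plus
# centroid x/y columns; the polygon geometry itself is dropped before writing the CSV.
# (Note: the GeoJSON path below is currently hard-coded rather than read from stdin,
# so the stdin redirection in the Makefile is effectively unused.)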
6 | 7 | import geopandas as gpd 8 | import sys 9 | 10 | machine_vision = "../raw/machine_vision.geojson" 11 | machine_vision_df = gpd.read_file(machine_vision) 12 | 13 | def getXY(pt): 14 | return (pt.x, pt.y) 15 | centroidseries = machine_vision_df['geometry'].centroid 16 | centroidlist = map(getXY, centroidseries) 17 | machine_vision_df['x'], machine_vision_df['y'] = [list(t) for t in zip(*map(getXY, centroidseries))] 18 | 19 | def remove_bad_date(dt_str): 20 | """Remove the string representing any date before 2016-06 and also only take 21 | the first of those with 2 dates; take the earlier date as correct for 22 | install date""" 23 | return(dt_str.replace("<2016-06", "").split(",")[0]) 24 | machine_vision_df['install_date'] = list(map(remove_bad_date, machine_vision_df['install_date'])) 25 | 26 | # TODO: if we need this in PostgreSQL/PostGIS, figure out the correct data type for table column 27 | machine_vision_df = machine_vision_df.drop(['geometry'], axis=1) 28 | 29 | mv_csv_str = machine_vision_df.to_csv(index=True) 30 | 31 | sys.stdout.write(mv_csv_str) 32 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-2.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 2 (see doc/matching.md) 2 | \echo -n Performing match rule 2... 3 | 4 | insert into matches 5 | select '2', repd_operational.master_repd_id, osm.master_osm_id 6 | from osm_with_existing_repd_neighbours, osm, repd_operational, repd_copy 7 | -- Table linking: 8 | where osm_with_existing_repd_neighbours.osm_id = osm.osm_id 9 | and osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_operational.repd_id 10 | and osm_with_existing_repd_neighbours.repd_id_in_osm = repd_copy.repd_id 11 | -- Not matches already found: 12 | and not exists ( 13 | select 14 | from matches 15 | where matches.master_repd_id = repd_operational.master_repd_id 16 | or matches.master_osm_id = osm.master_osm_id 17 | ) 18 | -- Matching relevant: 19 | and (osm_with_existing_repd_neighbours.repd_id_in_osm = osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id 20 | or osm_with_existing_repd_neighbours.repd_id_in_osm = osm_with_existing_repd_neighbours.closest_geo_match_from_repd_co_location_repd_id 21 | or osm_with_existing_repd_neighbours.repd_id_in_osm = repd_operational.master_repd_id 22 | or osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_copy.co_location_repd_id 23 | or osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_copy.master_repd_id 24 | ); 25 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-2a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 2a (see doc/matching.md) 2 | \echo -n Performing match rule 2a... 
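-- As rule 2, but applied to non-operational REPD entries: accept where the REPD id
-- tagged in OSM agrees with the nearest REPD neighbour (or its co-location or master
-- id), with no distance threshold.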
3 | 4 | insert into matches 5 | select '2a', repd_non_operational.master_repd_id, osm.master_osm_id 6 | from osm_with_existing_repd_neighbours, osm, repd_non_operational, repd_copy 7 | -- Table linking: 8 | where osm_with_existing_repd_neighbours.osm_id = osm.osm_id 9 | and osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_non_operational.repd_id 10 | and osm_with_existing_repd_neighbours.repd_id_in_osm = repd_copy.repd_id 11 | -- Not matches already found: 12 | and not exists ( 13 | select 14 | from matches 15 | where matches.master_repd_id = repd_non_operational.master_repd_id 16 | or matches.master_osm_id = osm.master_osm_id 17 | ) 18 | -- Matching relevant: 19 | and (osm_with_existing_repd_neighbours.repd_id_in_osm = osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id 20 | or osm_with_existing_repd_neighbours.repd_id_in_osm = osm_with_existing_repd_neighbours.closest_geo_match_from_repd_co_location_repd_id 21 | or osm_with_existing_repd_neighbours.repd_id_in_osm = repd_non_operational.master_repd_id 22 | or osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_copy.co_location_repd_id 23 | or osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_copy.master_repd_id 24 | ); 25 | -------------------------------------------------------------------------------- /doc/preprocessing.md: -------------------------------------------------------------------------------- 1 | Manual pre-processing of datasets 2 | =========== 3 | 4 | These are the manual data transformations made to get the files in `data/raw` from `data/as_received`. 5 | 6 | FiT 7 | ---- 8 | 9 | No transformation needed -- running `make` in the `data/raw` folder should be sufficient to combine the 3 Excel spreadsheets into one and convert to CSV, yielding `fit.csv`. 10 | 11 | OSM (csv) 12 | ---- 13 | 14 | Manual edits are sometimes needed to fix typos in the uncontrolled OSM source data. We have fixed many of them, and also caught them in the preprocessing of `compile_osm_solar.py`, but there could be others in future OSM data releases. Here are examples of edits we made: 15 | 16 | 1. Example edits: 17 | 18 | - Changed "14Synthetic"... to "14" for id=7784835486 in generator:solar:modules column. 19 | - Changed "8node 0" to "8" for id=7772459006 in generator:solar:modules column 20 | - Changed "36node 0" to "36" for id=7772459035 in generator:solar:modules column 21 | - Changed "17generator:method" to "17" for id=7791014395 in generator:solar:modules column 22 | - For object id=835531116, move "SE" from wrong column generator:solar:modules to orientation 23 | 24 | 2. Save as `data/raw/osm.csv` 25 | 26 | REPD 27 | ---- 28 | 29 | 1. Delete one unusual date/time cell: "00/01/1900" 30 | 2. Save as `data/raw/repd.csv`. 31 | 32 | Machine vision dataset 33 | ----- 34 | 35 | No changes. Save as `data/raw/machine_vision.geojson`. 36 | -------------------------------------------------------------------------------- /db/fit.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Create table containing fit dataset 3 | */ 4 | 5 | \echo Creating fit table ...
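-- Loads ../data/processed/fit.csv (produced by data/processed/pre-process-fit.py) into
-- raw.fit, then keeps only 'Photovoltaic' rows in the final fit table.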
6 | 7 | drop table if exists raw.fit; 8 | drop table if exists fit; 9 | 10 | create table raw.fit ( 11 | fit_id int, 12 | extension char(1), 13 | postcode_stub varchar(7), 14 | technology varchar(24), 15 | installed_capacity float, 16 | declared_net_capacity float, 17 | application_date date, 18 | commissioning_date date, 19 | mcs_issue_date date, 20 | export_status varchar(30), 21 | tariff_code varchar(20), 22 | tariff_description varchar(100), 23 | installation_type varchar(25), 24 | country varchar(15), 25 | local_authority varchar(40), 26 | govt_office_region varchar(40), 27 | constituency varchar(60), 28 | accreditation_route varchar(6), 29 | mpan_prefix float, 30 | comm_school varchar(40), 31 | llsoa_code varchar(20) 32 | ); 33 | 34 | \copy raw.fit from '../data/processed/fit.csv' delimiter ',' csv header; 35 | 36 | -- Restrict raw.fit to Solar PV only 37 | select * into fit 38 | from raw.fit 39 | where technology = 'Photovoltaic'; 40 | 41 | -- Add roughly calculated area for raw.fit entries to aid matching. 42 | -- Solar PV has power approx. 52W / m^2 43 | alter table fit 44 | add column "area" float; 45 | 46 | update fit 47 | set area = declared_net_capacity/52; 48 | -------------------------------------------------------------------------------- /data/raw/Makefile: -------------------------------------------------------------------------------- 1 | # Preprocess OSM extract into filtered subsets; and compile FiT Excels into csv. 2 | 3 | all: osm-gb-solaronly.osm.pbf osm-gb-solaronly.xml osm-gb-solaronly.geojson osm.csv fit.csv repd.csv 4 | 5 | 6 | # basic OSM solarfiltering -- reduces 1.5 GB to approx 2 MB 7 | osm-gb-solaronly.osm.pbf: ../as_received/great-britain-latest.osm.pbf 8 | osmium tags-filter $< generator:method=photovoltaic plant:method=photovoltaic plant:source=solar -o $@ 9 | 10 | # format-shifting OSM->XML 11 | osm-gb-solaronly.xml: osm-gb-solaronly.osm.pbf 12 | osmium cat $< -o $@ 13 | 14 | # format-shifting OSM->GeoJSON requires each "layer" to be exported 15 | osm-gb-solaronly.geojson: osm-gb-solaronly.osm.pbf 16 | ogr2ogr -f GeoJSON -overwrite -addfields $@ $< points -nln merged 17 | ogr2ogr -f GeoJSON -update -append -addfields $@ $< lines -nln merged 18 | ogr2ogr -f GeoJSON -update -append -addfields $@ $< multilinestrings -nln merged 19 | ogr2ogr -f GeoJSON -update -append -addfields $@ $< multipolygons -nln merged 20 | ogr2ogr -f GeoJSON -update -append -addfields $@ $< other_relations -nln merged 21 | 22 | osm.csv: osm-gb-solaronly.xml 23 | python3 compile_osm_solar.py 24 | 25 | fit.csv: ../as_received/installation_report_apr2020_part_1.xlsx 26 | python3 convert_fit_excel_to_csv.py 27 | 28 | repd.csv: ../as_received/renewable-energy-planning-database-march-2020.csv 29 | cat $< | sed -e "s|00/01/1900||g" > $@ 30 | 31 | clean: 32 | rm osm.csv fit.csv osm-gb-solaronly.osm.pbf osm-gb-solaronly.xml osm-gb-solaronly.geojson 33 | 34 | -------------------------------------------------------------------------------- /data/raw/convert_fit_excel_to_csv.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import openpyxl 3 | from openpyxl import load_workbook 4 | 5 | # tested using openpyxl 3.0.3 on python 3.7.5, ubuntu 18.04 6 | 7 | # TODO this script should be merged in with data/processed/pre-process-fit.py 8 | 9 | headercols = None 10 | rowswritten = 0 11 | 12 | def formatcell(item): 13 | "Appropriate conversions for CSV output" 14 | if item.value is None: 15 | return "" 16 | return str(item.internal_value) 17 | 18 | 
with open("fit.csv", "wt") as outfp: 19 | for infname in sorted(glob.glob("../as_received/installation_report_*_part_*.xlsx")): 20 | print(infname) 21 | wb = load_workbook(filename=infname, read_only=True) 22 | 23 | gotheader = False 24 | for whichrow, row in enumerate(wb.active.iter_rows()): 25 | at_repeated_header = False 26 | try: 27 | rowstrs = [formatcell(item) for item in row] 28 | except AttributeError as err: 29 | print(err) 30 | print(" skipping row %i (merged cell?)" % whichrow) 31 | continue 32 | for item in rowstrs: 33 | assert("," not in item) 34 | # Here we're detecting hitting the header in this XLS: 35 | if rowstrs[0] and (rowstrs[0].startswith("Extension")): 36 | gotheader = True 37 | #print(headercols) 38 | if headercols: 39 | assert(headercols==rowstrs) 40 | at_repeated_header = True 41 | else: 42 | headercols = rowstrs 43 | if gotheader and not at_repeated_header: 44 | #if whichrow < 20: 45 | # print(rowstrs[0]) 46 | outfp.write(",".join(rowstrs) + "\n") 47 | rowswritten += 1 48 | del wb 49 | 50 | print("Written %i rows" % rowswritten) 51 | assert(rowswritten > 10000) 52 | assert(gotheader) 53 | 54 | -------------------------------------------------------------------------------- /data/processed/pre-process-fit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ### Process the FiT data to add an index column 3 | ### Reads from stdin, writes to stdout 4 | 5 | import sys 6 | import pandas as pd 7 | import numpy as np 8 | 9 | fit_df = pd.read_csv(sys.stdin) 10 | 11 | # Check the file has the columns we expect and order them as we expect 12 | # If the columns don't exist, make the column empty 13 | output_df = pd.DataFrame() 14 | required_columns = ['Extension (Y/N)', 15 | 'PostCode', 16 | 'Technology', 17 | 'Installed capacity', 18 | 'Declared net capacity', 19 | 'Application date', 20 | 'Commissioning date', 21 | 'MCS issue date', 22 | 'Export status', 23 | 'TariffCode', 24 | 'Tariff Description', 25 | 'Installation Type', 26 | 'Installation Country', 27 | 'Local Authority', 28 | 'Government Office Region', 29 | 'Constituency', 30 | 'Accreditation Route', 31 | 'MPAN Prefix', 32 | 'Community school category', 33 | 'LLSOA Code'] 34 | 35 | for col in required_columns: 36 | try: 37 | output_df[col] = fit_df[col] 38 | except KeyError: 39 | output_df[col] = np.nan 40 | 41 | # Also at this point reduce to PV only (reduces data volumes) 42 | output_df = output_df[output_df['Technology']=='Photovoltaic'] 43 | 44 | # Add index column 45 | fit_csv_str = output_df.to_csv(index=True) 46 | 47 | sys.stdout.write(fit_csv_str) 48 | -------------------------------------------------------------------------------- /db/osm.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Create table containing OSM database 3 | */ 4 | 5 | \echo Creating OSM table ... 
6 | 7 | drop table if exists raw.osm; 8 | 9 | create table raw.osm ( 10 | objtype varchar(8), 11 | osm_id bigint, 12 | username varchar(60), 13 | time_created date, 14 | latitude float, 15 | longitude float, 16 | area float, 17 | capacity float, 18 | modules float, 19 | located varchar(20), 20 | orientation float, 21 | plantref varchar(20), 22 | source_capacity varchar(255), 23 | source_obj varchar(255), 24 | tag_power varchar(15), 25 | repd_id_str varchar(20), 26 | tag_start_date date, 27 | primary key (osm_id) 28 | ); 29 | 30 | \copy raw.osm from '../data/processed/osm.csv' delimiter ',' csv header; 31 | 32 | /* ----------------------------------------------------------------------------- 33 | ** Edit table as necessary 34 | */ 35 | 36 | -- coerce some float columns to int, since float res is excessive (but sometimes present in the input) 37 | 38 | alter table raw.osm 39 | alter column modules 40 | type bigint using round(modules)::bigint; 41 | alter table raw.osm 42 | alter column orientation 43 | type integer using round(orientation)::integer; 44 | 45 | -- our input is using kW, convert to MW for coherence with use elsewhere in the db 46 | update raw.osm set capacity = 0.001 * capacity; 47 | 48 | -- Create geometry columns for geographical comparison/matching 49 | -- NB: Spatial Reference ID 4326 refers to WGS84 50 | 51 | alter table raw.osm 52 | add column location geometry(Point, 4326); 53 | 54 | update raw.osm 55 | set location = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326); 56 | -------------------------------------------------------------------------------- /data/processed/pre-process-osm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ### Process the OSM data to fix date formatting 3 | ### Reads from stdin, writes to stdout 4 | 5 | import sys 6 | import pandas as pd 7 | from dateutil.parser import parse 8 | 9 | osm_df = pd.read_csv(sys.stdin) 10 | 11 | # Check the file has the columns we expect and order them as we expect 12 | # If the columns don't exist, make the column empty 13 | output_df = pd.DataFrame() 14 | required_columns = ['objtype', 15 | 'id', 16 | 'user', 17 | 'timestamp', 18 | 'lat', 19 | 'lon', 20 | 'calc_area', 21 | 'calc_capacity', 22 | 'generator:solar:modules', 23 | 'location', 24 | 'orientation', 25 | 'plantref', 26 | 'source_capacity', 27 | 'source_obj', 28 | 'tag_power', 29 | 'tag_repd:id', 30 | 'tag_start_date'] 31 | 32 | for col in required_columns: 33 | try: 34 | output_df[col] = osm_df[col] 35 | except KeyError: 36 | output_df[col] = np.nan 37 | 38 | # Edit tagged date column with pandas 39 | dates = [] 40 | before_date_strs = ['before '] 41 | after_date_strs = ['.'] 42 | mistakes = [('-00', '-01')] 43 | for date in output_df['tag_start_date']: 44 | if pd.notna(date): 45 | og_date = date 46 | date = str(date) 47 | for string in before_date_strs: 48 | date = date.split(string)[-1] 49 | for string in after_date_strs: 50 | date = date.split(string)[0] 51 | for mistake, correction in mistakes: 52 | date = date.replace(mistake, correction) 53 | dates.append(parse(date, ignoretz=True, default=parse('2020-01-01'))) 54 | else: 55 | dates.append(None) 56 | output_df['tag_start_date'] = dates 57 | 58 | osm_csv_str = output_df.to_csv(index=False) 59 | 60 | sys.stdout.write(osm_csv_str) 61 | -------------------------------------------------------------------------------- /doc/matching.md: -------------------------------------------------------------------------------- 1 | % Matching 2 | 3 | 
This note documents the matching of entities between the various datasets in the 4 | `hut23-425` database. 5 | 6 | 7 | # Match Rules: 8 | 9 | ## OSM-REPD 10 | 11 | First, proximity match to get the nearest neighbouring REPD for every single OSM object, 12 | 13 | 1. The master REPD id of the REPD id in OSM, where this is < 500m from the nearest neighbour REPD object to an OSM object 14 | 2. The nearest neighbour in meters of OSM entries to REPD entries where the REPD ID is the same as that already present in OSM data. Accept those that are the same (regardless of distance). This should catch any that aren't covered by rule 0. (Implementation note: added rule 2.5 which comes after rule 2, implementing the same idea with a different proximity search. It re-runs the nearest neighbour search but only for matching REPD IDs rather than any REPD ID.) 15 | 3. Nearest neighbour for remaining OSM/REPD where the OSM `objtype` is `way` or `relation` and the object is a "group master" (the `osm_id = master_osm_id` or `repd_id = master_repd_id`). Accept all matches that are below a distance threshold of 700 meters. 16 | 4. Nearest REPD neighbour for OSM nodes, where REPD has "Scheme" in the title, with a distance threshold of 5KM. 17 | 5. Repeat 4, with no distance threshold 18 | 19 | Repeat matching rules 1-5 for non-operational REPD entries. Label these match rules 1a, 2a etc. 20 | 21 | ## Machine Vision dataset 22 | 23 | First, proximity match to get the nearest neighbouring OSM way/relation for every single Machine Vision object and separately, the nearest REPD for each Machine Vision Object. 24 | 25 | 6. Nearest neighbour REPD to each Machine Vision object, below a 1000m distance threshold. 26 | 7. Nearest neighbour OSM way/relation to each Machine Vision object, below a 1000m distance threshold. 27 | 28 | # Matches 29 | 30 | | Rule | Count | Total | 31 | | --- | --- | --- | 32 | | 1 | 825 | 825 | 33 | | 2 | 0 | 825 | 34 | | 3 | 88 | 913 | 35 | | 4 | 1,192| 2,105| 36 | | 5 | 68 | 2,173| 37 | | 1a | 2 | 2,175| 38 | | 2a | 0 | 2,175| 39 | | 3a | 46 | 2,221| 40 | | 4a | 893 | 3,114| 41 | | 5a | 0 | 3,114| 42 | | ---- | ----- | ----- | 43 | | 6 | 1,759| | 44 | | 7 | 1,851| | 45 | 46 | # Notes 47 | 48 | Doesn't look like any of the nodes where nearest REPD is not a scheme are genuine matches. Many matches to "Goldthorpe" REPD, which made me think this could be a scheme. Unlikely though, because the operator is Aldi. 49 | 50 | Rule 4/5 are intended to distinguish between those OSM nodes that are very likely to be part of a scheme and those that are slightly less likely. 51 | -------------------------------------------------------------------------------- /db/dedup-repd.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Deduplicate the REPD dataset. 3 | ** 4 | ** Identify individual entries as being the same solar farm if: 5 | ** 1. They are within a certain disance; and 6 | ** 2. They have a "similar" name 7 | ** 8 | ** Names are similar if their reduced forms are close by trigram matching. 9 | ** The reduced form of a name is the name with certain common words (like "Farm") removed. 10 | ** 11 | */ 12 | 13 | create extension if not exists pg_trgm; -- trigram matching 14 | 15 | \echo -n Deduplicating REPD dataset ... 16 | 17 | -- PARAMETERS: 18 | -- 19 | -- cluster_distance is the distance (in metres) within which we count 20 | -- two objects as potentially being part of the same cluster. 
21 | -- 22 | -- name_distance is the threshold (in trigram matching) for counting two site 23 | -- names as potentially representing the same site. 24 | 25 | \set cluster_distance 1380 26 | \set identical_cluster_distance 5 27 | \set name_distance 0.2 28 | 29 | 30 | create temporary view repd_parts(repd_id1, repd_id2) as 31 | with temp(repd_id, location, capacity, reduced_site_name) as ( 32 | -- Within `site_name` remove the following strings: 33 | -- solar, Solar, park, Park, farm, Farm, resubmission, (resubmission), (Resubmission), 34 | -- extension, Extension, () 35 | -- also remove ' - ' and reduce two consecutive spaces to one. 36 | select repd_id, 37 | location, 38 | capacity, 39 | regexp_replace( 40 | regexp_replace( 41 | site_name, 42 | 'solar|Solar|park|Park|farm|Farm|\(resubmission\)|\(Resubmission\)|resubmission|Resubmission|extension|Extension|\(\)', '', 'g'), 43 | ' +', ' ', 'g') as reduced_site_name 44 | from repd 45 | ) 46 | select x.repd_id as repd_id1, 47 | y.repd_id as repd_id2 48 | from temp as x cross join temp as y 49 | -- two sites are the same if they are close and have similar names; or if they 50 | -- are so close as to be clearly the same. 51 | where x.repd_id != y.repd_id 52 | and ((x.location::geography <-> y.location::geography < 53 | area_adaptive_threshold(NULL, NULL, x.capacity, y.capacity) 54 | and similarity(x.reduced_site_name, y.reduced_site_name) >= :name_distance) 55 | or x.location::geography <-> y.location::geography < :identical_cluster_distance); 56 | 57 | 58 | -- repd_clusters(repd_id1, repd_id2) 59 | -- Objects that can be reached through a chain of connections 60 | 61 | create temporary view repd_clusters as 62 | with recursive repd_clusters(repd_id1, repd_id2) as ( 63 | select repd_id1, repd_id2 64 | from repd_parts 65 | union 66 | select repd_clusters.repd_id1 as repd_id1, repd_parts.repd_id2 as repd_id2 67 | from repd_clusters cross join repd_parts 68 | where repd_clusters.repd_id2 = repd_parts.repd_id1 69 | ) 70 | select repd_id1, repd_id2 FROM repd_clusters; 71 | 72 | 73 | -- repd_dedup(repd_id, master_repd_id) 74 | -- master_repd_id is the largest repd_id over all objects within the 75 | -- same cluster 76 | 77 | create temporary view repd_dedup as 78 | select repd_id1 as repd_id, max(repd_id2) as master_repd_id 79 | from repd_clusters 80 | group by repd_id1; 81 | 82 | /* 83 | ** Merge the new groupings into the repd table 84 | */ 85 | 86 | alter table repd 87 | add column master_repd_id integer; -- default is NULL 88 | 89 | \echo clustering ... 90 | 91 | update repd 92 | set (master_repd_id) = 93 | (select master_repd_id 94 | from repd_dedup 95 | where repd_dedup.repd_id = repd.repd_id) 96 | where master_repd_id is null; 97 | 98 | -- Add master id identical to id for all singletons, to aid matching 99 | update repd 100 | set master_repd_id = repd_id 101 | where master_repd_id is null; 102 | -------------------------------------------------------------------------------- /db/repd.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Create table containing REPD dataset 3 | */ 4 | 5 | \echo Creating REPD table ... 
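-- Loads ../data/processed/repd.csv (produced by data/processed/pre-process-repd.py) into
-- raw.repd, then keeps only 'Solar Photovoltaics' rows in the final repd table.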
6 | 7 | drop table if exists raw.repd; 8 | drop table if exists repd cascade; 9 | 10 | create table raw.repd ( 11 | old_repd_id varchar(15), 12 | repd_id integer, 13 | record_last_updated date, 14 | operator varchar(100), 15 | site_name varchar(100), 16 | tech_type varchar(40), 17 | storage_type varchar(40), 18 | co_location_repd_id float, -- can't make this int as csv contains NaN, but this is fixed below 19 | capacity varchar(8), 20 | chp_enabled varchar(3), 21 | ro_banding varchar(10), 22 | fit_tariff float, 23 | cfd_capacity varchar(10), 24 | turbine_capacity varchar(10), 25 | num_turbines varchar(10), 26 | height_turbines varchar(10), 27 | mounting_type varchar(10), 28 | dev_status varchar(40), 29 | dev_status_short varchar(30), 30 | address varchar(300), 31 | county varchar(30), 32 | region varchar(20), 33 | country varchar(20), 34 | postcode varchar(15), 35 | x float, 36 | y float, 37 | planning_authority varchar(70), 38 | planning_application_reference varchar(50), 39 | appeal_reference varchar(50), 40 | sec_state_ref varchar(50), 41 | type_sec_state_intervention varchar(20), 42 | judicial_review float, 43 | offshore_wind_round varchar(10), 44 | planning_application_submitted date, 45 | planning_application_withdrawn date, 46 | planning_permission_refused date, 47 | appeal_lodged date, 48 | appeal_withdrawn date, 49 | appeal_refused date, 50 | appeal_granted date, 51 | planning_permission_granted date, 52 | sec_state_intervened date, 53 | sec_state_refused date, 54 | sec_state_granted date, 55 | planning_permission_expired date, 56 | under_construction date, 57 | operational date, 58 | latitude float, 59 | longitude float, 60 | primary key (repd_id) 61 | ); 62 | 63 | \copy raw.repd from '../data/processed/repd.csv' delimiter ',' csv header; 64 | 65 | 66 | /* ----------------------------------------------------------------------------- 67 | ** Edit table as necessary 68 | */ 69 | 70 | -- Change co_location_repd_id to integer 71 | alter table raw.repd 72 | alter column co_location_repd_id 73 | set data type int 74 | using cast (co_location_repd_id as integer); 75 | 76 | alter table raw.repd 77 | alter column capacity 78 | type float using (capacity)::float; 79 | 80 | -- Create geometry columns for geographical comparison/matching 81 | -- NB: Spatial Reference ID 4326 refers to WGS84 82 | 83 | alter table raw.repd 84 | add column location geometry(Point, 4326); 85 | 86 | update raw.repd 87 | set location = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326); 88 | 89 | /* ----------------------------------------------------------------------------- 90 | ** Generate final table 91 | */ 92 | 93 | -- Restrict REPD to operational Solar PV only 94 | select * into repd 95 | from raw.repd 96 | where tech_type = 'Solar Photovoltaics'; 97 | 98 | alter table repd 99 | drop column tech_type; 100 | -------------------------------------------------------------------------------- /db/dedup-osm.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Deduplicate the OSM dataset and add field `master_osm_id` 3 | ** 4 | ** Some OSM rows represent geographical entities that are in reality component parts 5 | ** of a single installation (often a solar "farm"). 6 | ** This script adds a field `master_osm_id` that contains a unique `osm_id` for all 7 | ** members of a single group. 8 | */ 9 | 10 | \echo -n Deduplicating OSM dataset ... 
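-- Illustrative example: if ways A and B lie within the area-adaptive threshold of each
-- other, and B and C do too, the recursive closure below puts A, B and C in one cluster,
-- and all three receive master_osm_id = max(A, B, C).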
11 | 12 | -- PARAMETERS: cluster_distance is the distance (in metres) within which we count 13 | -- two objects as certainly being part of the same cluster. 14 | 15 | \set cluster_distance 300 16 | 17 | drop table if exists osm cascade; 18 | drop table if exists osm_dedup; 19 | 20 | -- The field plantref, if not null, contains a value of the form 'way/123456789'. 21 | -- We extract the part after the "/", which corresponds to another osm_id. 22 | 23 | select objtype, 24 | osm_id, 25 | username, 26 | time_created, 27 | latitude, 28 | longitude, 29 | area, 30 | capacity, 31 | modules, 32 | located, 33 | orientation, 34 | cast(split_part(plantref, '/', 2) as bigint) as master_osm_id, 35 | source_capacity, 36 | source_obj, 37 | tag_power, 38 | repd_id_str, 39 | tag_start_date, 40 | location 41 | into osm 42 | from raw.osm; 43 | 44 | /* 45 | ** Deduplicate objects that are part of the same farm 46 | ** 47 | ** 1. Find groups of objects within 300m of each other; 48 | ** 2. Call these "the same" and close over this equivalence relation 49 | ** 3. Choose one osm_id from each equivalence class 50 | */ 51 | 52 | \echo clustering ... 53 | 54 | -- osm_parts(osm_id1, osm_id2) 55 | -- All pairs of objects that are within 300m of each other 56 | 57 | create temporary view osm_parts as 58 | with maybe_dupes(osm_id, location, area, capacity) as ( 59 | -- ignore nodes, (various misspellings of) rooftop things, 60 | -- and cases where there is already a master_osm_id. 61 | -- NB. "X is not true" is true if X is false or X is null 62 | select osm_id, location, area, capacity from osm 63 | where 64 | objtype != 'node' 65 | and (located in ('roof', 'rood', 'roofq', 'rof', 'roofs')) is not true 66 | and master_osm_id is null 67 | ) 68 | -- find objects within appropriate distance of each other 69 | select md1.osm_id as osm_id1, md2.osm_id as osm_id2 70 | from maybe_dupes as md1, maybe_dupes as md2 71 | where md1.osm_id != md2.osm_id 72 | and md1.location::geography <-> md2.location::geography < 73 | area_adaptive_threshold(md1.area, md2.area, md1.capacity, md2.capacity); 74 | 75 | -- osm_clusters(osm_id1, osm_id2) 76 | -- Objects that can be reached through a chain of connections 77 | 78 | create temporary view osm_clusters as 79 | with recursive osm_clusters(osm_id1, osm_id2) as ( 80 | select osm_id1, osm_id2 81 | from osm_parts 82 | union 83 | select osm_clusters.osm_id1 as osm_id1, osm_parts.osm_id2 as osm_id2 84 | from osm_clusters cross join osm_parts 85 | where osm_clusters.osm_id2 = osm_parts.osm_id1 86 | ) 87 | select osm_id1, osm_id2 FROM osm_clusters; 88 | 89 | -- osm_dedup(osm_id, master_osm_id) 90 | -- master_osm_id is the largest osm_id over all objects within the 91 | -- same cluster 92 | 93 | select osm_id1 as osm_id, max(osm_id2) as master_osm_id 94 | into osm_dedup 95 | from osm_clusters 96 | group by osm_id1; 97 | 98 | /* 99 | ** Merge the new groupings into the osm table 100 | */ 101 | 102 | update osm 103 | set (master_osm_id) = 104 | (select master_osm_id 105 | from osm_dedup 106 | where osm_dedup.osm_id = osm.osm_id) 107 | where master_osm_id is null; 108 | 109 | -- Add master id identical to id for all singletons, to aid matching 110 | update osm 111 | set master_osm_id = osm_id 112 | where master_osm_id is null; 113 | -------------------------------------------------------------------------------- /data/processed/pre-process-repd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ### Process the REPD data to remove oddities 
3 | ### Reads from stdin, writes to stdout 4 | 5 | from bng_to_latlon import OSGB36toWGS84 as convert 6 | import sys 7 | import pandas as pd 8 | import re 9 | 10 | # Remove "carriage returns" and the dagger symbol 11 | def clean_repd_csv(csv_str): 12 | csv_str = csv_str.replace("\r", "").replace("†", "") 13 | rgx_list = ['\s*\s*"'] 14 | for rgx_match in rgx_list: 15 | csv_str = re.sub(rgx_match, '"', csv_str) 16 | return csv_str 17 | 18 | sys.stdin.reconfigure(encoding='iso-8859-1') 19 | repd_df = pd.read_csv(sys.stdin, skiprows=1) 20 | 21 | # Check the file has the columns we expect and order them as we expect 22 | # If the columns don't exist, make the column empty 23 | output_df = pd.DataFrame() 24 | required_columns = ['Old Ref ID', 25 | 'Ref ID', 26 | 'Record Last Updated (dd/mm/yyyy)', 27 | 'Operator (or Applicant)', 28 | 'Site Name', 29 | 'Technology Type', 30 | 'Storage Type', 31 | 'Storage Co-location REPD Ref ID', 32 | 'Installed Capacity (MWelec)', 33 | 'CHP Enabled', 34 | 'RO Banding (ROC/MWh)', 35 | 'FiT Tariff (p/kWh)', 36 | 'CfD Capacity (MW)', 37 | 'Turbine Capacity (MW)', 38 | 'No. of Turbines', 39 | 'Height of Turbines (m)', 40 | 'Mounting Type for Solar', 41 | 'Development Status', 42 | 'Development Status (short)', 43 | 'Address', 44 | 'County', 45 | 'Region', 46 | 'Country', 47 | 'Post Code', 48 | 'X-coordinate', 49 | 'Y-coordinate', 50 | 'Planning Authority', 51 | 'Planning Application Reference', 52 | 'Appeal Reference', 53 | 'Secretary of State Reference', 54 | 'Type of Secretary of State Intervention', 55 | 'Judicial Review', 56 | 'Offshore Wind Round', 57 | 'Planning Application Submitted', 58 | 'Planning Application Withdrawn', 59 | 'Planning Permission Refused', 60 | 'Appeal Lodged', 61 | 'Appeal Withdrawn', 62 | 'Appeal Refused', 63 | 'Appeal Granted', 64 | 'Planning Permission Granted', 65 | 'Secretary of State - Intervened', 66 | 'Secretary of State - Refusal', 67 | 'Secretary of State - Granted', 68 | 'Planning Permission Expired', 69 | 'Under Construction', 70 | 'Operational'] 71 | 72 | for col in required_columns: 73 | try: 74 | output_df[col] = repd_df[col] 75 | except KeyError: 76 | output_df[col] = np.nan 77 | 78 | # Also at this point reduce to PV only (reduces data volumes) 79 | output_df = output_df[output_df['Technology Type']=='Solar Photovoltaics'] 80 | 81 | # Remove thousand-separator commas from number fields 82 | output_df['Storage Co-location REPD Ref ID'] = output_df['Storage Co-location REPD Ref ID'].map(lambda x: float(str(x).replace(',',''))) 83 | output_df['X-coordinate'] = output_df['X-coordinate'].map(lambda x: float(str(x).replace(',',''))) 84 | output_df['Y-coordinate'] = output_df['Y-coordinate'].map(lambda x: float(str(x).replace(',',''))) 85 | 86 | # Remove spaces from postcodes 87 | output_df['Post Code'] = output_df['Post Code'].map(lambda x: str(x).replace(' ','')) 88 | 89 | # Ensure the tariff is numeric 90 | output_df['FiT Tariff (p/kWh)'] = output_df['FiT Tariff (p/kWh)'].map(lambda x: float(x)) 91 | 92 | # Remove line breaks from within certain fields 93 | output_df['Site Name'] = output_df['Site Name'].str.strip() 94 | output_df['Address'] = output_df['Address'].str.replace('\r\n', ', ') 95 | output_df['Appeal Reference'] = output_df['Appeal Reference'].map(lambda x: str(x).replace('\r\n', '')) 96 | 97 | # Convert each BNG coordinate to lat and lon and add new columns 98 | for index, row in output_df.iterrows(): 99 | # Ignore any rows that don't have an X and Y coordinate 100 | if pd.notna(row['X-coordinate']) and 
pd.notna(row['Y-coordinate']): 101 | lat, lon = convert(row['X-coordinate'], row['Y-coordinate']) 102 | output_df.at[index,'latitude'] = lat 103 | output_df.at[index,'longitude'] = lon 104 | 105 | repd_csv_str = output_df.to_csv(index=False) 106 | 107 | # Make generic edits and write out 108 | sys.stdout.write(clean_repd_csv(repd_csv_str)) 109 | -------------------------------------------------------------------------------- /db/export.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Solar PV database export to CSV etc 3 | ** April--August 2020 4 | ** Author: Dan Stowell 5 | */ 6 | 7 | -- "Suggested REPD IDs for OSM" 8 | CREATE TEMP TABLE "tmp_export_osm_repd" AS 9 | SELECT match_rule, osm.objtype, osm.osm_id, osm.latitude as "osm_latitude", osm.longitude as "osm_longitude", osm.repd_id_str as "osm_repd_id", repd.* 10 | FROM matches 11 | LEFT JOIN repd ON matches.master_repd_id=repd.repd_id 12 | LEFT JOIN osm ON matches.master_osm_id=osm.osm_id 13 | WHERE osm.repd_id_str IS NULL -- this one excludes all OSM items with no listed REPD 14 | OR (NOT osm.repd_id_str=cast(repd.repd_id as varchar(10))) -- and this one adds in the ones present but mismatching 15 | ORDER BY (match_rule IN ('4', '4a')), repd.repd_id; 16 | 17 | \copy "tmp_export_osm_repd" TO '../data/exported/osm_repd_proposed_matches.csv' WITH DELIMITER ',' CSV HEADER; 18 | 19 | -- "Grand unified [over osm & repd] CSV of PV geolocations" 20 | CREATE TEMP TABLE "tmp_export_pvgeo" AS 21 | SELECT DISTINCT osm.objtype as osm_objtype, osm.osm_id, 22 | repd.repd_id, repd.site_name as repd_site_name, 23 | repd.capacity::float as "capacity_repd_MWp", osm.capacity as "capacity_osm_MWp", 24 | COALESCE(osm.latitude, repd.latitude) as latitude, 25 | COALESCE(osm.longitude, repd.longitude) as longitude, 26 | osm.area as area_sqm, osm.located, osm.orientation, osm.tag_power as osm_power_type, osm.tag_start_date as osm_tag_start_date, 27 | osm.modules as num_modules, -- osm.source_obj as osm_source_obj, osm.source_capacity as osm_source_capacity, 28 | repd.dev_status_short as repd_status, repd.operational as repd_operational_date, repd.old_repd_id, 29 | osm.master_osm_id as osm_cluster_id, repd.master_repd_id as repd_cluster_id, 30 | source_capacity, source_obj, 31 | matches.match_rule 32 | FROM (matches 33 | FULL JOIN repd ON (matches.master_repd_id=repd.master_repd_id 34 | AND repd.dev_status_short NOT IN ('Abandoned', 'Application Refused', 'Application Withdrawn', 'Planning Permission Expired') 35 | AND match_rule NOT IN ('4', '4a')) -- skip matches that were "schemes" 36 | FULL JOIN osm ON (matches.master_osm_id=osm.master_osm_id 37 | )) 38 | ORDER BY repd.repd_id, osm.osm_id; 39 | 40 | -- Delete irrelevant REPD entries (i.e. no OSM ID and status hints nonexistence). 41 | -- (Doing this in the main query was not working correctly for some reason.) 42 | -- Note that some statuses e.g. 'Awaiting Construction', 'Under Construction' imply they are likely to exist soon. 43 | -- Here we try to strike a balance between being conservative (which means, ignore everything not flagged as operational) 44 | -- and accounting for the time-lag in the official data (which means, include items likely to come online very soon). 
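-- For example (illustrative of the rule below): a REPD-only row (no matched OSM object) with
-- status 'Awaiting Construction' is kept, since it is expected online soon, whereas a
-- REPD-only row with status 'Application Submitted', or with no status at all, is deleted.
-- Rows that do have a matched OSM object are never deleted here, whatever their REPD status.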
45 | DELETE FROM tmp_export_pvgeo WHERE (osm_id IS NULL) AND ((repd_status IS NULL) OR 46 | -- (repd_status IN ('Abandoned', 'Application Refused', 'Application Submitted', 'Application Withdrawn', 'Awaiting Construction', 'Planning Permission Expired'))); 47 | (NOT repd_status IN ('No Application Required', 'Operational', 'Under Construction', 'Awaiting Construction'))); 48 | 49 | 50 | -- Copy in, and redistribute, the REPD official capacities: if there are multiple clusters with the same REPD ID, split the capacity equally over them. Any item that's not the cluster representative should not list the repd capacity. This way, the REPD capacity column can be meaningfully summed. 51 | -- (Note: NOT across items with the same cluster ID. We do not want to spread the REPD capacities over every subelement, but when there are multiple "top-level" REPD matches we have no alternative but to spread the capacity to ensure we don't double-count the capacity.) 52 | UPDATE tmp_export_pvgeo SET "capacity_repd_MWp"=NULL WHERE NOT (osm_id=osm_cluster_id or osm_id is null); -- Note, do not null out the REPD cap where not repd-cluster, since each REPD may have its own cap. 53 | UPDATE tmp_export_pvgeo SET "capacity_repd_MWp"=portioned FROM ( 54 | select repd_cluster_id, "capacity_repd_MWp", "capacity_repd_MWp"::float/COUNT(repd_cluster_id) as portioned from tmp_export_pvgeo where repd_cluster_id>0 and repd_id=repd_cluster_id and (osm_id=osm_cluster_id or osm_id is null) group by repd_cluster_id, "capacity_repd_MWp") as portiontable WHERE tmp_export_pvgeo.repd_cluster_id=portiontable.repd_cluster_id and tmp_export_pvgeo.repd_cluster_id=tmp_export_pvgeo.repd_id and (osm_id=osm_cluster_id or osm_id is null); 55 | UPDATE tmp_export_pvgeo SET "capacity_repd_MWp"=NULL WHERE "capacity_repd_MWp"=0; -- proper NA entries. 56 | 57 | -- We do the same for duplicated OSM IDs too. 58 | -- It may seem strange that we have OSM IDs duplicated, but the reason is that they can meaningfully match two REPD entries (eg a farm, plus its extension). 59 | -- In such a case the OSM ID appears in two rows. We don't want the osm-tagged capacity double-counted, so we subdivide it. 60 | UPDATE tmp_export_pvgeo SET "capacity_osm_MWp"=portioned FROM ( 61 | select osm_id, "capacity_osm_MWp", "capacity_osm_MWp"::float/COUNT(osm_id) as portioned from tmp_export_pvgeo where osm_id>0 and "capacity_osm_MWp">0 group by osm_id, "capacity_osm_MWp" 62 | ) as portiontable WHERE tmp_export_pvgeo.osm_id=portiontable.osm_id; 63 | UPDATE tmp_export_pvgeo SET "capacity_osm_MWp"=NULL WHERE "capacity_osm_MWp"=0; -- proper NA entries. 64 | 65 | 66 | \copy "tmp_export_pvgeo" TO '../data/exported/ukpvgeo_points.csv' WITH DELIMITER ',' CSV HEADER; 67 | 68 | -- deactivated: cluster-only export. 69 | -- \copy (SELECT * FROM tmp_export_pvgeo WHERE (osm_id = osm_cluster_id OR osm_cluster_id IS NULL OR osm_id IS NULL) AND (repd_id = repd_cluster_id OR repd_cluster_id IS NULL OR repd_id IS NULL)) TO '../data/exported/ukpvgeo_points_merged_deduped_osm-repd_clusters.csv' WITH DELIMITER ',' CSV HEADER; 70 | 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Solar Power Mapping Data 2 | 3 | This software combines datasets about solar panels (photovoltaic "PV" panels), and combines them into a harmonised data source which can be used for machine vision, short-term solar forecasting and other applications. 
4 | 5 | The goal is to produce a large and rich dataset of PV, which is fully open data. 6 | 7 | This software was developed under the Alan Turing Institute's Climate Action project grant R-SPES-115: "Enabling worldwide solar PV nowcasting via machine vision and open data". 8 | 9 | 10 | ## Overview of the directory structure 11 | 12 | ``` 13 | . 14 | |-- admin -- project process and planning docs 15 | |-- data 16 | | |-- as_received -- downloaded data files 17 | | |-- raw -- manually edited files (replace dummy data) 18 | | |-- processed 19 | |-- db -- database creation 20 | |-- doc -- documentation 21 | |-- explorations -- exploratory work 22 | `-- notebooks 23 | ``` 24 | 25 | 26 | ## Data 27 | 28 | Data is held in three directories: `as_received` contains the data precisely as 29 | downloaded from its original source and in its original format; `raw` contains 30 | data that has been manually restructured or reformatted to be suitable for use by 31 | software in the project (see "Using this repo" header below). `processed` contains data that may have been processed in some way, such as by Python code, but is still thought of as “source” data. 32 | 33 | The following sources of data are used: 34 | 35 | - OpenStreetMap - [Great Britain download (Geofabrik)](https://download.geofabrik.de/europe/great-britain.html). 36 | - [OSM data types](https://wiki.openstreetmap.org/wiki/Elements) 37 | - [Solar PV tagging](https://wiki.openstreetmap.org/wiki/Tag:generator:source%3Dsolar) 38 | - [FiT](https://www.ofgem.gov.uk/environmental-programmes/fit/contacts-guidance-and-resources/public-reports-and-data-fit/installation-reports) - Report of installed PV (and other tech including wind). 100,000s entries. 39 | - [REPD](https://www.gov.uk/government/publications/renewable-energy-planning-database-monthly-extract) - Official UK data from the "renewable energy planning database". It contains large solar farms only. 40 | - Machine Vision dataset - supplied by Descartes labs (Oxford), not publicly available yet. 41 | 42 | ## Project outcome 43 | 44 | This repo includes a set of scripts that will take 45 | input datasets (REPD, OSM, FiT and machine vision – each in diff format), 46 | perform data cleaning/conversion, populate a PostgreSQL database, perform 47 | grouping of data where necessary (there are duplicate entries in REPD, multiple solar farm 48 | components in OSM) and then match entries between the data tables, based on the 49 | matching criteria we have come up with. 50 | 51 | The database creation and matching scripts should work with newer versions of the source data files, or at least do so with minimal changes to the data processing (see "Using this repo" below). 52 | 53 | The result of matching is a table in the database called `matches` that links the unique identifiers of the 54 | data tables. This also contains a column called `match_rule`, which refers to the method by which the match was determined, as documented in [doc/matching](doc/matching.md). 55 | 56 | ## Using this repo 57 | 58 | First, download (or clone) this repository. 59 | 60 | ### Install requirements 61 | 62 | 1. Install [PostgreSQL](https://www.postgresql.org/download/) with [PostGIS](http://postgis.net/install/) 63 | 2. Install Python 3 (version 3.7 or later) and `pip` 64 | 3. Run `pip install -r requirements.txt` 65 | 4. Install [Osmium](https://osmcode.org/osmium-tool/) 66 | 67 | ### Download and prepare data files 68 | 69 | 1. Download the following data files from the internet and store locally. 
We recommend saving these original data files within the directory structure under `data/as_received`: 70 | - OSM PBF file (GB extract): [Download](https://download.geofabrik.de/europe/great-britain-latest.osm.pbf) 71 | - FiT reports: Navigate to [ofgem](https://www.ofgem.gov.uk/environmental-programmes/fit/contacts-guidance-and-resources/public-reports-and-data-fit/installation-reports) and click the link for the latest Installation Report (during the Turing project, 30 September 2019 was used), then download the main document AND subsidiary documents 72 | - REPD CSV file: [Download](https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/879414/renewable-energy-planning-database-march-2020.csv) - this is always the most up to date version 73 | - Machine Vision dataset: supplied by Descartes labs (Oxford), not publicly available yet. 74 | 2. Navigate to `data/raw` and type `make` - this will convert some of the downloads into other file formats ready for further processing. 75 | - Note that the OpenStreetMap data will have been processed into a file `osm.csv`. If you do not need to do any merging/clustering, you could use this file directly, as a simplified extract of OSM solar PV data. 76 | 3. Carry out manual edits to the data files, as described in [doc/preprocessing](doc/preprocessing.md), editing the file copies in `data/raw` under the names suggested by the doc. 77 | 4. Navigate to `data/processed` and type `make` - this will create versions of the data files ready for import to PostgreSQL 78 | 79 | ### Run the database creation and data matching 80 | 81 | 5. Make sure you have PostgreSQL on your machine, then run the command: `createdb hut23-425 "Solar PV database matching"` - this creates the empty database. 82 | 6. Navigate to `db` and run the command `psql -f make-database.sql hut23-425` - this populates the database (see [doc/database](doc/database.md)), carries out some de-duplication of the datasets and performs the matching procedure (see [doc/matching](doc/matching.md)). Note: this may take several minutes. 83 | 84 | Note that the above commands require you to have admin rights on your PostgreSQL server. On standard Debian-based machines you could prepend the commands with `sudo -u postgres`, or you could assign privileges to your own user account. 85 | 86 | ### Export data from the database 87 | 88 | The data tables in PostgreSQL can be used for further analysis. To make a data "snapshot" we export back out again: 89 | 90 | 7. Navigate to `db` and run the command `psql -f export.sql hut23-425` 91 | 8. Navigate to `data/exported` and run `make`. Note: this may take several minutes. 92 | 93 | You can also run the statistical analysis and plotting -- however, this relies on some external data files such as GSP regions and LSOA regions. The file `analyse_exported.py` makes use of some local file paths (in `data/other`, not in the public source code). To do the additional plotting+stats, in `data/exported` run `make all`. 94 | 95 | As a result of this, you should have a CSV and a GeoJSON file representing the harmonised data exported from the local database. 
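For quick reference, the numbered steps above boil down to the following command sequence (a condensed sketch, run from the repository root, assuming the downloads of step 1 are in place and your user has the necessary PostgreSQL privileges):

```bash
(cd data/raw && make)                             # step 2: convert the downloads
# step 3: manual edits, as described in doc/preprocessing.md
(cd data/processed && make)                       # step 4: produce import-ready CSVs
createdb hut23-425 "Solar PV database matching"   # step 5: create the empty database
(cd db && psql -f make-database.sql hut23-425)    # step 6: populate, de-duplicate, match
(cd db && psql -f export.sql hut23-425)           # step 7: export snapshot CSVs
(cd data/exported && make)                        # step 8: build the GeoJSON ("make all" adds plots)
```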
96 | -------------------------------------------------------------------------------- /data/exported/export_geometries.py: -------------------------------------------------------------------------------- 1 | # export_geometries.py 2 | # Dan Stowell April 2020 3 | 4 | # This script takes as input: 5 | # (1) a CSV file as produced by db/export.sql 6 | # (2) a GeoJSON file produced by taking our filtered OSM extract of UK PV 7 | # and unifies them into a GeoJSON tagged with our PV data. 8 | 9 | # How to convert the OSM extract to GeoJSON, using ogr2ogr on a linux commandline: 10 | # rm data/exported/osm_layers_merged.geojson 11 | # for layer in points lines multilinestrings multipolygons other_relations; 12 | # do echo "Extracting layer $layer"; 13 | # ogr2ogr -f GeoJSON -update -append -addfields data/exported/osm_layers_merged.geojson data/exported/osm-gb-solaronly.osm.pbf $layer -nln merged; 14 | # done 15 | 16 | 17 | 18 | import csv, os 19 | import numpy as np 20 | import pandas as pd 21 | import geopandas as gpd 22 | from shapely.geometry import Point, Polygon, MultiPolygon, LineString, GeometryCollection 23 | 24 | pvexportfpath = 'ukpvgeo_points.csv' 25 | geometryfpath = '../raw/osm-gb-solaronly.geojson' 26 | 27 | 28 | # load ukpvgeo_all.csv to df 29 | inttype = pd.Int64Dtype() 30 | df = pd.read_csv(pvexportfpath, dtype={'repd_id':inttype, 'osm_id':inttype, 'repd_cluster_id':inttype, 'osm_cluster_id':inttype, 'num_modules':inttype, 'orientation':inttype}) 31 | # load osm geojson to gdf 32 | gdf = gpd.read_file(geometryfpath) 33 | 34 | # summarise the geometries loaded 35 | print("Loaded GeoJSON source, with the following geometry objects:") 36 | print(gdf.geom_type.value_counts()) 37 | 38 | # delete columns that we don't care about. for example: the other_tags from gdf 39 | # DON'T YET delete the lat lon from csv. 40 | for colname in ['other_tags', 'barrier', 'man_made', 'highway', 'landuse', 'building', 'tourism', 'amenity', 'shop', 'natural', 'leisure', 'sport', 'z_order', 'type']: 41 | if colname in gdf: 42 | del gdf[colname] 43 | 44 | # csv: check stats on osm_id, osm_way_id and their co-occurrence --- then merge the columns 45 | # cute row selection: gdf[~gdf.barrier.isna() & ~gdf.osm_id.isna()] 46 | print("GDF columns:") 47 | print(gdf.columns) 48 | assert ( gdf.osm_id.isna() & gdf.osm_way_id.isna()).sum()==0, "Violated expectation that no GeoJSON object can have BOTH osm_id and osm_way_id" 49 | assert (~gdf.osm_id.isna() & ~gdf.osm_way_id.isna()).sum()==0, "Violated expectation that every GeoJSON object must have osm_id or osm_way_id" 50 | gdf.osm_id.update(gdf.osm_way_id) # coalesce in-place - this impl assumes no overlap 51 | del gdf['osm_way_id'] 52 | # now convert the osm_id to the same data type as in the other dataset 53 | gdf["osm_id"] = pd.to_numeric(gdf["osm_id"]) 54 | gdf = gdf.astype({'osm_id': inttype}) 55 | 56 | # rename some columns to clarify their origin 57 | gdf = gdf.rename(columns={'name':'osm_name'}) 58 | 59 | #################################################################### 60 | # convert lines into polygons 61 | # https://gis.stackexchange.com/questions/321004/iterating-features-and-buffering-using-geopandas 62 | 63 | def convert_line_to_polygon(obj): 64 | "NOTE: returns false if no conversion possible/needed" 65 | coords = obj.coords 66 | closed = coords[0]==coords[-1] 67 | if not closed: 68 | print("Warning, unclosed way (has %i points), consider inspecting it." 
% len(coords)) 69 | return False 70 | else: 71 | return Polygon(coords) 72 | 73 | geomconverted = {'LineString': 0, 'GeometryCollection_LineString': 0, 'nonosm_Point': 0} 74 | 75 | for index, row in gdf[gdf.geom_type=='LineString'].iterrows(): 76 | newgeom = convert_line_to_polygon(row.geometry) 77 | if not newgeom: 78 | print(row.osm_id) 79 | else: 80 | gdf.loc[index, 'geometry'] = newgeom 81 | geomconverted['LineString'] += 1 82 | 83 | # TODO I can't correctly update the geomcoll for some reason 84 | if False: 85 | for index, row in gdf[gdf.geom_type=='GeometryCollection'].iterrows(): 86 | print("%s %i, OSM id %i:" % (gdf.geom_type[index], index, row.osm_id)) 87 | print([item.geom_type for item in row.geometry]) 88 | 89 | for gindex, item in enumerate(row.geometry): 90 | print(item) 91 | bignewgeom = [] 92 | if item.geom_type=='LineString': 93 | newgeom = convert_line_to_polygon(item) 94 | if not newgeom: 95 | print(row.osm_id, "entry %i" % gindex) 96 | bignewgeom.append(item) 97 | else: 98 | bignewgeom.append(newgeom) 99 | geomconverted['GeometryCollection_LineString'] += 1 100 | #gdf.loc[index, 'geometry'] = GeometryCollection(bignewgeom) 101 | row.geometry = GeometryCollection(bignewgeom) 102 | gdf.iloc[index] = row 103 | 104 | print("Converted geometries. Converted: %s. Results:" % str(geomconverted)) 105 | print(gdf.geom_type.value_counts()) 106 | 107 | ######################################################################### 108 | # perform a join -- a right join, to capture the REPD items with no osmid 109 | udf = gdf.merge(df, on='osm_id', how='right') 110 | 111 | print("Items with osmid and no geom (will be dropped, assumed merged into a relation's multipol): %i" % (udf.geometry.isna() & ~udf.osm_id.isna()).sum()) 112 | udf.drop(udf[udf.geometry.isna() & ~udf.osm_id.isna()].index, inplace=True) 113 | print("Items with osmid and no geom (now dropped): %i" % (udf.geometry.isna() & ~udf.osm_id.isna()).sum()) 114 | 115 | print("Items with no geom and no lat-lon (will be dropped, cannot geolocate): %i" % (udf.geometry.isna() & udf.latitude.isna()).sum()) 116 | udf.drop(udf[udf.geometry.isna() & udf.latitude.isna()].index, inplace=True) 117 | 118 | 119 | # the resulting items with no osm_id - are they the ones we expect? is it the same set as the entries with no geom? 120 | print("Merged data.") 121 | print("Items with geometry: %i" % (~udf.geometry.isna()).sum()) 122 | print("Items without geometry: %i" % udf.geometry.isna().sum()) 123 | print("Items with osmid : %i" % (~udf.osm_id.isna()).sum()) 124 | print("Items without osmid : %i" % udf.osm_id.isna().sum()) 125 | 126 | # the resulting items with no osm_id - assert no geom, add point geom from lat+lon 127 | assert (~udf[udf.osm_id.isna()].geometry.isna()).sum()==0, "Rows with no osm_id should also have no geometry" 128 | 129 | for index, row in udf[udf.osm_id.isna()].iterrows(): 130 | udf.loc[index, 'geometry'] = Point(row['longitude'], row['latitude']) 131 | geomconverted['nonosm_Point'] += 1 132 | 133 | print("Created geometries for non-osm points. Converted: %i. 
Results:" % geomconverted['nonosm_Point']) 134 | print(gdf.geom_type.value_counts()) 135 | 136 | assert udf.geometry.isna().sum()==0, "After creating lat-lon points, no-one should have null geometry: we have %i" % udf.geometry.isna().sum() 137 | 138 | # near end: delete columns latitude, longitude -- these were derived from the geometry anyway 139 | for colname in ['latitude', 'longitude']: 140 | if colname in udf: 141 | del udf[colname] 142 | 143 | ######################################################################### 144 | print("===========================================================================") 145 | print("Finished filtering and merging CSV and GeoJSON data.") 146 | print(udf.describe(exclude=gpd.array.GeometryDtype)) 147 | 148 | # write file out 149 | udf.to_file("ukpvgeo_geometries.geojson", driver='GeoJSON') 150 | 151 | -------------------------------------------------------------------------------- /doc/database.md: -------------------------------------------------------------------------------- 1 | % The hut23-425 database 2 | 3 | This note documents the creation of the `hut23-425` database and the initial 4 | deduplication of the data. 5 | 6 | We use an "Extract-Load-Transform" methodology: Tables of the source datasets 7 | are first uploaded (from the `data/processed` directory) into the schema `raw` 8 | in the database, then post-processed in the database and saved as tables in the 9 | default schema. (Some tables do not require post-processing and are uploaded 10 | directly into the default schema.) In this note, we assume that the 11 | pre-processing has already taken place. 12 | 13 | In summary, the final tables created, and row counts as of commit `0d8d38b` are: 14 | 15 | | Table | Row count | Master entity field | Master entity count | 16 | |-----------------------|-----------|---------------------|---------------------| 17 | | `osm` | 126,939| `master_osm_id` | 119,427| 18 | | `repd` | 1,986| `master_repd_id` | 1,736 | 19 | | `fit` | 863,079| | | 20 | | `machine_vision` | 2,221| | | 21 | | `osm_repd_id_mapping` | 933 | | | 22 | 23 | (In the above table, the master entity count is the number of the distinct 24 | entries in the named column) 25 | 26 | # 1. Database creation scripts 27 | 28 | These scripts assume the existence of a local Postgres installation containing a 29 | database called `hut23-425`. To create the database, run: 30 | 31 | ```bash 32 | createdb hut23-425 "Solar PV database matching" 33 | ``` 34 | 35 | ## Upload of source data 36 | 37 | SQL code to create the tables and populate the database is in the `db/` 38 | directory. To create the complete database, change to that directory and run: 39 | 40 | ```bash 41 | psql -f make-database.sql hut23-425 42 | ``` 43 | 44 | This script will in turn call `osm.sql`, `repd.sql`, `fit.sql`, `mv.sql`, 45 | `match-osm-repd.sql`, and `dedup-osm.sql` which create and populate the data 46 | tables from the respective source data in `../data/processed/` (adding a small 47 | number of additional columns) and then postprocessing. Note that the schema 48 | `raw` is used as a staging area for certain tables where it is necessary to do 49 | some postprocessing. After postprocessing the working tables will be in the 50 | default schema. 51 | 52 | - `raw.osm`: The raw OSM data. 53 | - `raw.repd`: The raw REPD data. 54 | - `repd`: The REPD data, restricted to solar PV technologies. 55 | - `osm`: The OSM data, de-duplicated. 56 | - `fit`: The FiT data. 57 | - `machine_vision`: The machine vision data. 
58 | 59 | A field, `area`, is added to the `fit` table, containing an estimate of the area 60 | of the solar panel(s) based on the declared net capacity. 61 | 62 | The tables `osm`, `repd`, and `machine_vision` include a latitude and longitude 63 | for each installation. An additional field `location` is added to these tables 64 | containing these coordinates converted to a Postgis point. 65 | 66 | The `repd` table has been restricted to those installations whose technology 67 | type is “solar Photovoltaics” and whose development status is “Operational.” 68 | 69 | ## Primary keys for the uploaded data 70 | 71 | ### FiT: `row_id` 72 | 73 | We presume each row of the FiT data denotes an individual installation. However, 74 | there is no defined primary key for this dataset. To allow us to reference the 75 | original rows later an index is added to the dataset between `raw` and 76 | `processed`. 77 | 78 | ### REPD: `repd_id` 79 | 80 | The source data contains a unique identifier, `Ref ID`, for each 81 | installation. This field has been renamed to `repd_id` and used as the primary 82 | key. 83 | 84 | ### OSM: `osm_id` 85 | 86 | The OSM data has a unique identifier, `id`, for each row. This field has been 87 | renamed `osm_id` and used as the primary key but note that it does not 88 | necessarily represent a unique installation. 89 | 90 | ### Machine Vision: `mv_id` 91 | 92 | We have added a row identifier, `mv_id`, to the pre-processed machine vision dataset. 93 | 94 | # 2. Preliminary matching between OSM and REPD 95 | 96 | The table `osm_repd_id_mapping(osm_id, repd_id)` maps OSM identifiers to REPD 97 | identifiers. 98 | 99 | The entries in the OSM dataset were tagged (in the original data) with zero 100 | or more REPD identifiers. These are present in the field `repd_id_str` as a 101 | semicolon-separated list. The script `match-osm-repd.sql` “un-nests” these 102 | identifiers as a set of rows matched to the corresponding `osm_id`. 103 | 104 | # 3. Deduplication of the OSM dataset 105 | 106 | An OSM entry `objtype` can be one of `relation`, `way`, or `node`. In the case 107 | of a `relation`, there may be several other entries classified as `way` that are 108 | actually the components of the `relation`, all of which refer to a single PV 109 | installation. There may also be several `way`s that are part of the same 110 | installation even though there is no unifying `relation`. 111 | 112 | The script `dedup-osm.sql` identifies groups of objects in the OSM data that are 113 | likely part of the same installation. An extra column, `master_osm_id` is added 114 | to the `osm` table; this column contains a unique `osm_id` for each object in 115 | a single cluster. (The particular `osm_id` used has no significance.) 116 | 117 | ## Using the `plantref` field 118 | 119 | Some of the OSM objects have already been tagged as being part of the same 120 | installation. These are indicated by an entry in the field `plantref` of the 121 | form `way/123456789` where the digits indicate another `osm_id`. If this field 122 | is non-`NULL`, the number is copied across to `osm_master_id`. 123 | 124 | ## Using proximity 125 | 126 | The remainder of the script identifies pairs of installations that are within 127 | 300 metres of each other; it then extends this relation to an equivalence 128 | relation and tags objects that are equivalent to each other with a common 129 | `master_osm_id`. (In fact, the tag is the greatest `osm_id` from the group but 130 | this choice is simply for convenience.) 
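The core of this step looks roughly like the following (a simplified sketch of `db/dedup-osm.sql`; the real script uses an area-adaptive distance threshold rather than a flat 300 m, and first filters out nodes, rooftop objects, and objects that already have a `master_osm_id`):

```sql
WITH RECURSIVE parts AS (
  -- pairs of objects within the distance threshold of each other
  SELECT a.osm_id AS osm_id1, b.osm_id AS osm_id2
  FROM osm a, osm b
  WHERE a.osm_id <> b.osm_id
    AND a.location::geography <-> b.location::geography < 300
),
clusters AS (
  -- transitive closure: everything reachable through a chain of nearby pairs
  SELECT osm_id1, osm_id2 FROM parts
  UNION
  SELECT c.osm_id1, p.osm_id2
  FROM clusters c JOIN parts p ON c.osm_id2 = p.osm_id1
)
SELECT osm_id1 AS osm_id, max(osm_id2) AS master_osm_id
FROM clusters
GROUP BY osm_id1;
```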
131 | 132 | ### Technical note 133 | 134 | The relation that contains parts of objects within 300m of each other is clearly 135 | a symmetric relation but it is not necessarily transitive. (It may be the case 136 | that A and B are within 300m of each other and B and C are within 300m of each 137 | other but A is more than 300m from C.) To extend the proximity relation to an 138 | equivalence relation we form the transitive closure of the proximity relation. 139 | 140 | Taking the transitive closure is acheived in SQL through the use of a "recursive 141 | common table expression" (recursive CTE). In the script, it is the query that 142 | begins “`WITH RECURSIVE ...`”. The primary use of recursive CTEs is, in fact, to 143 | compute transitive closures. 144 | 145 | 146 | # 4. Deduplication of the REPD dataset 147 | 148 | The REPD dataset also contains objects that are close enough in both proximity 149 | and name that we believe they are likely to be part of the same installation. 150 | 151 | Deduplication proceeds in a similar manner to the OSM database. We use a 152 | slightly larger distance threshold (1380 m) but include a measure of similarity 153 | between the installation names using Postgres' `similarity` function. In 154 | addition, prior to computing the similarity of names, we “normalise” the names 155 | to remove certain common words (such as “farm”). 156 | 157 | As with the OSM data, a new field, `master_repd_id` is added to the `repd` table 158 | that is non-`NULL` and unique for sites that are believed to be the same site. 159 | -------------------------------------------------------------------------------- /data/raw/compile_osm_solar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Script to parse an OSM XML extract for solar PV data. 4 | # Dan Stowell, 2019-2020. 5 | 6 | import os, sys, csv, subprocess, re 7 | from functools import reduce 8 | from xml import sax 9 | import numpy as np 10 | 11 | from matplotlib.path import Path 12 | from scipy.spatial import KDTree as kdtree 13 | 14 | import pandas as pd 15 | from matplotlib.backends.backend_pdf import PdfPages 16 | import matplotlib.pyplot as plt 17 | 18 | ############################################ 19 | # User configuration: 20 | 21 | # do_osmium = True # if False, it won't try to filter the input OSM file 22 | # osmsourcefpath = os.path.expanduser('~/osm/great-britain/great-britain-200729.osm.pbf') 23 | 24 | do_osmium = False 25 | osmsourcefpath = "osm-gb-solaronly.xml" 26 | 27 | # The extract must ALREADY have been processed by osmium to filter down to just the generator:method=photovoltaic items. 28 | # Here's what I do: 29 | # osmium tags-filter ~/osm/great-britain/great-britain-200729.osm.pbf generator:method=photovoltaic plant:method=photovoltaic plant:source=solar -o ~/osm/solarsearch/gb-solarextracts/gb-200729-solar-withreferenced.xml 30 | # In the public scripts, this is all included in the work done by the makefile 31 | 32 | 33 | ############################################ 34 | # Helper functions: 35 | 36 | earthradius = 6364380.0 # earth radius (m) at Manchester 37 | degrees_to_metres = 2 * np.pi * earthradius / 360 38 | def angular_area_to_sqm(anga, latitude_deg): 39 | """Use simple angular calculation to approximately convert angular areas (calculated from GPS coords) to areas in square metres. 40 | WARNING: APPROXIMATE. 
Uses spherical assumptions, and does not know about tilt of objects.""" 41 | latitude_correction = np.cos(np.radians(latitude_deg)) # 53.4427 42 | return degrees_to_metres * degrees_to_metres * latitude_correction * anga 43 | 44 | def PolyArea(x,y): 45 | "Calculate area of a polygon from its coordinates (shoelace formula)" 46 | return 0.5*np.abs(np.dot(x,np.roll(y,1))-np.dot(y,np.roll(x,1))) # https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates 47 | 48 | def guess_kilowattage(anobj): 49 | "This should NOT NORMALLY BE USED since it is really only a rule of thumb." 50 | if not anobj['calc_area']: 51 | return 1. 52 | else: 53 | return anobj['calc_area'] * 0.15 54 | 55 | regex_numbers_semicolon_start = re.compile('^([0-9;]*).*') 56 | 57 | compasspoints = { 58 | 'N': 0, 59 | 'NNE': 22.5, 60 | 'NE': 45, 61 | 'ENE': 67.5, 62 | 'E': 90, 63 | 'ESE': 112.5, 64 | 'SE': 135, 65 | 'SSE': 157.5, 66 | 'S': 180, 67 | 'SSW': 202.5, 68 | 'SW': 225, 69 | 'WSW': 247.5, 70 | 'W': 270, 71 | 'WNW': 292.2, 72 | 'NW': 315, 73 | 'NNW': 337.5, 74 | # unconventional but seen in data: 75 | 'NORTH': 0, 76 | 'NORTHEAST': 45, 77 | 'NORTH_EAST': 45, 78 | 'EAS': 90, # TEMPORARY fix for wonky entry 79 | 'EAST': 90, 80 | 'SOUTHEAST': 135, 81 | 'SOUTH_EAST': 135, 82 | 'SOUTH': 180, 83 | 'SOUTHWEST': 225, 84 | 'SOUTH_WEST': 225, 85 | 'WEST': 270, 86 | 'NORTHWEST': 315, 87 | 'NORTH_WEST': 315, 88 | } 89 | 90 | ############################################################################## 91 | # The main routine, which progressively reacts to XML content as it is loaded: 92 | 93 | class SolarXMLHandler(sax.handler.ContentHandler): 94 | """Parses solar PV data from OSM XML. After this has finished, the 'objs' member is a list of processed PV objects (panels as well as plants). 95 | Note that after the initial parse of the XML, you then need to call postprocess() which will propagate information down from relation-containment and geographic-containment.""" 96 | def __init__ (self): 97 | sax.handler.ContentHandler.__init__(self) 98 | self.curitem = None 99 | self.objs = [] 100 | # these value-stores are for intermediate processing of relationships (e.g. the parent way/rel to know their own accumulated contents) - they are not used as the output data. 
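        # (Illustrative shape of these caches once parsing is done: an entry such as
        #  self.waydata['123'] holds {'lat': ..., 'lon': ..., 'outlinepath': Path(...),
        #  'calc_area': ...}, filled in by endElement() below; keys are the OSM id strings.)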
101 | self.nodedata = {} 102 | self.waydata = {} 103 | self.reldata = {} 104 | 105 | def startElement (self, name, attrs): 106 | if name in ['node', 'way', 'relation']: # start a new "object" with empty tags 107 | self.curitem = {'id': attrs['id'], 'timestamp': attrs['timestamp'], 'user': attrs.get('user', ''), 'tags':{}, 'objtype': name} 108 | if name == 'node': 109 | self.curitem['lat'] = float(attrs['lat']) 110 | self.curitem['lon'] = float(attrs['lon']) 111 | elif name == 'way': 112 | self.curitem['nodes'] = [] 113 | elif name == 'relation': 114 | self.curitem['nodes'] = [] 115 | self.curitem['ways'] = [] 116 | self.curitem['relations'] = [] 117 | elif name == 'tag': 118 | self.curitem['tags'][attrs['k']] = attrs['v'] 119 | elif name == 'nd': # This is a node reference from within a way 120 | self.curitem['nodes'].append(attrs['ref']) 121 | elif name == 'member': # This is a node/way/rel reference from within a relation 122 | if attrs['type'] == 'way': 123 | self.curitem['ways'].append({'ref':attrs['ref'], 'role':attrs['role']}) 124 | elif attrs['type'] == 'relation': 125 | self.curitem['relations'].append({'ref':attrs['ref'], 'role':attrs['role']}) 126 | elif attrs['type'] == 'node': 127 | self.curitem['nodes'].append({'ref':attrs['ref'], 'role':attrs['role']}) 128 | else: 129 | raise ValueError("This script does not know how to handle the following member-type found in relation %s: '%s'" % (self.curitem['id'], attrs['type'])) 130 | 131 | def endElement(self, name): 132 | if name in ['node', 'way', 'relation']: # finish off, and store, the "object" 133 | curitem = self.curitem # for speed 134 | 135 | if ('highway' in curitem['tags']) and (curitem['tags']['highway']=='turning_circle'): 136 | raise ValueError("Data contains highway=turning_circle items. Are you sure this OSM data has been preprocessed to strip it down only to the photovoltaics?") 137 | 138 | # Cache the object data for use in lookups etc (NOT for output, these data structures - actually "curitem" is where the data for output live.) 
139 | datacache = { 140 | 'node': self.nodedata, 141 | 'way': self.waydata, 142 | 'relation': self.reldata, 143 | }[name] 144 | if curitem['id'] in datacache: 145 | raise ValueError("datacache seems to encounter a duplicate item: type %s, id %i" % (name, curitem['id'])) 146 | datacache[curitem['id']] = {} 147 | datacacheitem = datacache[curitem['id']] 148 | 149 | if curitem['objtype']=='node': 150 | datacacheitem['lat'] = curitem['lat'] 151 | datacacheitem['lon'] = curitem['lon'] 152 | curitem['calc_area'] = 0 # TODO there may be a tag telling you the area; else, as a node, it's useful to make clear we have no area estimate 153 | 154 | elif curitem['objtype']=='way': 155 | latlist = [self.nodedata[nodeid]['lat'] for nodeid in curitem['nodes']] 156 | lonlist = [self.nodedata[nodeid]['lon'] for nodeid in curitem['nodes']] 157 | datacacheitem['outlinepath'] = Path([_ for _ in zip(latlist, lonlist)]) 158 | # calculate its centroid as mean(nodes) 159 | curitem['lat'] = np.mean(latlist) 160 | curitem['lon'] = np.mean(lonlist) 161 | datacacheitem['lat'] = curitem['lat'] 162 | datacacheitem['lon'] = curitem['lon'] 163 | # calculate its area using PolyArea 164 | curitem['calc_area'] = PolyArea(latlist, lonlist) 165 | # convert to square metres 166 | curitem['calc_area'] = round(angular_area_to_sqm(curitem['calc_area'], curitem['lat']), 1) 167 | datacacheitem['calc_area'] = curitem['calc_area'] 168 | 169 | elif curitem['objtype']=='relation': 170 | datacacheitem['relations'] = curitem['relations'] 171 | datacacheitem['ways'] = curitem['ways'] 172 | datacacheitem['nodes'] = curitem['nodes'] 173 | datacacheitem['calc_area'] = 0 174 | # NB most of the relationship-handling comes at the end in postprocess() 175 | 176 | #################################### 177 | # Processing tags to log a PV object 178 | if (curitem['tags'].get('power')=='generator' and curitem['tags'].get('generator:method')=='photovoltaic') \ 179 | or (curitem['tags'].get('power')=='plant' and (curitem['tags'].get('plant:method')=='photovoltaic' or curitem['tags'].get('plant:source')=='solar')): 180 | 181 | for k,v in curitem['tags'].items(): 182 | # In here we aim to process ALL KNOWN tags, including irrelevant ones, so that no information is lost 183 | ok = True 184 | if k=='power': 185 | curitem['tag_power'] = v 186 | elif k in ['generator:output:electricity', 'plant:output:electricity']: 187 | v = v.replace(",", ".").upper() 188 | if v in ['YES', 'SMALL_INSTALLATION']: 189 | pass 190 | elif v.endswith(" W"): 191 | curitem['calc_capacity'] = float(v[:-2]) * 0.001 192 | elif v.endswith(" KW"): 193 | curitem['calc_capacity'] = float(v[:-3]) 194 | elif v.endswith(" MW"): 195 | curitem['calc_capacity'] = float(v[:-3]) * 1000 196 | elif v.endswith(" MWP"): # NB probably not possible to make use of difference between MW and MWp 197 | curitem['calc_capacity'] = float(v[:-3]) * 1000 198 | elif v.endswith("KW"): 199 | curitem['calc_capacity'] = float(v[:-2]) 200 | elif v.endswith("MW"): 201 | curitem['calc_capacity'] = float(v[:-2]) * 1000 202 | else: 203 | ok = False 204 | elif k in ['location', 'generator:place', 'generator:location']: 205 | if v=='rooftop': 206 | v = 'roof' 207 | curitem['location'] = v 208 | elif k in ['source', 'source:geometry']: 209 | curitem['source_obj'] = v.replace(",", ";") 210 | elif k in ['source:generator:output:electricity', 'source:plant:output:electricity', "source:power:output", "source:output", "source:power"]: 211 | if v=='REPD Open Data': 212 | v = 'repd' 213 | curitem['source_capacity'] = v.replace(",", 
";") 214 | elif k=='note': 215 | if v=='roof household': 216 | curitem['location'] = 'roof' 217 | else: 218 | pass # TODO check all unhandled "note", see if they're OK 219 | elif k=='notional_area': 220 | if curitem['calc_area'] != 0: 221 | #TODO consider: print(" WARNING: skipping notional_area for %s %s because calc_area already filled in" % (curitem['objtype'], curitem['id'])) 222 | pass 223 | if v.endswith(' sq m'): 224 | try: 225 | curitem['calc_area'] = float(v[:-5].replace(',', '.', )) 226 | except: 227 | print("Couldn't handle this notional_area: " + v) 228 | else: 229 | ok = False 230 | elif k in ['direction', 'generator:orientation', 'orientation']: 231 | v = v.replace('`', '').upper() # some people write it this way 232 | if v in compasspoints: 233 | curitem['orientation'] = compasspoints[v] 234 | elif v in ['ESW']: 235 | pass # uninterpretable case seen in data... so skip 236 | elif v in ['FLAT']: 237 | pass # flat-mounted items have no orientation 238 | else: 239 | try: 240 | curitem['orientation'] = int(v) # NB there could of course be parse failures here 241 | except ValueError: 242 | print("Un-parseable orientation value in %s %s: %s=%s" % (curitem['objtype'], curitem['id'], k, v)) 243 | elif k=='pv_module_array': 244 | splitvals = v.split(" by ") 245 | if len(splitvals)==2: 246 | curitem['generator:solar:modules'] = int(splitvals[0]) * int(splitvals[1]) 247 | else: 248 | ok = False 249 | elif k in ['modules', 'generator:solar:modules', 'generator:modules']: 250 | if v=='unknown': 251 | v='' 252 | else: 253 | # It's quite common (for some UI reason) to get typos in this field with characters pasted after the initial digits. We trim them off. 254 | v = regex_numbers_semicolon_start.sub('\\1', v) 255 | if ';' in v: 256 | v = str(reduce(lambda a, b: a+b, map(int, v.split(';')))) # entries could be e.g. 
"7;5;2" and here we reduce them to a single integer sum 257 | curitem['generator:solar:modules'] = v 258 | #elif k=='': 259 | #elif k=='': 260 | #elif k=='': 261 | #elif k=='': 262 | #elif k=='': 263 | #elif k=='': 264 | #elif k=='': 265 | elif k in [ 266 | # Here are ALL the ones we copy into the output data unedited 267 | 'start_date', 268 | 'repd:id', 269 | ]: 270 | curitem['tag_%s' % k] = v 271 | elif k in [ 272 | # Here are ALL the ones we don't need information from: 273 | 'generator', 274 | 'generator:source', 275 | 'generator:method', 276 | 'generator:type', 277 | 'generator:output', 278 | 'generator:note', 279 | 'generator:strings', 280 | 'generator:output:biogas', 281 | 'generator:output:hot_water', 282 | 'generator:plant', 283 | 'plant:source', 284 | 'plant:method', 285 | 'plant:type', 286 | 'power_source', 287 | 'note:generator:output:electricity', 288 | 'voltage', 289 | 'fixme', 290 | 'earliest_start_date', 291 | 'latest_start_date', 292 | 'amenity', 293 | 'capacity', 294 | 'floating', 295 | 'area', 296 | 'ref', 297 | 'website', 298 | 'alt_name', 299 | 'url', 300 | 'landcover', 301 | 'email', 302 | 'fax', 303 | 'postal_code', 304 | 'phone', 305 | 'site', 306 | 'surface', 307 | 'notes', 308 | 'industrial', 309 | 'listed_status', 310 | 'survey_date', 311 | 'man_made', 312 | 'barrier', 313 | 'fence_type', 314 | 'height', 315 | 'shop', 316 | 'wheelchair', 317 | 'sport', 318 | 'brand', 319 | 'leisure', 320 | 'frequency', 321 | 'manufacturer', 322 | 'architect', 323 | 'HE_ref', 324 | 'note_2', 325 | 'survey:date', 326 | 'mapillary', 327 | 'natural', 328 | 'construction', 329 | 'farmland', 330 | 'opening_hours', 331 | ] or k.split(':')[0] in [ 332 | 'source', # NB there are lots of different source tags. Here we're suppressing any leftovers, even though we may use specific source tags already. 333 | 'operator', 334 | 'owner', 335 | 'description', 336 | 'contact', 337 | 'name', 338 | 'fence', 339 | 'social_facility', 340 | 'flickr', 341 | 'highway', 342 | 'landuse', 343 | 'fhrs', 344 | 'layer', 345 | 'ref', 346 | 'level', 347 | 'wikimedia_commons', 348 | 'wikidata', 349 | 'wikipedia', 350 | 'geograph', 351 | 'type', # rel multipolygon 352 | 'tourism', 353 | 'building', 354 | 'addr', 355 | 'roof', 356 | 'demolished', 357 | 'indoor', 358 | ]: 359 | pass 360 | else: 361 | ok = False 362 | 363 | if not ok: 364 | astr = "Un-recognised tag in %s %s: %s=%s" % (curitem['objtype'], curitem['id'], k, v) 365 | print(astr) 366 | #raise ValueError(astr) 367 | 368 | # TODO also try to calc_type: rooftop or infarm (location=roof; large size) or unknown 369 | 370 | if curitem['tag_power']=='plant': # We store the power-plant's ID info in a form which will be easy to propagate down to child members 371 | curitem['plantref'] = (curitem['objtype'], curitem['id']) 372 | 373 | # OK now store it - we only need to store top-level PV items, child-nodes etc are not needed except for the data stored elsewhere 374 | self.objs.append(self.curitem) 375 | 376 | 377 | self.curitem = None # NB we need to clear "curitem" in ALL cases where it was a node/way/item, NOT just if it's a PV item processed. 378 | 379 | def postprocess(self): 380 | """ 381 | This MUST be called, once, after the XML has been loaded. 382 | It: 383 | - ensures relation-members know about their parent object (incl. the plantref if parent is plant); 384 | - calculates the total area of each relation; 385 | - calculates the centroid of each relation; 386 | - performs spatial containment queries to label solar panels as members of a plant (i.e. 
plantref) if they're geographically inside them. 387 | """ 388 | # Find all objects that are relations and also plants. Then push down the metadata through their children (only for the temporary data), and also calculate the area for the parents. 389 | rels_postprocessed = 0 390 | self.plantoutlines = [] # {lat, lon, outlinepath, plantref} a list of ways that are either plants themselves, or non-inner members of plant relations; i.e. potential geo containers for panels 391 | for curitem in self.objs: 392 | if curitem['tag_power']=='plant' and curitem['objtype']=='way': 393 | self.plantoutlines.append({'lat': curitem['lat'], 'lon': curitem['lon'], 'outlinepath': self.waydata[curitem['id']]['outlinepath'], 'plantref': curitem['plantref']}) 394 | if curitem['tag_power'] in ['plant', 'generator'] and curitem['objtype']=='relation': 395 | if curitem['tag_power']=='plant': 396 | plantitem = curitem 397 | plantref = ('relation', curitem['id']) 398 | else: 399 | plantitem = None 400 | plantref = None 401 | curitem['calc_area'] = 0 402 | self._recurse_relation_info(curitem, plantitem, plantref) 403 | rels_postprocessed += 1 404 | 405 | print("Postprocessed %i power=* relations" % rels_postprocessed) 406 | print("Plant outlines for geo containment search: %i" % len(self.plantoutlines)) 407 | plantoutlines_kdtree = kdtree([[item['lat'], item['lon']] for item in self.plantoutlines]) 408 | # Now, for every generator object that DOESN'T have a plantref, we find its nearest-neighbour potential-containers and check for containment 409 | for curitem in self.objs: 410 | if curitem['tag_power']=='generator' and not curitem.get('plantref', None): 411 | for distance, arrayposition in zip(*plantoutlines_kdtree.query([curitem['lat'], curitem['lon']], 3, distance_upper_bound=1)): 412 | if distance != np.inf: 413 | if self.plantoutlines[arrayposition]['outlinepath'].contains_point([curitem['lat'], curitem['lon']]): 414 | curitem['plantref'] = self.plantoutlines[arrayposition]['plantref'] 415 | #print(" spatially inferred generator %s/%s belongs to plant %s" % (curitem['objtype'], curitem['id'], curitem['plantref'])) 416 | 417 | if False: # This should NOT NORMALLY be activated. It inserts "guesstimate" power capacities for small-scale solar PV 418 | for curitem in self.objs: 419 | if curitem['tag_power']=='generator' and curitem.get('calc_capacity', 0)==0 and not curitem.get('plantref', None): 420 | curitem['calc_capacity'] = guess_kilowattage(curitem) 421 | 422 | 423 | def _recurse_relation_info(self, curitem, plantitem, plantref): 424 | """Pushes down through relations' members, for two reasons: to compile their areas onto the parent, and to propagate the parent plant reference down to all. 
425 | You will call it with curitem==plantitem for plants, and plantitem=None for gens; then the recursion keeps plantitem fixed and alters the immediate curitem.""" 426 | # first we recurse into the child relations - the ways and rels will then add their area to our plantitem 427 | #print("") 428 | #print("_recurse_relation_info(curitem=%s, plantitem=%s, plantref=%s)" % (curitem, plantitem, plantref)) 429 | for childinfo in curitem['relations']: 430 | therel = self.reldata[childinfo['ref']] 431 | if 'plantref' in therel: 432 | raise ValueError("Suspicious recursion: while analysing a plant relation (%i) we found a child rel (%i) which already has plantref set: %s" % (plantitem['id'], childid, str(therel['plantref']))) 433 | else: 434 | self._recurse_relation_info(therel, plantitem, plantref) 435 | # now we grab all area info from one-level-down, and also push the plantref down one level 436 | latslist = [] 437 | lonslist = [] 438 | for childtype, childlist, childdatacache in [ 439 | ('node', curitem['nodes'], self.nodedata), 440 | ('way', curitem['ways'], self.waydata), 441 | ('relation', curitem['relations'], self.reldata), 442 | ]: 443 | for childinfo in childlist: # each is a dict with 'ref' and 'role' 444 | childobj = childdatacache[childinfo['ref']] 445 | latslist.append(childobj['lat']) 446 | lonslist.append(childobj['lon']) 447 | multiplier = [1, -1][childinfo['role']=='inner'] # how to subtract inner-areas 448 | curitem['calc_area'] += multiplier * childobj.get('calc_area', 0) 449 | if plantref: 450 | childobj['plantref'] = plantref 451 | #if childtype=='way': 452 | # print(" from way %s we add area %g" % (childinfo['ref'], multiplier * childobj['calc_area'])) 453 | if childtype=='way' and childinfo['role']!='inner' and plantref: 454 | self.plantoutlines.append({'lat': childobj['lat'], 'lon': childobj['lon'], 'outlinepath': self.waydata[childinfo['ref']]['outlinepath'], 'plantref': plantref}) 455 | curitem['lat'] = np.mean(latslist) 456 | curitem['lon'] = np.mean(lonslist) 457 | 458 | 459 | ############## 460 | # let's go! 
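# Main flow: open the OSM input (either a pre-filtered XML extract, or filtered on the fly
# via an osmium subprocess), SAX-parse it with SolarXMLHandler, call postprocess() to
# propagate plant membership, then print summary statistics and write the objects to osm.csv.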
461 | 462 | if do_osmium: 463 | infp = subprocess.Popen(["osmium", "tags-filter", osmsourcefpath, "generator:method=photovoltaic", "plant:method=photovoltaic", "plant:source=solar", "-o", "-", "-f", "xml"], 464 | stdout=subprocess.PIPE).stdout 465 | else: 466 | infp = open(osmsourcefpath, 'rb') 467 | 468 | parser = sax.make_parser() 469 | handler = SolarXMLHandler() 470 | parser.setContentHandler(handler) 471 | parser.parse(infp) 472 | infp.close() 473 | handler.postprocess() 474 | 475 | # find all attribs in use 476 | allattribs = set() 477 | for obj in handler.objs: 478 | allattribs = allattribs.union(set(obj)) 479 | allattribs = allattribs.difference(set(['nodes', 'ways', 'relations', 'tags'])) 480 | 481 | # some overcomplex coding to sort attributes in the way I want 482 | attribstarters = ['objtype', 'id', 'user', 'timestamp', 'lat', 'lon'] 483 | def attribsorter(a): 484 | if a in attribstarters: 485 | return "a_%i" % attribstarters.index(a) 486 | else: 487 | return "b_%s" % a 488 | allattribs = sorted(list(allattribs), key=attribsorter) 489 | 490 | if False: 491 | print() 492 | print("All object attributes in play:") 493 | for anattrib in allattribs: 494 | print(anattrib) 495 | print() 496 | 497 | # output happy stats 498 | osmtotalobjs = len(handler.objs) 499 | print("") 500 | print("####################################################################") 501 | print(os.path.basename(osmsourcefpath)) 502 | print("parsed %i OSM objects (%i nodes, %i ways, %i relations)" % (osmtotalobjs, 503 | len([_ for _ in handler.objs if _['objtype']=='node']), 504 | len([_ for _ in handler.objs if _['objtype']=='way']), 505 | len([_ for _ in handler.objs if _['objtype']=='relation']) 506 | )) 507 | print("") 508 | 509 | 510 | # collect the unique REPD identifiers 511 | repds_used = [] 512 | for item in handler.objs: 513 | if item.get('tag_repd:id', False): 514 | repds_used.extend(item['tag_repd:id'].split(';')) 515 | 516 | readable = "standalone" 517 | subset = [_ for _ in handler.objs if _['tag_power']=='generator' and not _.get('plantref', None)] 518 | print("Solar PV panel items (power=generator) (%s):" % readable) 519 | print(" %i in total" % len([_ for _ in subset])) 520 | print(" %g sq km total surface area" % (1e-6 * np.sum([_['calc_area'] for _ in subset]))) 521 | print(" %g MW total generating capacity (NB metadata will be v incomplete for this)" % (1e-3 * np.sum([_.get('calc_capacity',0) for _ in subset]))) 522 | print(" %i nodes with no sqm tagged (could presume 'domestic', but needs more tagging)" % len([_ for _ in subset if _['calc_area']==0])) 523 | print(" %i areas <= 30 sqm (could presume 'domestic')" % len([_ for _ in subset if 0<_['calc_area']<=30])) 524 | print(" %i areas 30--2000 sqm (could presume 'commercial' or part of array)" % len([_ for _ in subset if 30<_['calc_area']<=2000])) 525 | print(" %i areas > 2000 sqm (inspect to see if should really be tagged 'solar farm')" % len([_ for _ in subset if _['calc_area']>2000])) 526 | 527 | readable = "within a farm" 528 | subset = [_ for _ in handler.objs if _['tag_power']=='generator' and _.get('plantref', None)] 529 | print("Solar PV panel items (power=generator) (%s):" % readable) 530 | print(" %i in total" % len([_ for _ in subset])) 531 | print(" %g sq km total surface area" % (1e-6 * np.sum([_['calc_area'] for _ in subset]))) 532 | print(" %i nodes with no sqm tagged" % len([_ for _ in subset if _['calc_area']==0])) 533 | print(" %i areas <= 30 sqm" % len([_ for _ in subset if 0<_['calc_area']<=30])) 534 | print(" %i areas 
30--2000 sqm" % len([_ for _ in subset if 30<_['calc_area']<=2000])) 535 | print(" %i areas > 2000 sqm (inspect to see if should really be tagged 'solar farm')" % len([_ for _ in subset if _['calc_area']>2000])) 536 | 537 | print("Solar PV farm items (power=plant):") 538 | print(" %i in total" % len([_ for _ in handler.objs if _['tag_power']=='plant'])) 539 | print(" %i have REPD identifier tagged" % len([_ for _ in handler.objs if _['tag_power']=='plant' and _.get('tag_repd:id', False)])) 540 | print(" (%i REPD identifiers encountered)" % len(repds_used)) 541 | print(" %g sq km total surface area" % (1e-6 * np.sum([_['calc_area'] for _ in handler.objs if _['tag_power']=='plant']))) 542 | print(" %g MW total generating capacity" % (1e-3 * np.sum([_.get('calc_capacity',0) for _ in handler.objs if _['tag_power']=='plant']))) 543 | print(" %i nodes with no sqm tagged (needs more tagging)" % len([_ for _ in handler.objs if _['tag_power']=='plant' and _['calc_area']==0])) 544 | print(" %i areas <= 30 sqm - not including nodes" % len([_ for _ in handler.objs if _['tag_power']=='plant' and 0<_['calc_area']<=30])) 545 | print(" %i areas 30--2000 sqm" % len([_ for _ in handler.objs if _['tag_power']=='plant' and 30<_['calc_area']<=2000])) 546 | print(" %i areas > 2000 sqm" % len([_ for _ in handler.objs if _['tag_power']=='plant' and _['calc_area']>2000])) 547 | 548 | 549 | def csvformatspecialfields(k, v): 550 | "special formatting sometimes needed" 551 | if k=='plantref': 552 | if not v: return '' 553 | return "%s/%s" % v 554 | return v 555 | 556 | try: 557 | with open("osm.csv", 'w', buffering=1) as outfp: 558 | outfp.write(",".join(allattribs) + "\n") 559 | for obj in handler.objs: 560 | outfp.write(",".join(map(str, [csvformatspecialfields(anattrib, obj.get(anattrib, '')) for anattrib in allattribs])) + "\n") 561 | except: 562 | os.rename("osm.csv", "osm_ERROR.csv") 563 | raise 564 | 565 | print("==========================================================") 566 | print("Finished creating initial OSM PV solar extract spreadsheet") 567 | print("==========================================================") 568 | 569 | -------------------------------------------------------------------------------- /data/exported/analyse_exported.py: -------------------------------------------------------------------------------- 1 | 2 | # script to take the main exported pv data, and produce some descriptive statistics and plots 3 | # by Dan Stowell 2020 4 | 5 | # note that as well as our data, you will need geojson/shapefiles defining the regions that we plot/aggregate over: 6 | # LSOAs (specified by the UK statistical authority), GSP regions (specified by the UK National Grid ESO) 7 | 8 | import csv, os 9 | import numpy as np 10 | import pandas as pd 11 | import geopandas as gpd 12 | from shapely.geometry import Point 13 | 14 | import rtree # not used directly, but if you don't have it, geopandas will fail to perform sjoin 15 | import pdfpages # not used directly, but I needed it for mpl to work 16 | import descartes # not used directly, but gpd uses it for choropleths 17 | 18 | import seaborn as sns 19 | import matplotlib.pyplot as plt 20 | from matplotlib.backends.backend_pdf import PdfPages 21 | import matplotlib.pyplot as plt 22 | sns.set(style="whitegrid") 23 | 24 | from sklearn import linear_model 25 | 26 | ############################################################################## 27 | # config 28 | 29 | # input paths: 30 | pvexportfpath = os.path.expanduser("ukpvgeo_points.csv") 31 | geometryfpath = 
'../raw/osm-gb-solaronly.geojson' 32 | gspregionsfpath = os.path.expanduser("../other/gsp_regions_20181031.geojson") # GSP regions from NG ESO 33 | lsoaregionsfpath = os.path.expanduser("../other/Lower_Layer_Super_Output_Areas_December_2011_Full_Clipped__Boundaries_in_England_and_Wales.shp") 34 | 35 | # out paths: 36 | gsp_est_outfpath = os.path.expanduser("ukpvgeo_subtotals_gsp_capacity.csv") 37 | lsoa_est_outfpath = os.path.expanduser("ukpvgeo_subtotals_lsoa_capacity.csv") 38 | 39 | # if you have access to the Sheffield Solar data for validation, activate this and set the paths appropriately: 40 | got_sheff = True 41 | auxdocs_gdrive_path = os.path.expanduser("~/Documents/turing/turing-climate-call/Turing_OCF_OSM_gdrive/") 42 | sheff_cap_by_gsp_path = "%s/PV_capacity_by_GSP_and_LLSOA/capacity_by_llsoa_and_gsp_20200617T165804/20200617T165804_capacity_by_GSP_region.csv" % auxdocs_gdrive_path 43 | sheff_cap_by_lsoa_path = "%s/PV_capacity_by_GSP_and_LLSOA/capacity_by_llsoa_and_gsp_20200617T165804/20200617T165804_capacity_by_llsoa.csv" % auxdocs_gdrive_path 44 | 45 | # other: 46 | inttype = pd.Int64Dtype() 47 | 48 | 49 | ############################################################################## 50 | # Load data 51 | 52 | gspdf = gpd.read_file(gspregionsfpath) 53 | gspdf = gspdf.to_crs("EPSG:3857").sort_values("RegionID") 54 | 55 | lsoas = gpd.read_file(lsoaregionsfpath).to_crs("EPSG:3857") 56 | lsoas = lsoas.drop(['objectid', 'st_areasha', 'st_lengths', 'lsoa11nmw'], axis=1) 57 | 58 | df = pd.read_csv(pvexportfpath, dtype={'repd_id':inttype, 'osm_id':inttype, 'repd_cluster_id':inttype, 'osm_cluster_id':inttype, 'num_modules':inttype, 'orientation':inttype}) 59 | # convert the main CSV to a GeoDataFrame of points, so we can perform geo queries 60 | df = gpd.GeoDataFrame(df, crs="epsg:4326", geometry=[Point(xy) for xy in zip(df.longitude, df.latitude)]) 61 | df = df.to_crs("EPSG:3857") 62 | df = df.drop(['longitude', 'latitude'], axis=1) 63 | 64 | gdf = gpd.read_file(geometryfpath) # we're going to use this merely to check for containment. If an OSM item is contained entirely within another polygon, we shouldn't double-count its area. 65 | 66 | ############################################################################## 67 | # Simple subtotals 68 | 69 | print("ukpvgeo_points.csv file contains %i data rows." % len(df)) 70 | print("Simple subtotals, number of installations/clusters:") 71 | 72 | asubset = df[df['osm_id']>0] 73 | numinst = len(asubset[['osm_objtype', 'osm_id']].drop_duplicates()) 74 | numclus = len(asubset[['osm_cluster_id']].drop_duplicates()) 75 | print(f"From OSM: {numinst} / {numclus}") 76 | 77 | asubset = df[df['repd_id']>0] 78 | numinst = len(asubset[['repd_id']].drop_duplicates()) 79 | numclus = len(asubset[['repd_cluster_id']].drop_duplicates()) 80 | print(f"From REPD: {numinst} / {numclus}") 81 | 82 | asubset = df 83 | numinst = len(asubset[['osm_objtype', 'osm_id', 'repd_id']].drop_duplicates()) 84 | numclus = len(asubset[['osm_cluster_id', 'repd_id']].drop_duplicates()) 85 | print(f"Harmonised: {numinst} / {numclus}") 86 | print() 87 | 88 | ############################################################################## 89 | # Preprocessing 90 | 91 | df['centroid'] = df.centroid 92 | 93 | # Make a coalesced "capacity" column -- LATER add other sources e.g. 
estimates 94 | df['capacity_merged_MWp'] = df['capacity_osm_MWp'].combine_first(df['capacity_repd_MWp']) 95 | 96 | 97 | # categorise units into small/medium/large 98 | def categorise_entry(row): 99 | if row['capacity_merged_MWp'] > 0.1: 100 | return "large" 101 | elif row['capacity_merged_MWp'] > 0.01: 102 | return "medium" 103 | elif row['capacity_merged_MWp'] > 0: 104 | return "small" 105 | elif row['area_sqm'] > 2000: 106 | return "large" 107 | elif row['area_sqm'] > 30: 108 | return "medium" 109 | else: 110 | return "small" 111 | 112 | df['category'] = df.apply(categorise_entry, axis=1).astype(pd.CategoricalDtype(categories=["small", "medium", "large"], ordered=True)) 113 | 114 | 115 | # find fully-contained OSM items, and flag them in a special column so that we don't use them in capacity estimates 116 | gdf = gdf[~gdf.osm_id.isna()] 117 | gdf_within = gpd.sjoin(gdf, gdf, how='inner', op="within") 118 | containified = gdf_within[(gdf_within['osm_id_left']!=gdf_within['osm_id_right'])]['osm_id_left'] # the left-hand item is the contained one. 119 | containified = pd.to_numeric(containified.values) 120 | df['area_is_contained'] = df['osm_id'].isin(containified) 121 | print("Found %i OSM items that are entirely-contained within others --- and hence we won't use them for inferring capacity from area" % df['area_is_contained'].sum()) 122 | del gdf_within, containified 123 | 124 | def really_count_nonzero(ser): 125 | return ser.fillna(0, inplace=False).astype(bool).sum() 126 | 127 | ############################################################################## 128 | pdf = PdfPages("plot_analyse_exported.pdf") 129 | 130 | ############################################################################## 131 | # Simple stats about metadata presence/absence: 132 | def format_num_entries(colname, df): 133 | "Convenience function for counting positive entries in data column" 134 | return "%i (==%.1f %% of rows)" % (df[colname].count(), 135 | 100 * df[colname].count()/len(df)) 136 | 137 | print("") 138 | print("METADATA STATS:") 139 | for colname in ['capacity_merged_MWp', 'orientation', 'located', 'num_modules']: 140 | print("%s, num entries: %s" % (colname, format_num_entries(colname, df))) 141 | print("area_sqm, num entries: %i (==%.1f %% of rows)" % ((df['area_sqm'] > 0).sum(), 142 | 100 * (df['area_sqm']>0).sum()/len(df))) 143 | 144 | # capacity - num present, median, simple histogram of these 145 | print("Capacity, sum: %g MWp" % df['capacity_merged_MWp'].sum()) 146 | print("Capacity, median: %g MWp" % df['capacity_merged_MWp'].median()) 147 | fig, ax = plt.subplots(figsize=(10, 6)) 148 | notches = np.geomspace(1, 1e6, 49) * 0.001 149 | sns.distplot(df['capacity_merged_MWp'], norm_hist=False, kde=False, bins=(np.hstack(([0], notches)))) 150 | ax.set_xscale("log") 151 | ax.set_yscale("log") 152 | ax.set_xticks([1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]) 153 | ax.set_xticklabels(['0.1 kWp', '1 kWp', '10 kWp', '100 kWp', '1 MWp', '10 MWp', '100 MWp']) 154 | ax.set_ylabel("Number of installations") 155 | plt.title("Distribution of PV installation capacities (%i tagged)" % (1-df['capacity_merged_MWp'].isna()).sum()) 156 | pdf.savefig(fig) 157 | plt.close() 158 | 159 | # orientation (7800) - also plot a circular histogram of these 160 | fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(projection='polar')) 161 | anglebins = np.arange(0.03125 * np.pi, 2 * np.pi + 1e-3, 0.0625 * np.pi) 162 | sns.distplot(df['orientation'].astype('float') * np.pi / 180, bins=anglebins, norm_hist=False, kde=False) 163 
| ax.set_theta_zero_location('N') 164 | plt.title("Distribution of PV installation orientations (%i tagged)" % (1-df['orientation'].isna()).sum()) 165 | pdf.savefig(fig) 166 | plt.close() 167 | 168 | # located (136,000) - give frequency of roof etc 169 | print("Values of 'located':") 170 | print(df.located.value_counts()) 171 | 172 | # num_modules (6000) 173 | fig, ax = plt.subplots(figsize=(10, 6)) 174 | notches = np.array([0, 1, 3, 10, 30, 100, 300, 1000, 3000, 10000, 30000, 100000]) 175 | sns.distplot(df['num_modules'].astype('float'), norm_hist=False, kde=False, bins=notches) 176 | ax.set_xscale("log") 177 | ax.set_yscale("log") 178 | ax.set_ylabel("Number of installations") 179 | plt.title("Distribution of PV installation num_modules (%i tagged)" % (1-df['num_modules'].isna()).sum()) 180 | pdf.savefig(fig) 181 | plt.close() 182 | 183 | # areas (11000) 184 | fig, ax = plt.subplots(figsize=(10, 6)) 185 | notches = np.array([0, 1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4, 1e5, 3e5, 1e6]) 186 | sns.distplot(df[df.area_sqm>0]['area_sqm'], norm_hist=False, kde=False, bins=notches) 187 | ax.set_xscale("log") 188 | ax.set_yscale("log") 189 | ax.set_ylabel("Number of installations") 190 | ax.set_xlabel("Surface area of polygon (m^2)") 191 | plt.title("Distribution of PV installation polygon areas (%i tagged)" % (df.area_sqm>0).sum()) 192 | pdf.savefig(fig) 193 | plt.close() 194 | 195 | 196 | ############################################################################## 197 | # Capacity regress against area. Also capacity regress against num_modules, and even the joint version. 198 | print("") 199 | print("CAPACITY LINEAR REGRESSIONS:") 200 | 201 | #for whichcat in df['category'].values.categories: 202 | if True: 203 | 204 | fig, ax = plt.subplots(figsize=(10, 6)) 205 | 206 | whichcat = 'all' 207 | subset = df[(df.area_sqm>0) & (df.capacity_merged_MWp>0)] 208 | 209 | if False: # enable this to grab a specific subset 210 | whichcat = 'small' 211 | subset = subset[subset.category==whichcat] 212 | 213 | print(" Num items of type '%s' with area+capacity to regress: %i" % (whichcat, len(subset))) 214 | 215 | xvals = subset['area_sqm'] 216 | yvals = subset['capacity_merged_MWp'] 217 | regr = linear_model.LinearRegression(fit_intercept=False) # force line to pass through zero 218 | data_toregress = np.array(xvals).reshape(-1, 1) 219 | regr.fit(data_toregress, yvals) 220 | linregpredict = regr.predict(data_toregress) 221 | rsq = regr.score(data_toregress, yvals) 222 | 223 | print(" Slope: %.2f W / sq m R^2: %.3f" % (regr.coef_[0] * 1000000, rsq)) 224 | 225 | plt.plot(sorted(xvals * 1e-6), sorted(linregpredict), 'b-', alpha=0.4) 226 | plt.scatter(xvals * 1e-6, yvals, marker='+', alpha=0.4) 227 | plt.annotate("Slope: %.2f W / sq m\nR^2: %.3f" % (regr.coef_[0] * 1000000, rsq), xy=(0.8, 0.1), xycoords='axes fraction', color=(0.5, 0.5, 0.5)) 228 | 229 | #plt.xlim(1, 1000) 230 | plt.ylabel('Capacity (MWp)') 231 | plt.xlabel('Calculated area of PV object (sq km)') 232 | plt.title("Area vs capacity in OSM&REPD (UK)") 233 | 234 | pdf.savefig(fig) 235 | plt.close() 236 | 237 | area_regressor = regr.coef_[0] 238 | 239 | #for whichcat in df['category'].values.categories: 240 | if True: 241 | 242 | fig, ax = plt.subplots(figsize=(10, 6)) 243 | 244 | whichcat = 'all' 245 | subset = df[(df.num_modules>0) & (df.capacity_merged_MWp>0)] 246 | 247 | if True: # enable this to grab a specific subset 248 | whichcat = 'small' 249 | subset = subset[subset.category==whichcat] 250 | 251 | print(" Num items of type '%s' with 
nummod+capacity to regress: %i" % (whichcat, len(subset))) 252 | 253 | xvals = subset['num_modules'] 254 | yvals = subset['capacity_merged_MWp'] 255 | regr = linear_model.LinearRegression(fit_intercept=False) # force line to pass through zero 256 | data_toregress = np.array(xvals).reshape(-1, 1) 257 | regr.fit(data_toregress, yvals) 258 | linregpredict = regr.predict(data_toregress) 259 | rsq = regr.score(data_toregress, yvals) 260 | 261 | print(" Slope: %.2f W / unit R^2: %.3f" % (regr.coef_[0] * 1000000, rsq)) 262 | 263 | if False: 264 | plt.plot(sorted(xvals), sorted(linregpredict), 'b-', alpha=0.4) 265 | plt.scatter(xvals, yvals, marker='+', alpha=0.4) 266 | plt.annotate("Slope: %.2f W / unit\nR^2: %.3f" % (regr.coef_[0] * 1000000, rsq), xy=(0.8, 0.1), xycoords='axes fraction', color=(0.5, 0.5, 0.5)) 267 | 268 | #plt.xlim(1, 1000) 269 | plt.ylabel('Capacity (MWp)') 270 | plt.xlabel('num_modules of PV object') 271 | plt.title("num_modules vs capacity in OSM&REPD (UK) (type: %s)" % whichcat) 272 | 273 | pdf.savefig(fig) 274 | plt.close() 275 | 276 | 277 | 278 | ############################################################################## 279 | # Our estimate of UK's MW capacity - for each of the 3 types, and the total 280 | 281 | # calc this progressively with estimates too: pure-OSM, pure-REPD, +OSM, +regress_area, +guesstimate_points_as_3. 282 | 283 | print("") 284 | print("TOTAL MERGED CAPACITY ESTIMATES:") 285 | # merged2: +regress_area 286 | df['capacity_merged2_MWp'] = df['capacity_merged_MWp'].combine_first(df['area_sqm'] * area_regressor * ~df['area_is_contained']) 287 | # merged3: missing values as 3 kW 288 | df['capacity_merged3_MWp'] = df['capacity_merged2_MWp'] 289 | df.loc[(df.capacity_merged2_MWp==0) & (~df['area_is_contained']), 'capacity_merged3_MWp']=0.003 290 | 291 | # explicitly tag the source of each capacity 292 | def calc_sourceof_capacity(row): 293 | if (row['capacity_osm_MWp']>0) and (row['capacity_osm_MWp'] != row['capacity_repd_MWp']): 294 | return 'osm' 295 | elif row['capacity_repd_MWp']>0: 296 | return 'repd' 297 | elif (row['capacity_merged2_MWp']>0 and not row['capacity_merged_MWp']>0): 298 | return "area_regress" 299 | elif (row['capacity_merged3_MWp']>0 and not row['capacity_merged2_MWp']>0): 300 | return "point" 301 | else: 302 | return "HUH" 303 | 304 | df['sourceof_capacity'] = df.apply(calc_sourceof_capacity, axis=1).astype( 305 | pd.CategoricalDtype(categories=["repd", "osm"#, "area_regress"#, "point" #, "HUH" 306 | ], ordered=True)) 307 | 308 | cols = ['capacity_osm_MWp', #'capacity_repd_MWp', 309 | 'capacity_merged_MWp', 'capacity_merged2_MWp', 'capacity_merged3_MWp'] 310 | cols_lbls = ['OSM', 311 | 'OSM&REPD', '...+areas_infer', '...+points_est'] 312 | cols_lbls_long = ['capacity_osm_MWp', 313 | 'capacity_osmrepd_MWp', 'capacity_osmrepdareas_MWp', 'capacity_osmrepdareaspoints_MWp'] 314 | 315 | cols_notplotted = ['capacity_repd_MWp'] 316 | cols_all = cols + cols_notplotted 317 | cols_lbls_all = cols_lbls + ['REPD'] 318 | cols_lbls_long_all = cols_lbls_long + ['capacity_repd_MWp'] 319 | 320 | piv_mw_cat = pd.pivot_table(df, values=cols_all, index='category', aggfunc='sum') 321 | print(piv_mw_cat.T) 322 | print("Totals:") 323 | print(piv_mw_cat.sum()) # totals 324 | 325 | # output as a stacked plot, with y-axis as MWp, x-axis as these steps, the 3 categories. 
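# (Clarifying the steps plotted here, as defined above: 'OSM' = capacity tagged in OSM only;
#  'OSM&REPD' = OSM capacity with REPD values used as a fallback; '...+areas_infer' = remaining gaps
#  filled via the area-vs-capacity regression slope fitted earlier, ignoring polygons flagged as
#  contained within others; '...+points_est' = items still at zero capacity, and not flagged as
#  contained, assumed to be 3 kWp (0.003 MWp) each.)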
326 | fig, ax = plt.subplots(figsize=(10, 6)) 327 | ax.stackplot(range(len(cols)), np.array([piv_mw_cat[col].values for col in cols]).T, 328 | labels=piv_mw_cat.index.categories.values, linewidth=0) 329 | ax.set_xticks(range(len(cols))) 330 | ax.set_xticklabels(cols_lbls) 331 | #ax.set_yscale('log') 332 | ax.set_ylabel("Total MWp") 333 | plt.legend(loc="lower left") 334 | 335 | plt.title("Total capacity, at various steps of merging/inference") 336 | pdf.savefig(fig) 337 | plt.close() 338 | 339 | 340 | # now the subtotals of num items, not of MW: 341 | piv_count_cat = pd.pivot_table(df, values=cols_all, index='category', aggfunc=really_count_nonzero).astype(int) 342 | print("Number of items with nonzero capacity:") 343 | print(piv_count_cat.T) 344 | print("Totals:") 345 | print(piv_count_cat.sum()) # totals 346 | 347 | fig, ax = plt.subplots(figsize=(10, 6)) 348 | ax.stackplot(range(len(cols)), np.array([piv_count_cat[col].values for col in cols]).T, 349 | labels=piv_count_cat.index.categories.values, linewidth=0) 350 | ax.set_xticks(range(len(cols))) 351 | ax.set_xticklabels(cols_lbls) 352 | #ax.set_yscale('log') 353 | ax.set_ylabel("Total number") 354 | plt.legend(loc="lower left") 355 | 356 | plt.title("Items with tagged capacity, at various steps of merging/inference") 357 | pdf.savefig(fig) 358 | plt.close() 359 | 360 | 361 | 362 | if True: 363 | # Let's do a log-log plot of the long-tail distribution: 364 | # rank position on the x-axis, value on the y-axis, sourceof as the category 365 | loglogvariable = 'capacity_merged2_MWp' 366 | loglogcols = ["osm", "repd"] #"area_regress"#, "point" 367 | loglogpalette = ['b', 'r'] #'y'#, 'k' 368 | dflt = df[[loglogvariable, 'sourceof_capacity']].sort_values(inplace=False, axis=0, by=loglogvariable, ascending=False) 369 | dflt = dflt[dflt['sourceof_capacity'].isin(loglogcols)] 370 | dflt['rank'] = dflt[loglogvariable].rank(ascending=False) 371 | # Now, to reduce plot kb bulk, we aggregate the data by counting 372 | gcount = dflt.groupby([loglogvariable, 'sourceof_capacity', 'rank'], 373 | observed=True).agg(count=('rank', 'count')) 374 | gcount.reset_index(inplace=True) 375 | g = sns.relplot(x="rank", y=loglogvariable, hue="sourceof_capacity", data=gcount, 376 | height=6, aspect=10/6, #size='count', 377 | alpha=0.5, marker='o', linewidths=0, edgecolor='none', 378 | hue_order=loglogcols, palette=loglogpalette) 379 | ax = g.facet_axis(0,0) 380 | ax.set_xscale('log') 381 | ax.set_yscale('log') 382 | ax.set_yticks([1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]) 383 | ax.set_yticklabels(['0.1 kWp', '1 kWp', '10 kWp', '100 kWp', '1 MWp', '10 MWp', '100 MWp']) 384 | ax.set_ylabel("Installation capacity") 385 | ax.set_xlabel("Rank position of capacity") 386 | plt.legend(loc="lower left") 387 | plt.title("log-log plot (for checking power-law behaviour)") 388 | 389 | pdf.savefig(g.fig) 390 | plt.close() 391 | 392 | ############################################################################## 393 | ############################################################################## 394 | ############################################################################## 395 | # Subtotals per-region (GSP, LSOA) --- choropleths and summary CSVs 396 | 397 | # dfc is just the centroids, in the same order as the main data 398 | dfc = df[['centroid']].set_geometry('centroid', inplace=False, crs=df.crs) 399 | 400 | df[['RegionID', 'RegionName']] = gpd.sjoin(dfc, gspdf, how='left', op='intersects' 401 | ).drop(['index_right'], axis=1)[['RegionID', 'RegionName']] 402 | df.RegionID = 
df.RegionID.astype(inttype) 403 | 404 | def statstr(df, col): 405 | themedian = df[col].median() 406 | themax = df[col].max() 407 | thesum = df[col].sum() 408 | theempty = (df[col]==0).sum() 409 | return "median %.1f\nmax %.1f\nsum %.0f\nempty %i" % (themedian, themax, thesum, theempty) 410 | 411 | def plot_choropleth(dataframe_per_region, col, vmax, plottitle, cmap='hot', legend=True, vmin=0, show_statstr=True): 412 | fig, ax = plt.subplots(figsize=(8, 10)) 413 | plot_choropleth_onax(ax, dataframe_per_region, col, vmax, plottitle, cmap, legend, vmin, show_statstr) 414 | pdf.savefig(fig) 415 | plt.close() 416 | 417 | def plot_choropleth_onax(ax, dataframe_per_region, col, vmax, plottitle, cmap='hot', legend=True, vmin=0, show_statstr=True): 418 | dataframe_per_region.plot(column=col, ax=ax, linewidth=0.01, edgecolor=(0.7, 0.7, 0.7), vmin=vmin, vmax=vmax, cmap=cmap, legend=legend) 419 | if plottitle: 420 | ax.set_title(plottitle) 421 | if show_statstr: 422 | plt.annotate(statstr(dataframe_per_region, col), xy=(0.7, 0.9), xycoords='axes fraction', color=(0.5, 0.5, 0.5), fontsize=8) 423 | ax.set_xlim(left=-0.75e6) 424 | ax.set_ylim(top=8.0e6) 425 | ax.set_xticks([]) 426 | ax.set_yticks([]) 427 | 428 | if False: 429 | # just the regions 430 | fig, ax = plt.subplots(figsize=(8, 10)) 431 | gspdf.plot(ax=ax, linewidth=0.01) 432 | plt.title("GSP regions") 433 | plt.xlim(xmin=-0.75e6) 434 | plt.ylim(ymax=8.0e6) 435 | plt.xticks([]) 436 | plt.yticks([]) 437 | pdf.savefig(fig) 438 | plt.close() 439 | 440 | # These "pivot tables" (and more below) are our basic subtotals per-region summaries 441 | piv_count_gsp = pd.pivot_table(df, values='geometry', index='RegionID', aggfunc=really_count_nonzero).rename(columns={'geometry':'num'}) 442 | piv_mw_gsp = pd.pivot_table(df, values=cols_all, index='RegionID', aggfunc='sum') 443 | 444 | # NOTE about "pergsp": this has one row per GSP, and as we go through we will merge new columns on to it when we want to plot them. 445 | # This just means: avoid clashing column names. 
446 | # same for "perlsoa" 447 | 448 | # num items per region 449 | fig, ax = plt.subplots(figsize=(8, 10)) 450 | pergsp = gspdf.merge(piv_count_gsp, how='left', on='RegionID').sort_values("RegionID") 451 | pergsp['num'].fillna(0, inplace=True) 452 | plot_choropleth(pergsp, "num", None, "Number of items (clustered) in each GSP region", cmap='copper') 453 | 454 | 455 | pergsp = pergsp.merge(piv_mw_gsp, how='left', on='RegionID').sort_values("RegionID") 456 | for col in cols_all: 457 | pergsp[col].fillna(0, inplace=True) 458 | 459 | vmax = max([pergsp[col].max() for col in cols]) 460 | for col, col_lbl in zip(cols, cols_lbls): 461 | plot_choropleth(pergsp, col, vmax, "Capacity in each GSP region (MWp): %s" % col_lbl) 462 | 463 | # csv 464 | with open(gsp_est_outfpath, 'w') as outfp: 465 | #header 466 | outfp.write("RegionID,RegionName," + ','.join(map(str, cols_lbls_long_all)) + "\n") 467 | for index, row in pergsp.iterrows(): 468 | outfp.write(("%i,%s," % (row['RegionID'], row['RegionName'])) + ','.join([("%.3f" % row[col]) for col in cols_all]) + "\n") 469 | 470 | ############################################################################## 471 | # subtotals (heatmap) again, but for LSOA 472 | 473 | # Add LSOA to our main data 474 | df[['lsoa11cd', 'lsoa11nm']] = gpd.sjoin(dfc, lsoas, how='left', op='intersects' 475 | ).drop(['index_right'], axis=1)[['lsoa11cd', 'lsoa11nm']] 476 | 477 | if False: 478 | # just the regions 479 | fig, ax = plt.subplots(figsize=(8, 10)) 480 | lsoas.plot(ax=ax, linewidth=0.01) 481 | plt.title("LSOA regions") 482 | plt.xlim(xmin=-0.75e6) 483 | plt.ylim(ymax=8.0e6) 484 | plt.xticks([]) 485 | plt.yticks([]) 486 | pdf.savefig(fig) 487 | plt.close() 488 | 489 | perlsoa = lsoas # will be merged 490 | 491 | if False: 492 | # num items per region 493 | piv_count_lsoa = pd.pivot_table(df, values='geometry', index='lsoa11cd', aggfunc=really_count_nonzero).rename(columns={'geometry':'num'}) 494 | perlsoa = perlsoa.merge(piv_count_lsoa, how='left', on='lsoa11cd').sort_values("lsoa11cd") 495 | perlsoa['num'].fillna(0, inplace=True) 496 | plot_choropleth(perlsoa, "num", None, "Number of items (clustered) in each LSOA region", cmap='copper') 497 | 498 | piv_mw_lsoa = pd.pivot_table(df, values=cols_all, index='lsoa11cd', aggfunc='sum') 499 | 500 | perlsoa = perlsoa.merge(piv_mw_lsoa, how='left', on='lsoa11cd').sort_values("lsoa11cd") 501 | for col in cols: 502 | perlsoa[col].fillna(0, inplace=True) 503 | 504 | for col, col_lbl in zip(cols, cols_lbls): 505 | print("Capacity in each LSOA region (MWp): %s" % col_lbl) 506 | print(statstr(perlsoa, col)) 507 | 508 | if False: 509 | vmax = max([piv_mw_lsoa[col].max() for col in cols]) 510 | for col, col_lbl in zip(cols, cols_lbls): 511 | plot_choropleth(perlsoa, col, vmax, "Capacity in each LSOA region (MWp): %s" % col_lbl) 512 | 513 | # csv 514 | with open(lsoa_est_outfpath, 'w') as outfp: 515 | #header 516 | outfp.write("lsoa11cd,lsoa11nm," + ','.join(map(str, cols_lbls_long_all)) + "\n") 517 | for index, row in perlsoa.iterrows(): 518 | outfp.write(("%s,%s," % (row['lsoa11cd'], row['lsoa11nm'])) + ','.join([("%.3f" % row[col]) for col in cols_all]) + "\n") 519 | 520 | ############################################################################## 521 | # Next: plot the estimates from Sheffield/SolarMedia data, and correlate them against ours 522 | 523 | if got_sheff: 524 | 525 | sheff_cap_by_gsp = pd.read_csv(sheff_cap_by_gsp_path) # NB! 
Use RegionID 526 | 527 | pergsp = pergsp.merge(sheff_cap_by_gsp, how='left', on='RegionID').sort_values("RegionID") 528 | 529 | col = 'dc_capacity' 530 | pergsp[col].fillna(0, inplace=True) 531 | 532 | vmax = sheff_cap_by_gsp[col].max() 533 | 534 | fig, ax = plt.subplots(figsize=(8, 10)) 535 | 536 | pergsp.plot(column=col, ax=ax, linewidth=0.01, edgecolor=(0.7, 0.7, 0.7), vmax=vmax, cmap='hot', legend=True) 537 | plt.title("Capacity in each GSP region (MWp): from Sheffield/SolarMedia") 538 | plt.annotate(statstr(pergsp, col), xy=(0.7, 0.9), xycoords='axes fraction', color=(0.5, 0.5, 0.5), fontsize=8) 539 | plt.xlim(xmin=-0.75e6) 540 | plt.ylim(ymax=8.0e6) 541 | plt.xticks([]) 542 | plt.yticks([]) 543 | 544 | pdf.savefig(fig) 545 | plt.close() 546 | 547 | # create table with ours and theirs; measure correlation; scatterplot. 548 | gspcorreltab = sheff_cap_by_gsp.merge(piv_mw_gsp, how='outer', on='RegionID').fillna(0, inplace=False).sort_values("RegionID") 549 | gsprsq = gspcorreltab[['dc_capacity', 'capacity_merged3_MWp']].corr().iloc[0,1] ** 2 550 | print("Correlation between our and Sheffield's MW subtotals per-GSP: Rsq = %.3f" % gsprsq) 551 | 552 | fig, ax = plt.subplots(figsize=(8, 8)) 553 | ax = sns.regplot(x="dc_capacity", y="capacity_merged3_MWp", data=gspcorreltab, ax=ax, 554 | scatter_kws={"s": 50, "alpha": 0.5} 555 | ) 556 | ax.set_xlabel("DC capacity from Sheffield/SolarMedia (MW)") 557 | ax.set_ylabel("Capacity from OSM+REPD+inferred (MWp)") 558 | ax.set_title("Comparison of capacity subtotals per GSP region") 559 | ax.set_aspect(1) 560 | ax.set_xlim(-10, 500) 561 | ax.set_ylim(-10, 500) 562 | 563 | plt.annotate("R^2: %.3f" % gsprsq, xy=(0.05, 0.95), xycoords='axes fraction', color=(0.5, 0.5, 0.5), fontsize=12) 564 | 565 | pdf.savefig(fig) 566 | plt.close() 567 | 568 | ### same for LSOA (NB no choro, too dense) 569 | 570 | sheff_cap_by_lsoa = pd.read_csv(sheff_cap_by_lsoa_path) # NB! 
Use lsoa11cd 571 | sheff_cap_by_lsoa = sheff_cap_by_lsoa.rename(columns={'LLSOACD': 'lsoa11cd'}) 572 | 573 | lsoacorreltab = sheff_cap_by_lsoa.merge(piv_mw_lsoa, how='outer', on='lsoa11cd').fillna(0, inplace=False).sort_values("lsoa11cd") 574 | lsoarsq = lsoacorreltab[['dc_capacity', 'capacity_merged3_MWp']].corr().iloc[0,1] ** 2 575 | print("Correlation between our and Sheffield's MW subtotals per-LSOA: Rsq = %.3f" % lsoarsq) 576 | 577 | fig, ax = plt.subplots(figsize=(8, 8)) 578 | ax = sns.regplot(x="dc_capacity", y="capacity_merged3_MWp", data=lsoacorreltab, ax=ax, 579 | scatter_kws={"s": 50, "alpha": 0.5} 580 | ) 581 | ax.set_xlabel("DC capacity from Sheffield/SolarMedia (MW)") 582 | ax.set_ylabel("Capacity from OSM+REPD+inferred (MWp)") 583 | ax.set_title("Comparison of capacity subtotals per LSOA region") 584 | ax.set_aspect(1) 585 | ax.set_xlim(-2, 80) 586 | ax.set_ylim(-2, 80) 587 | 588 | plt.annotate("R^2: %.3f" % lsoarsq, xy=(0.05, 0.95), xycoords='axes fraction', color=(0.5, 0.5, 0.5), fontsize=12) 589 | 590 | pdf.savefig(fig) 591 | plt.close() 592 | 593 | ################################################################################# 594 | # choropleth but showing only the deltas between pure-REPD, our merged outcome, and SolarMedia outcome: 595 | if True: 596 | 597 | diffchoros = [ 598 | ('capacity_delta_ours_repd_MWp', 'DELTA ours from REPD', 'capacity_merged3_MWp', 'capacity_repd_MWp'), 599 | ('capacity_delta_SM_repd_MWp', 'DELTA SolarMedia from REPD', 'dc_capacity', 'capacity_repd_MWp'), 600 | ] 601 | 602 | for col, col_lbl, cola, colb in diffchoros: 603 | pergsp[col] = pergsp[cola] - pergsp[colb] 604 | 605 | vmax = max([pergsp[col].max() for col, _, _, _ in diffchoros]) 606 | 607 | for col, col_lbl, cola, colb in diffchoros: 608 | plot_choropleth(pergsp, col, vmax, "Capacity in each GSP region (MWp): %s" % col_lbl, vmin=-vmax, cmap="RdBu", show_statstr=False) 609 | 610 | ############################################################################## 611 | # Next: capacity choropleths, for a selection of high-contributing users 612 | 613 | # NOTE: according to OSM's GDPR policy we must not publish user ids. 614 | # That's why we use a list of users which is not stored in github, 615 | # and load their associations from the input file rather than our output. 616 | # Then we anonymise them to simple transitory integer values. 
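# Illustration only (the usernames below are invented placeholders, not real mappers):
# users_to_plot.csv is expected to contain one OSM username per row in the first CSV column,
# with lines starting with '#' treated as comments, e.g.
#   # keep this file local -- do not commit it to git
#   example_mapper_one
#   example_mapper_two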
617 | 618 | if True: 619 | with open('users_to_plot.csv', 'rt') as fp: 620 | rdr = csv.reader(row for row in fp if not row.startswith('#')) 621 | users_to_plot = [line[0].strip() for line in rdr if len(line)] 622 | 623 | rawosmfpath = '../raw/osm.csv' 624 | 625 | df_userids = pd.read_csv(rawosmfpath, usecols=['objtype', 'id', 'user'], dtype={'id':inttype}).rename(columns={'objtype':'osm_objtype', 'id':'osm_id'}) 626 | users_to_plot_lookup = {u:k for k,u in enumerate(users_to_plot)} 627 | df_userids['user'] = df_userids['user'].map(users_to_plot_lookup).astype(inttype) 628 | df = df.merge(df_userids, how='left', on=['osm_objtype', 'osm_id']) 629 | num_persons = len(users_to_plot) 630 | 631 | # TMP [(personindex, sum(df[df['user']==personindex]['capacity_merged3_MWp'])) for personindex in range(len(users_to_plot))] 632 | 633 | # num items per person 634 | fig, axes = plt.subplots(1, num_persons, figsize=(8 * num_persons, 10)) 635 | for personindex, ax in enumerate(axes): 636 | piv_count_gsp_forthisperson = pd.pivot_table(df[df['user']==personindex], values='geometry', index='RegionID', aggfunc=really_count_nonzero).rename(columns={'geometry':'num'}) 637 | pergsp_forthisperson = gspdf.merge(piv_count_gsp_forthisperson, how='left', on='RegionID').sort_values("RegionID") 638 | pergsp_forthisperson['num'].fillna(0, inplace=True) 639 | plot_choropleth_onax(ax, pergsp_forthisperson, "num", None, cmap='copper', plottitle=None, show_statstr=False) 640 | pdf.savefig(fig) 641 | plt.close() 642 | 643 | # capacity per person 644 | col = 'capacity_merged3_MWp' 645 | fig, axes = plt.subplots(1, num_persons, figsize=(8 * num_persons, 10)) 646 | for personindex, ax in enumerate(axes): 647 | #print(f" Plotting for user #{personindex}") 648 | piv_mw_gsp_forthisperson = pd.pivot_table(df[df['user']==personindex], values=col, index='RegionID', aggfunc='sum') 649 | pergsp_forthisperson = gspdf.merge(piv_mw_gsp_forthisperson, how='left', on='RegionID').sort_values("RegionID") 650 | pergsp_forthisperson[col].fillna(0, inplace=True) 651 | vmax = pergsp_forthisperson[col].max() 652 | #print(" peak value: %f" % vmax) 653 | plottitle = "Capacity in each GSP region (MWp) edited by user #%i" % (personindex+1) 654 | plot_choropleth_onax(ax, pergsp_forthisperson, col, np.ceil(vmax/10)*10., plottitle=None, show_statstr=False) 655 | 656 | pdf.savefig(fig) 657 | plt.close() 658 | 659 | 660 | 661 | 662 | ############################################################################## 663 | pdf.close() 664 | 665 | --------------------------------------------------------------------------------