├── data ├── processed │ ├── .gitignore │ ├── README.md │ ├── Makefile │ ├── pre-process-mv.py │ ├── pre-process-fit.py │ ├── pre-process-osm.py │ └── pre-process-repd.py ├── raw │ ├── README │ ├── machine_vision.geojson │ ├── Makefile │ ├── convert_fit_excel_to_csv.py │ └── compile_osm_solar.py ├── as_received │ └── README ├── exported │ ├── Makefile │ ├── export_geometries.py │ └── analyse_exported.py └── README.md ├── requirements.txt ├── doc ├── repd-definitions.pdf ├── preprocessing.md ├── matching.md └── database.md ├── .gitignore ├── db ├── neighbour-finding.sql ├── data-matching-rules │ ├── rule-7.sql │ ├── rule-6.sql │ ├── rule-1.sql │ ├── rule-3.sql │ ├── rule-5.sql │ ├── rule-3a.sql │ ├── rule-5a.sql │ ├── rule-4.sql │ ├── rule-4a.sql │ ├── rule-25.sql │ ├── rule-25a.sql │ ├── rule-1a.sql │ ├── rule-2.sql │ └── rule-2a.sql ├── README.md ├── map-osm-repd.sql ├── neighbour-finding │ ├── mv-repd.sql │ ├── mv-osm.sql │ └── osm-repd.sql ├── area-adaptive-threshold.sql ├── mv.sql ├── make-database.sql ├── data-matching.sql ├── fit.sql ├── osm.sql ├── dedup-repd.sql ├── repd.sql ├── dedup-osm.sql └── export.sql └── README.md /data/processed/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bng_latlon 2 | matplotlib 3 | numpy 4 | pandas 5 | scipy 6 | geopandas 7 | openpyxl 8 | sklearn 9 | -------------------------------------------------------------------------------- /doc/repd-definitions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openclimatefix/solar-power-mapping-data/HEAD/doc/repd-definitions.pdf -------------------------------------------------------------------------------- /data/raw/README: -------------------------------------------------------------------------------- 1 | This file is a placeholder, to ensure the folder appears in git. See readme in parent folder for info. 2 | 3 | -------------------------------------------------------------------------------- /data/as_received/README: -------------------------------------------------------------------------------- 1 | This file is a placeholder, to ensure the folder appears in git. See readme in parent folder for info. 2 | -------------------------------------------------------------------------------- /data/processed/README.md: -------------------------------------------------------------------------------- 1 | Processed datasets, create programmatically. 2 | 3 | (This file is a placeholder to make sure that git creates its parent directory.) 4 | 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | data/raw/fit.csv 3 | data/raw/osm.csv 4 | data/raw/repd.csv 5 | data/as_received/*.pbf 6 | data/as_received/*.xlsx 7 | data/as_received/*.geojson 8 | data/as_received/*.xml 9 | -------------------------------------------------------------------------------- /db/neighbour-finding.sql: -------------------------------------------------------------------------------- 1 | \echo -n Finding neighbouring objects across datasets ... 
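-- Each included script below builds a nearest-neighbour lookup table via a LATERAL join:
--   neighbour-finding/osm-repd.sql -> osm_repd_neighbours (plus osm_with_existing_repd_neighbours)
--   neighbour-finding/mv-osm.sql   -> osm_mv_neighbours
--   neighbour-finding/mv-repd.sql  -> mv_repd_neighbours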
2 | 3 | \include neighbour-finding/osm-repd.sql 4 | \include neighbour-finding/mv-osm.sql 5 | \include neighbour-finding/mv-repd.sql 6 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-7.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 7 (see doc/matching.md) 2 | \echo -n Performing match rule 7... 3 | 4 | insert into matches 5 | select '7', NULL, osm.master_osm_id, osm_mv_neighbours.mv_id 6 | from osm_mv_neighbours, osm 7 | where osm.osm_id = osm_mv_neighbours.osm_id 8 | and osm_mv_neighbours.distance_meters < 1000; 9 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-6.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 6 (see doc/matching.md) 2 | \echo -n Performing match rule 6... 3 | 4 | insert into matches 5 | select '6', repd.master_repd_id, NULL, mv_repd_neighbours.mv_id 6 | from mv_repd_neighbours, repd 7 | where repd.repd_id = mv_repd_neighbours.repd_id 8 | and mv_repd_neighbours.distance_meters < 1000; 9 | -------------------------------------------------------------------------------- /data/exported/Makefile: -------------------------------------------------------------------------------- 1 | # export data. You can export AND analyse by invoking "make all". 2 | 3 | ukpvgeo_geometries.geojson: ukpvgeo_points.csv ../raw/osm-gb-solaronly.geojson 4 | python3 export_geometries.py 5 | 6 | all: ukpvgeo_geometries.geojson plot_analyse_exported.pdf 7 | 8 | plot_analyse_exported.pdf: ukpvgeo_points.csv 9 | python3 analyse_exported.py 10 | 11 | clean: 12 | rm ukpvgeo_geometries.geojson plot_analyse_exported.pdf 13 | 14 | -------------------------------------------------------------------------------- /db/README.md: -------------------------------------------------------------------------------- 1 | # Notes on setting up the database 2 | 3 | For more details, see `doc/database.md`. 
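Once the database has been populated (see the steps below), a quick sanity check is to count the matches found by each rule and compare against the table in `doc/matching.md`, for example:

```sql
-- Number of cross-dataset matches found by each rule
select match_rule, count(*)
from matches
group by match_rule
order by match_rule;
```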
4 | 5 | On macOS, with Postgres installed via Homebrew, to run postgres as a background 6 | service and restart at login: 7 | 8 | ```bash 9 | brew services start postgresql 10 | ``` 11 | 12 | To start postgres now: 13 | 14 | ```bash 15 | pg_ctl -D /usr/local/var/postgres start 16 | ``` 17 | 18 | To create the database for first use: 19 | 20 | ```bash 21 | createdb hut23-425 "Solar PV database matching" 22 | ``` 23 | 24 | To populate the database: 25 | 26 | ```bash 27 | psql -f make-database.sql hut23-425 28 | ``` 29 | 30 | 31 | -------------------------------------------------------------------------------- /data/processed/Makefile: -------------------------------------------------------------------------------- 1 | # Create "processed" datasets from "raw" datasets 2 | 3 | all: repd.csv fit.csv osm.csv machine_vision.csv pre-process-repd.py pre-process-fit.py pre-process-osm.py pre-process-mv.py 4 | 5 | repd.csv: ../raw/repd.csv pre-process-repd.py 6 | ./pre-process-repd.py < $< > $@ 7 | 8 | fit.csv: ../raw/fit.csv pre-process-fit.py 9 | ./pre-process-fit.py < $< > $@ 10 | 11 | osm.csv: ../raw/osm.csv pre-process-osm.py 12 | ./pre-process-osm.py < $< > $@ 13 | 14 | machine_vision.csv: ../raw/machine_vision.geojson pre-process-mv.py 15 | ./pre-process-mv.py < $< > $@ 16 | 17 | clean: 18 | rm repd.csv fit.csv osm.csv machine_vision.csv 19 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-1.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 1 (see doc/matching.md) 2 | \echo -n Performing match rule 1... 3 | 4 | drop table if exists repd_copy; 5 | select * into repd_copy from repd_operational; 6 | insert into matches 7 | select '1', repd_copy.master_repd_id, osm.master_osm_id 8 | from osm_with_existing_repd_neighbours, osm, repd_operational, repd_copy 9 | -- Table linking: 10 | where osm_with_existing_repd_neighbours.osm_id = osm.osm_id 11 | and osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_operational.repd_id 12 | and osm_with_existing_repd_neighbours.repd_id_in_osm = repd_copy.repd_id 13 | -- Matching relevant: 14 | and repd_operational.location::geography <-> repd_copy.location::geography < 500; 15 | -------------------------------------------------------------------------------- /db/map-osm-repd.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Preliminary matching: OSM to REPD 3 | ** 4 | ** In raw.osm, the field repd_id_str contains a semicolon-separated list of REPD 5 | ** ids, possibly empty, whereas we would like this mapping in first normal form.
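** For example, a raw.osm row whose repd_id_str is '1234;5678' (illustrative ids) becomes
** two rows in osm_repd_id_mapping, (osm_id, 1234) and (osm_id, 5678); rows with an
** empty repd_id_str contribute no rows.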
6 | */ 7 | 8 | \echo Breaking out REPD id tags from the OSM dataset 9 | 10 | -- osm_repd_id_mapping(osm_id, repd_id) 11 | 12 | drop table if exists osm_repd_id_mapping; 13 | 14 | select raw.osm.osm_id, x.repd_id into osm_repd_id_mapping 15 | from raw.osm cross join unnest(string_to_array(repd_id_str, ';')) with ordinality as x(repd_id) 16 | order by raw.osm.osm_id, x.repd_id; 17 | 18 | alter table osm_repd_id_mapping 19 | alter column repd_id 20 | set data type int 21 | using cast (repd_id as integer); 22 | 23 | 24 | -------------------------------------------------------------------------------- /db/neighbour-finding/mv-repd.sql: -------------------------------------------------------------------------------- 1 | drop table if exists mv_repd_neighbours; 2 | select closest_pt.repd_id, 3 | closest_pt.co_location_repd_id, 4 | closest_pt.master_repd_id, 5 | machine_vision.install_date as mv_date, 6 | machine_vision.area as mv_area, 7 | closest_pt.distance_meters, 8 | machine_vision.mv_id as mv_id 9 | into mv_repd_neighbours 10 | from machine_vision 11 | CROSS JOIN LATERAL 12 | (SELECT 13 | repd.repd_id, 14 | repd.co_location_repd_id, 15 | repd.master_repd_id, 16 | machine_vision.location::geography <-> repd.location::geography as distance_meters 17 | FROM repd 18 | ORDER BY machine_vision.location::geography <-> repd.location::geography 19 | LIMIT 1) AS closest_pt; 20 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Directory structure 2 | 3 | `as_received` 4 | 5 | : Datasets precisely in the form in which we originally got them, unchanged in 6 | content or format, whether text or binary, open or proprietary. 7 | 8 | `raw` 9 | 10 | : Data that has been manually edited, at least to change the format to one that 11 | we can use programmatically (if required) but also there may a small number of 12 | edits to the data that do not seem to be generalisable, or we can't work out 13 | how to automate, or are not likely to re-occur next time we download the data. 14 | 15 | `processed` 16 | 17 | : Datasets that have been programmatically modified (typically from `raw`). Note 18 | that the `processed` directory is _local_, it is like a “compiled code” 19 | directory: not under version control but neither on the shared drive. 20 | 21 | -------------------------------------------------------------------------------- /db/area-adaptive-threshold.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Area-adaptive threshold function for clustering PV geodata items 3 | ** July 2020 4 | ** Author: Dan Stowell 5 | ** 6 | ** The function returns a distance threshold which is related to the 7 | ** geometry surface areas (sizes) of the two points being compared. 8 | ** 9 | ** If area is unknown, we use a heuristic estimate based on capacity similar to "2 hectares per MW"; 10 | ** The exact ratio is very close to the correlation seen in the paper. 
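** Worked example (illustrative figures): for two items with unknown areas and
** capacities of 5 MW and 1 MW, the larger implied area is 5 * 20000 = 100000 m^2,
** so the returned threshold is 2 * sqrt(100000), roughly 632 m, clamped to the
** range [10, 1500] metres.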
11 | ** 12 | */ 13 | 14 | CREATE FUNCTION area_adaptive_threshold(area1 float, area2 float, capacity1 float, capacity2 float) 15 | RETURNS float AS $$ 16 | BEGIN 17 | 18 | RETURN LEAST(1500, GREATEST(10, 19 | 2 * SQRT(GREATEST( 20 | GREATEST(area1, capacity1 * 20000), 21 | GREATEST(area2, capacity2 * 20000) 22 | )))); 23 | END; $$ 24 | LANGUAGE plpgsql IMMUTABLE LEAKPROOF PARALLEL SAFE; 25 | 26 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-3.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 3 (see doc/matching.md) 2 | \echo -n Performing match rule 3... 3 | 4 | insert into matches 5 | select '3', repd_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_operational.repd_id = repd_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm_repd_neighbours.distance_meters < 700 21 | and osm.objtype != 'node'; 22 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-5.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 5 (see doc/matching.md) 2 | \echo -n Performing match rule 5... 3 | 4 | insert into matches 5 | select '5', repd_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_operational.repd_id = repd_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm.objtype = 'node' 21 | and repd_operational.site_name like '%Scheme%'; 22 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-3a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 3a (see doc/matching.md) 2 | \echo -n Performing match rule 3a... 
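-- As rule 3, but applied to non-operational REPD entries (see data-matching.sql):
-- nearest-neighbour matches to OSM ways/relations within 700 m.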
3 | 4 | insert into matches 5 | select '3a', repd_non_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_non_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_non_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_non_operational.repd_id = repd_non_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_non_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm_repd_neighbours.distance_meters < 700 21 | and osm.objtype != 'node'; 22 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-5a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 5a (see doc/matching.md) 2 | \echo -n Performing match rule 5a... 3 | 4 | insert into matches 5 | select '5a', repd_non_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_non_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_non_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_non_operational.repd_id = repd_non_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_non_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm.objtype = 'node' 21 | and repd_non_operational.site_name like '%Scheme%'; 22 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-4.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 4 (see doc/matching.md) 2 | \echo -n Performing match rule 4... 3 | 4 | insert into matches 5 | select '4', repd_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_operational.repd_id = repd_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm.objtype = 'node' 21 | and repd_operational.site_name like '%Scheme%' 22 | and osm_repd_neighbours.distance_meters < 5000; 23 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-4a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 4a (see doc/matching.md) 2 | \echo -n Performing match rule 4a... 
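-- As rule 4, but applied to non-operational REPD entries: nearest 'Scheme'-named
-- REPD neighbour for OSM nodes, within 5000 m.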
3 | 4 | insert into matches 5 | select '4a', repd_non_operational.master_repd_id, osm.master_osm_id 6 | from osm_repd_neighbours, osm, repd_non_operational 7 | -- Table linking: 8 | where osm_repd_neighbours.osm_id = osm.osm_id 9 | and osm_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_non_operational.repd_id 10 | and osm.osm_id = osm.master_osm_id 11 | and repd_non_operational.repd_id = repd_non_operational.master_repd_id 12 | -- Not matches already found: 13 | and not exists ( 14 | select 15 | from matches 16 | where matches.master_repd_id = repd_non_operational.master_repd_id 17 | or matches.master_osm_id = osm.master_osm_id 18 | ) 19 | -- Matching relevant: 20 | and osm.objtype = 'node' 21 | and repd_non_operational.site_name like '%Scheme%' 22 | and osm_repd_neighbours.distance_meters < 5000; 23 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-25.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 2.5 (see doc/matching.md) 2 | \echo -n Performing match rule 2.5... 3 | 4 | insert into matches 5 | select '25', closest_pt.master_repd_id, osm.master_osm_id 6 | from osm_repd_id_mapping LEFT JOIN osm USING (osm_id) LEFT JOIN repd_operational USING (repd_id) 7 | -- selecting matching repd IDs, and finding their distance: 8 | CROSS JOIN LATERAL 9 | (SELECT 10 | repd.master_repd_id, 11 | osm.location::geography <-> repd.location::geography as distance_meters 12 | FROM repd 13 | WHERE repd.repd_id=osm_repd_id_mapping.repd_id 14 | -- Not matches already found: 15 | AND not exists ( 16 | select 17 | from matches 18 | where matches.master_repd_id = repd_operational.master_repd_id 19 | or matches.master_osm_id = osm.master_osm_id 20 | ) 21 | -- Only the single nearest-neighbour: 22 | ORDER BY osm.location::geography <-> repd.location::geography 23 | LIMIT 1) AS closest_pt; 24 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-25a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 2.5a (see doc/matching.md) 2 | \echo -n Performing match rule 2.5a... 3 | 4 | insert into matches 5 | select '25a', closest_pt.master_repd_id, osm.master_osm_id 6 | from osm_repd_id_mapping LEFT JOIN osm USING (osm_id) LEFT JOIN repd_non_operational USING (repd_id) 7 | -- selecting matching repd IDs, and finding their distance: 8 | CROSS JOIN LATERAL 9 | (SELECT 10 | repd.master_repd_id, 11 | osm.location::geography <-> repd.location::geography as distance_meters 12 | FROM repd 13 | WHERE repd.repd_id=osm_repd_id_mapping.repd_id 14 | -- Not matches already found: 15 | AND not exists ( 16 | select 17 | from matches 18 | where matches.master_repd_id = repd_non_operational.master_repd_id 19 | or matches.master_osm_id = osm.master_osm_id 20 | ) 21 | -- Only the single nearest-neighbour: 22 | ORDER BY osm.location::geography <-> repd.location::geography 23 | LIMIT 1) AS closest_pt; 24 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-1a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 1a (see doc/matching.md) 2 | \echo -n Performing match rule 1a... 
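-- As rule 1, but applied to non-operational REPD entries: accept the REPD id already
-- tagged in OSM where that entry lies within 500 m of the OSM object's nearest REPD
-- neighbour.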
3 | 4 | drop table if exists repd_copy; 5 | select * into repd_copy from repd_non_operational; 6 | insert into matches 7 | select '1a', repd_copy.master_repd_id, osm.master_osm_id 8 | from osm_with_existing_repd_neighbours, osm, repd_non_operational, repd_copy 9 | -- Table linking: 10 | where osm_with_existing_repd_neighbours.osm_id = osm.osm_id 11 | and osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_non_operational.repd_id 12 | and osm_with_existing_repd_neighbours.repd_id_in_osm = repd_copy.repd_id 13 | -- Not matches already found: 14 | and not exists ( 15 | select 16 | from matches 17 | where matches.master_repd_id = repd_non_operational.master_repd_id 18 | or matches.master_osm_id = osm.master_osm_id 19 | ) 20 | -- Matching relevant: 21 | and repd_non_operational.location::geography <-> repd_copy.location::geography < 500; 22 | -------------------------------------------------------------------------------- /db/neighbour-finding/mv-osm.sql: -------------------------------------------------------------------------------- 1 | drop table if exists osm_ways_relations; 2 | select * into osm_ways_relations from osm where osm.objtype != 'node'; 3 | 4 | drop table if exists osm_mv_neighbours; 5 | select closest_pt.osm_id, 6 | closest_pt.tag_start_date as osm_date, 7 | machine_vision.install_date as mv_date, 8 | closest_pt.area as osm_area, 9 | machine_vision.area as mv_area, 10 | closest_pt.distance_meters, 11 | machine_vision.mv_id as mv_id 12 | into osm_mv_neighbours 13 | from machine_vision 14 | CROSS JOIN LATERAL 15 | (SELECT 16 | osm_ways_relations.osm_id, 17 | osm_ways_relations.tag_start_date, 18 | osm_ways_relations.area, 19 | machine_vision.location::geography <-> osm_ways_relations.location::geography as distance_meters 20 | FROM osm_ways_relations 21 | ORDER BY machine_vision.location::geography <-> osm_ways_relations.location::geography 22 | LIMIT 1) AS closest_pt; 23 | 24 | drop table osm_ways_relations; 25 | -------------------------------------------------------------------------------- /db/mv.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Create table containing Machine Vision dataset 3 | */ 4 | 5 | \echo Creating Machine Vision table ... 6 | 7 | drop table if exists machine_vision; 8 | 9 | create table machine_vision ( 10 | mv_id int, 11 | area float, 12 | confidence char(1), 13 | install_date date, -- problem: some don't have proper dates e.g. 
"<2016-06" and some have multiple dates separated by commas 14 | iso_code_short char(2), 15 | iso_code char(6), 16 | attribution varchar(50), 17 | longitude float, 18 | latitude float, 19 | primary key (mv_id) 20 | ); 21 | 22 | \copy machine_vision from '../data/processed/machine_vision.csv' delimiter ',' csv header; 23 | 24 | /* ----------------------------------------------------------------------------- 25 | ** Edit table as necessary 26 | */ 27 | 28 | -- Create geometry columns for geographical comparison/matching 29 | -- NB: Spatial Reference ID 4326 refers to WGS84 30 | 31 | alter table machine_vision 32 | add column location geometry(Point, 4326); 33 | 34 | update machine_vision 35 | set location = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326); 36 | 37 | 38 | -------------------------------------------------------------------------------- /data/raw/machine_vision.geojson: -------------------------------------------------------------------------------- 1 | {"type": "FeatureCollection", "features": [{"type": "Feature", "geometry": {"type": "Polygon", "coordinates": [[[-4.898715334732418, 50.35443341724983], [-4.898697262371517, 50.354495023984384], [-4.898593151707162, 50.354585020048205], [-4.898384858019185, 50.35471130228192], [-4.89819790185845, 50.354781804091964], [-4.898071546583224, 50.35478386702829], [-4.897965117800202, 50.35475862735909], [-4.897809030286477, 50.35465057847919], [-4.897697218033746, 50.35449051503744], [-4.8976718451404055, 50.354383053428755], [-4.897710220354532, 50.35428799656915], [-4.897795887159217, 50.354216473347165], [-4.8980244443516625, 50.35413180492261], [-4.898336992613523, 50.35406630164386], [-4.898375745656702, 50.35404487220399], [-4.898398896214309, 50.35400429274275], [-4.89843125355013, 50.35406253395206], [-4.898571721367052, 50.35410936496741], [-4.8986629634032, 50.35417798947747], [-4.898708533548572, 50.35425982182948], [-4.898715334732418, 50.35443341724983]]]}, "properties": {"area": 4415.68707833541, "confidence": "A", "install_date": "<2016-06", "iso-3166-1": "GB", "iso-3166-2": "GB-CON", "attribution": "University of Oxford & Descartes Labs Inc."}}]} 2 | -------------------------------------------------------------------------------- /db/make-database.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Solar PV database creation and data ingest 3 | ** March 2020 4 | ** Authors: Ed Chalstery and James Geddes 5 | ** 6 | ** Prerequisites: 7 | ** i. A database named "hut23-425" is assumed to exist on the local PostgreSQL server 8 | ** 9 | ** These psql files: 10 | ** 1. Create database tables (deleting them if they exist already) 11 | ** 2. Break out the REPD tags from the OSM dataset 12 | ** 3. Deduplicate certain tables 13 | ** 14 | ** See `doc/database.md` for details 15 | */ 16 | 17 | -- Preliminaries 18 | 19 | \set ON_ERROR_STOP on 20 | alter database "hut23-425" set datestyle to "DMY"; -- to match FIT and REPD data files 21 | create schema if not exists raw; 22 | create extension if not exists postgis; 23 | 24 | \include area-adaptive-threshold.sql 25 | 26 | -- 1. Create tables and load data 27 | 28 | \include osm.sql 29 | \include repd.sql 30 | \include fit.sql 31 | \include mv.sql 32 | 33 | -- 2. Preliminary matching (REPD IDs already present in OSM) 34 | 35 | \include map-osm-repd.sql 36 | 37 | -- 3. Deduplicate 38 | 39 | \include dedup-osm.sql 40 | \include dedup-repd.sql 41 | 42 | -- 4. Find neighbours 43 | 44 | \include neighbour-finding.sql 45 | 46 | -- 5. 
Create matching table 47 | 48 | \include data-matching.sql 49 | -------------------------------------------------------------------------------- /db/neighbour-finding/osm-repd.sql: -------------------------------------------------------------------------------- 1 | -- Find the nearest neighbouring REPD object to each OSM object 2 | drop table if exists osm_repd_neighbours; 3 | select osm.osm_id, 4 | closest_pt.repd_id as closest_geo_match_from_repd_repd_id, 5 | closest_pt.co_location_repd_id as closest_geo_match_from_repd_co_location_repd_id, 6 | closest_pt.distance_meters 7 | into osm_repd_neighbours 8 | from osm 9 | CROSS JOIN LATERAL 10 | (SELECT 11 | repd.repd_id, 12 | repd.co_location_repd_id, 13 | osm.location::geography <-> repd.location::geography as distance_meters 14 | FROM repd 15 | ORDER BY osm.location::geography <-> repd.location::geography 16 | LIMIT 1) AS closest_pt; 17 | 18 | -- Create a table that compares the neighbour REPD ID to those that were found 19 | -- in the OSM data (where present) 20 | drop table if exists osm_with_existing_repd_neighbours; 21 | select osm_repd_neighbours.osm_id, 22 | osm_repd_id_mapping.repd_id as repd_id_in_osm, 23 | osm_repd_neighbours.closest_geo_match_from_repd_repd_id, 24 | osm_repd_neighbours.closest_geo_match_from_repd_co_location_repd_id, 25 | osm_repd_neighbours.distance_meters 26 | into osm_with_existing_repd_neighbours 27 | from osm_repd_neighbours, osm_repd_id_mapping 28 | where osm_repd_neighbours.osm_id = osm_repd_id_mapping.osm_id; 29 | -------------------------------------------------------------------------------- /db/data-matching.sql: -------------------------------------------------------------------------------- 1 | 2 | \echo -n Finding matches across datasets ... 3 | 4 | drop table if exists matches; 5 | create table matches ( 6 | match_rule varchar(3), 7 | master_repd_id integer, 8 | master_osm_id bigint, 9 | mv_id integer, 10 | fit_id integer 11 | ); 12 | 13 | -- OSM-REPD matching for operational installations 14 | drop table if exists repd_operational; 15 | select * into repd_operational from repd where dev_status = 'Operational'; 16 | 17 | \include data-matching-rules/rule-1.sql 18 | \include data-matching-rules/rule-2.sql 19 | \include data-matching-rules/rule-25.sql 20 | \include data-matching-rules/rule-3.sql 21 | \include data-matching-rules/rule-4.sql 22 | \include data-matching-rules/rule-5.sql 23 | 24 | -- OSM-REPD matching for non-operational installations 25 | drop table if exists repd_non_operational; 26 | select * into repd_non_operational from repd where dev_status != 'Operational'; 27 | 28 | \include data-matching-rules/rule-1a.sql 29 | \include data-matching-rules/rule-2a.sql 30 | \include data-matching-rules/rule-25a.sql 31 | \include data-matching-rules/rule-3a.sql 32 | \include data-matching-rules/rule-4a.sql 33 | \include data-matching-rules/rule-5a.sql 34 | 35 | -- REPD-MV matching 36 | \include data-matching-rules/rule-6.sql 37 | 38 | -- OSM-MV matching 39 | \include data-matching-rules/rule-7.sql 40 | -------------------------------------------------------------------------------- /data/processed/pre-process-mv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ### Create a CSV for easy import to PostgreSQL 3 | ### Get the centre of the polygons as point coordinates for the machine vision objects 4 | ### Remove the string "<2016-06" from dates as this can't be loaded as a date in PostgreSQL 5 | ### Reads from stdin, writes to stdout 
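# Output: one row per machine-vision polygon, carrying the original properties plus
# centroid x/y columns; the polygon geometry itself is dropped before writing the CSV.
# (Note: the GeoJSON path below is currently hard-coded rather than read from stdin,
# so the stdin redirection in the Makefile is effectively unused.)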
6 | 7 | import geopandas as gpd 8 | import sys 9 | 10 | machine_vision = "../raw/machine_vision.geojson" 11 | machine_vision_df = gpd.read_file(machine_vision) 12 | 13 | def getXY(pt): 14 | return (pt.x, pt.y) 15 | centroidseries = machine_vision_df['geometry'].centroid 16 | centroidlist = map(getXY, centroidseries) 17 | machine_vision_df['x'], machine_vision_df['y'] = [list(t) for t in zip(*map(getXY, centroidseries))] 18 | 19 | def remove_bad_date(dt_str): 20 | """Remove the string representing any date before 2016-06 and also only take 21 | the first of those with 2 dates; take the earlier date as correct for 22 | install date""" 23 | return(dt_str.replace("<2016-06", "").split(",")[0]) 24 | machine_vision_df['install_date'] = list(map(remove_bad_date, machine_vision_df['install_date'])) 25 | 26 | # TODO: if we need this in PostgreSQL/PostGIS, figure out the correct data type for table column 27 | machine_vision_df = machine_vision_df.drop(['geometry'], axis=1) 28 | 29 | mv_csv_str = machine_vision_df.to_csv(index=True) 30 | 31 | sys.stdout.write(mv_csv_str) 32 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-2.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 2 (see doc/matching.md) 2 | \echo -n Performing match rule 2... 3 | 4 | insert into matches 5 | select '2', repd_operational.master_repd_id, osm.master_osm_id 6 | from osm_with_existing_repd_neighbours, osm, repd_operational, repd_copy 7 | -- Table linking: 8 | where osm_with_existing_repd_neighbours.osm_id = osm.osm_id 9 | and osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_operational.repd_id 10 | and osm_with_existing_repd_neighbours.repd_id_in_osm = repd_copy.repd_id 11 | -- Not matches already found: 12 | and not exists ( 13 | select 14 | from matches 15 | where matches.master_repd_id = repd_operational.master_repd_id 16 | or matches.master_osm_id = osm.master_osm_id 17 | ) 18 | -- Matching relevant: 19 | and (osm_with_existing_repd_neighbours.repd_id_in_osm = osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id 20 | or osm_with_existing_repd_neighbours.repd_id_in_osm = osm_with_existing_repd_neighbours.closest_geo_match_from_repd_co_location_repd_id 21 | or osm_with_existing_repd_neighbours.repd_id_in_osm = repd_operational.master_repd_id 22 | or osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_copy.co_location_repd_id 23 | or osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_copy.master_repd_id 24 | ); 25 | -------------------------------------------------------------------------------- /db/data-matching-rules/rule-2a.sql: -------------------------------------------------------------------------------- 1 | -- Match rule 2a (see doc/matching.md) 2 | \echo -n Performing match rule 2a... 
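-- As rule 2, but applied to non-operational REPD entries: accept where the REPD id
-- tagged in OSM agrees with the nearest REPD neighbour (or its co-location or master
-- id), with no distance threshold.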
3 | 4 | insert into matches 5 | select '2a', repd_non_operational.master_repd_id, osm.master_osm_id 6 | from osm_with_existing_repd_neighbours, osm, repd_non_operational, repd_copy 7 | -- Table linking: 8 | where osm_with_existing_repd_neighbours.osm_id = osm.osm_id 9 | and osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_non_operational.repd_id 10 | and osm_with_existing_repd_neighbours.repd_id_in_osm = repd_copy.repd_id 11 | -- Not matches already found: 12 | and not exists ( 13 | select 14 | from matches 15 | where matches.master_repd_id = repd_non_operational.master_repd_id 16 | or matches.master_osm_id = osm.master_osm_id 17 | ) 18 | -- Matching relevant: 19 | and (osm_with_existing_repd_neighbours.repd_id_in_osm = osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id 20 | or osm_with_existing_repd_neighbours.repd_id_in_osm = osm_with_existing_repd_neighbours.closest_geo_match_from_repd_co_location_repd_id 21 | or osm_with_existing_repd_neighbours.repd_id_in_osm = repd_non_operational.master_repd_id 22 | or osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_copy.co_location_repd_id 23 | or osm_with_existing_repd_neighbours.closest_geo_match_from_repd_repd_id = repd_copy.master_repd_id 24 | ); 25 | -------------------------------------------------------------------------------- /doc/preprocessing.md: -------------------------------------------------------------------------------- 1 | Manual pre-processing of datasets 2 | =========== 3 | 4 | These are the manual data transformations made to get the files in `data/raw` from `data/as_received`. 5 | 6 | FiT 7 | ---- 8 | 9 | No transformation needed -- running `make` in the `data/raw` folder should be sufficient to combine the 3 Excel spreadsheets into one and convert to CSV, yielding `fit.csv`. 10 | 11 | OSM (csv) 12 | ---- 13 | 14 | Manual edits are sometimes needed to fix typos in the uncontrolled OSM source data. We have fixed many of them, and also caught them in the preprocessing of `compile_osm_solar.py`, but there could be others in future OSM data releases. Here are examples of edits we made: 15 | 16 | 1. Example edits: 17 | 18 | - Changed "14Synthetic"... to "14" for id=7784835486 in generator:solar:modules column. 19 | - Changed "8node 0" to "8" for id=7772459006 in generator:solar:modules column 20 | - Changed "36node 0" to "36" for id=7772459035 in generator:solar:modules column 21 | - Changed "17generator:method" to "17" for id=7791014395 in generator:solar:modules column 22 | - For object id=835531116, move "SE" from wrong column generator:solar:modules to orientation 23 | 24 | 2. Save as `data/raw/osm.csv` 25 | 26 | REPD 27 | ---- 28 | 29 | 1. Delete one unusual date/time cell: "00/01/1900" 30 | 2. Save as `data/raw/repd.csv`. 31 | 32 | Machine vision dataset 33 | ----- 34 | 35 | No changes. Save as `data/raw/machine_vision.geojson`. 36 | -------------------------------------------------------------------------------- /db/fit.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Create table containing fit dataset 3 | */ 4 | 5 | \echo Creating fit table ...
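-- Loads ../data/processed/fit.csv (produced by data/processed/pre-process-fit.py) into
-- raw.fit, then keeps only 'Photovoltaic' rows in the final fit table.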
6 | 7 | drop table if exists raw.fit; 8 | drop table if exists fit; 9 | 10 | create table raw.fit ( 11 | fit_id int, 12 | extension char(1), 13 | postcode_stub varchar(7), 14 | technology varchar(24), 15 | installed_capacity float, 16 | declared_net_capacity float, 17 | application_date date, 18 | commissioning_date date, 19 | mcs_issue_date date, 20 | export_status varchar(30), 21 | tariff_code varchar(20), 22 | tariff_description varchar(100), 23 | installation_type varchar(25), 24 | country varchar(15), 25 | local_authority varchar(40), 26 | govt_office_region varchar(40), 27 | constituency varchar(60), 28 | accreditation_route varchar(6), 29 | mpan_prefix float, 30 | comm_school varchar(40), 31 | llsoa_code varchar(20) 32 | ); 33 | 34 | \copy raw.fit from '../data/processed/fit.csv' delimiter ',' csv header; 35 | 36 | -- Restrict raw.fit to Solar PV only 37 | select * into fit 38 | from raw.fit 39 | where technology = 'Photovoltaic'; 40 | 41 | -- Add roughly calculated area for raw.fit entries to aid matching. 42 | -- Solar PV has power approx. 52W / m^2 43 | alter table fit 44 | add column "area" float; 45 | 46 | update fit 47 | set area = declared_net_capacity/52; 48 | -------------------------------------------------------------------------------- /data/raw/Makefile: -------------------------------------------------------------------------------- 1 | # Preprocess OSM extract into filtered subsets; and compile FiT Excels into csv. 2 | 3 | all: osm-gb-solaronly.osm.pbf osm-gb-solaronly.xml osm-gb-solaronly.geojson osm.csv fit.csv repd.csv 4 | 5 | 6 | # basic OSM solarfiltering -- reduces 1.5 GB to approx 2 MB 7 | osm-gb-solaronly.osm.pbf: ../as_received/great-britain-latest.osm.pbf 8 | osmium tags-filter $< generator:method=photovoltaic plant:method=photovoltaic plant:source=solar -o $@ 9 | 10 | # format-shifting OSM->XML 11 | osm-gb-solaronly.xml: osm-gb-solaronly.osm.pbf 12 | osmium cat $< -o $@ 13 | 14 | # format-shifting OSM->GeoJSON requires each "layer" to be exported 15 | osm-gb-solaronly.geojson: osm-gb-solaronly.osm.pbf 16 | ogr2ogr -f GeoJSON -overwrite -addfields $@ $< points -nln merged 17 | ogr2ogr -f GeoJSON -update -append -addfields $@ $< lines -nln merged 18 | ogr2ogr -f GeoJSON -update -append -addfields $@ $< multilinestrings -nln merged 19 | ogr2ogr -f GeoJSON -update -append -addfields $@ $< multipolygons -nln merged 20 | ogr2ogr -f GeoJSON -update -append -addfields $@ $< other_relations -nln merged 21 | 22 | osm.csv: osm-gb-solaronly.xml 23 | python3 compile_osm_solar.py 24 | 25 | fit.csv: ../as_received/installation_report_apr2020_part_1.xlsx 26 | python3 convert_fit_excel_to_csv.py 27 | 28 | repd.csv: ../as_received/renewable-energy-planning-database-march-2020.csv 29 | cat $< | sed -e "s|00/01/1900||g" > $@ 30 | 31 | clean: 32 | rm osm.csv fit.csv osm-gb-solaronly.osm.pbf osm-gb-solaronly.xml osm-gb-solaronly.geojson 33 | 34 | -------------------------------------------------------------------------------- /data/raw/convert_fit_excel_to_csv.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import openpyxl 3 | from openpyxl import load_workbook 4 | 5 | # tested using openpyxl 3.0.3 on python 3.7.5, ubuntu 18.04 6 | 7 | # TODO this script should be merged in with data/processed/pre-process-fit.py 8 | 9 | headercols = None 10 | rowswritten = 0 11 | 12 | def formatcell(item): 13 | "Appropriate conversions for CSV output" 14 | if item.value is None: 15 | return "" 16 | return str(item.internal_value) 17 | 18 | 
with open("fit.csv", "wt") as outfp: 19 | for infname in sorted(glob.glob("../as_received/installation_report_*_part_*.xlsx")): 20 | print(infname) 21 | wb = load_workbook(filename=infname, read_only=True) 22 | 23 | gotheader = False 24 | for whichrow, row in enumerate(wb.active.iter_rows()): 25 | at_repeated_header = False 26 | try: 27 | rowstrs = [formatcell(item) for item in row] 28 | except AttributeError as err: 29 | print(err) 30 | print(" skipping row %i (merged cell?)" % whichrow) 31 | continue 32 | for item in rowstrs: 33 | assert("," not in item) 34 | # Here we're detecting hitting the header in this XLS: 35 | if rowstrs[0] and (rowstrs[0].startswith("Extension")): 36 | gotheader = True 37 | #print(headercols) 38 | if headercols: 39 | assert(headercols==rowstrs) 40 | at_repeated_header = True 41 | else: 42 | headercols = rowstrs 43 | if gotheader and not at_repeated_header: 44 | #if whichrow < 20: 45 | # print(rowstrs[0]) 46 | outfp.write(",".join(rowstrs) + "\n") 47 | rowswritten += 1 48 | del wb 49 | 50 | print("Written %i rows" % rowswritten) 51 | assert(rowswritten > 10000) 52 | assert(gotheader) 53 | 54 | -------------------------------------------------------------------------------- /data/processed/pre-process-fit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ### Process the FiT data to add an index column 3 | ### Reads from stdin, writes to stdout 4 | 5 | import sys 6 | import pandas as pd 7 | import numpy as np 8 | 9 | fit_df = pd.read_csv(sys.stdin) 10 | 11 | # Check the file has the columns we expect and order them as we expect 12 | # If the columns don't exist, make the column empty 13 | output_df = pd.DataFrame() 14 | required_columns = ['Extension (Y/N)', 15 | 'PostCode', 16 | 'Technology', 17 | 'Installed capacity', 18 | 'Declared net capacity', 19 | 'Application date', 20 | 'Commissioning date', 21 | 'MCS issue date', 22 | 'Export status', 23 | 'TariffCode', 24 | 'Tariff Description', 25 | 'Installation Type', 26 | 'Installation Country', 27 | 'Local Authority', 28 | 'Government Office Region', 29 | 'Constituency', 30 | 'Accreditation Route', 31 | 'MPAN Prefix', 32 | 'Community school category', 33 | 'LLSOA Code'] 34 | 35 | for col in required_columns: 36 | try: 37 | output_df[col] = fit_df[col] 38 | except KeyError: 39 | output_df[col] = np.nan 40 | 41 | # Also at this point reduce to PV only (reduces data volumes) 42 | output_df = output_df[output_df['Technology']=='Photovoltaic'] 43 | 44 | # Add index column 45 | fit_csv_str = output_df.to_csv(index=True) 46 | 47 | sys.stdout.write(fit_csv_str) 48 | -------------------------------------------------------------------------------- /db/osm.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Create table containing OSM database 3 | */ 4 | 5 | \echo Creating OSM table ... 
6 | 7 | drop table if exists raw.osm; 8 | 9 | create table raw.osm ( 10 | objtype varchar(8), 11 | osm_id bigint, 12 | username varchar(60), 13 | time_created date, 14 | latitude float, 15 | longitude float, 16 | area float, 17 | capacity float, 18 | modules float, 19 | located varchar(20), 20 | orientation float, 21 | plantref varchar(20), 22 | source_capacity varchar(255), 23 | source_obj varchar(255), 24 | tag_power varchar(15), 25 | repd_id_str varchar(20), 26 | tag_start_date date, 27 | primary key (osm_id) 28 | ); 29 | 30 | \copy raw.osm from '../data/processed/osm.csv' delimiter ',' csv header; 31 | 32 | /* ----------------------------------------------------------------------------- 33 | ** Edit table as necessary 34 | */ 35 | 36 | -- coerce some float columns to int, since float res is excessive (but sometimes present in the input) 37 | 38 | alter table raw.osm 39 | alter column modules 40 | type bigint using round(modules)::bigint; 41 | alter table raw.osm 42 | alter column orientation 43 | type integer using round(orientation)::integer; 44 | 45 | -- our input is using kW, convert to MW for coherence with use elsewhere in the db 46 | update raw.osm set capacity = 0.001 * capacity; 47 | 48 | -- Create geometry columns for geographical comparison/matching 49 | -- NB: Spatial Reference ID 4326 refers to WGS84 50 | 51 | alter table raw.osm 52 | add column location geometry(Point, 4326); 53 | 54 | update raw.osm 55 | set location = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326); 56 | -------------------------------------------------------------------------------- /data/processed/pre-process-osm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ### Process the OSM data to fix date formatting 3 | ### Reads from stdin, writes to stdout 4 | 5 | import sys 6 | import pandas as pd 7 | from dateutil.parser import parse 8 | 9 | osm_df = pd.read_csv(sys.stdin) 10 | 11 | # Check the file has the columns we expect and order them as we expect 12 | # If the columns don't exist, make the column empty 13 | output_df = pd.DataFrame() 14 | required_columns = ['objtype', 15 | 'id', 16 | 'user', 17 | 'timestamp', 18 | 'lat', 19 | 'lon', 20 | 'calc_area', 21 | 'calc_capacity', 22 | 'generator:solar:modules', 23 | 'location', 24 | 'orientation', 25 | 'plantref', 26 | 'source_capacity', 27 | 'source_obj', 28 | 'tag_power', 29 | 'tag_repd:id', 30 | 'tag_start_date'] 31 | 32 | for col in required_columns: 33 | try: 34 | output_df[col] = osm_df[col] 35 | except KeyError: 36 | output_df[col] = np.nan 37 | 38 | # Edit tagged date column with pandas 39 | dates = [] 40 | before_date_strs = ['before '] 41 | after_date_strs = ['.'] 42 | mistakes = [('-00', '-01')] 43 | for date in output_df['tag_start_date']: 44 | if pd.notna(date): 45 | og_date = date 46 | date = str(date) 47 | for string in before_date_strs: 48 | date = date.split(string)[-1] 49 | for string in after_date_strs: 50 | date = date.split(string)[0] 51 | for mistake, correction in mistakes: 52 | date = date.replace(mistake, correction) 53 | dates.append(parse(date, ignoretz=True, default=parse('2020-01-01'))) 54 | else: 55 | dates.append(None) 56 | output_df['tag_start_date'] = dates 57 | 58 | osm_csv_str = output_df.to_csv(index=False) 59 | 60 | sys.stdout.write(osm_csv_str) 61 | -------------------------------------------------------------------------------- /doc/matching.md: -------------------------------------------------------------------------------- 1 | % Matching 2 | 3 | 
This note documents the matching of entities between the various datasets in the 4 | `hut23-425` database. 5 | 6 | 7 | # Match Rules: 8 | 9 | ## OSM-REPD 10 | 11 | First, proximity match to get the nearest neighbouring REPD for every single OSM object, 12 | 13 | 1. The master REPD id of the REPD id in OSM, where this is < 500m from the nearest neighbour REPD object to an OSM object 14 | 2. The nearest neighbour in meters of OSM entries to REPD entries where the REPD ID is the same as that already present in OSM data. Accept those that are the same (regardless of distance). This should catch any that aren't covered by rule 0. (Implementation note: added rule 2.5 which comes after rule 2, implementing the same idea with a different proximity search. It re-runs the nearest neighbour search but only for matching REPD IDs rather than any REPD ID.) 15 | 3. Nearest neighbour for remaining OSM/REPD where the OSM `objtype` is `way` or `relation` and the object is a "group master" (the `osm_id = master_osm_id` or `repd_id = master_repd_id`). Accept all matches that are below a distance threshold of 700 meters. 16 | 4. Nearest REPD neighbour for OSM nodes, where REPD has "Scheme" in the title, with a distance threshold of 5KM. 17 | 5. Repeat 4, with no distance threshold 18 | 19 | Repeat matching rules 1-5 for non-operational REPD entries. Label these match rules 1a, 2a etc. 20 | 21 | ## Machine Vision dataset 22 | 23 | First, proximity match to get the nearest neighbouring OSM way/relation for every single Machine Vision object and separately, the nearest REPD for each Machine Vision Object. 24 | 25 | 6. Nearest neighbour REPD to each Machine Vision object, below a 1000m distance threshold. 26 | 7. Nearest neighbour OSM way/relation to each Machine Vision object, below a 1000m distance threshold. 27 | 28 | # Matches 29 | 30 | | Rule | Count | Total | 31 | | --- | --- | --- | 32 | | 1 | 825 | 825 | 33 | | 2 | 0 | 825 | 34 | | 3 | 88 | 913 | 35 | | 4 | 1,192| 2,105| 36 | | 5 | 68 | 2,173| 37 | | 1a | 2 | 2,175| 38 | | 2a | 0 | 2,175| 39 | | 3a | 46 | 2,221| 40 | | 4a | 893 | 3,114| 41 | | 5a | 0 | 3,114| 42 | | ---- | ----- | ----- | 43 | | 6 | 1,759| | 44 | | 7 | 1,851| | 45 | 46 | # Notes 47 | 48 | Doesn't look like any of the nodes where nearest REPD is not a scheme are genuine matches. Many matches to "Goldthorpe" REPD, which made me think this could be a scheme. Unlikely though, because the operator is Aldi. 49 | 50 | Rule 4/5 are intended to distinguish between those OSM nodes that are very likely to be part of a scheme and those that are slightly less likely. 51 | -------------------------------------------------------------------------------- /db/dedup-repd.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Deduplicate the REPD dataset. 3 | ** 4 | ** Identify individual entries as being the same solar farm if: 5 | ** 1. They are within a certain disance; and 6 | ** 2. They have a "similar" name 7 | ** 8 | ** Names are similar if their reduced forms are close by trigram matching. 9 | ** The reduced form of a name is the name with certain common words (like "Farm") removed. 10 | ** 11 | */ 12 | 13 | create extension if not exists pg_trgm; -- trigram matching 14 | 15 | \echo -n Deduplicating REPD dataset ... 16 | 17 | -- PARAMETERS: 18 | -- 19 | -- cluster_distance is the distance (in metres) within which we count 20 | -- two objects as potentially being part of the same cluster. 
21 | -- 22 | -- name_distance is the threshold (in trigram matching) for counting two site 23 | -- names as potentially representing the same site. 24 | 25 | \set cluster_distance 1380 26 | \set identical_cluster_distance 5 27 | \set name_distance 0.2 28 | 29 | 30 | create temporary view repd_parts(repd_id1, repd_id2) as 31 | with temp(repd_id, location, capacity, reduced_site_name) as ( 32 | -- Within `site_name` remove the following strings: 33 | -- solar, Solar, park, Park, farm, Farm, resubmission, (resubmission), (Resubmission), 34 | -- extension, Extension, () 35 | -- also remove ' - ' and reduce two consecutive spaces to one. 36 | select repd_id, 37 | location, 38 | capacity, 39 | regexp_replace( 40 | regexp_replace( 41 | site_name, 42 | 'solar|Solar|park|Park|farm|Farm|\(resubmission\)|\(Resubmission\)|resubmission|Resubmission|extension|Extension|\(\)', '', 'g'), 43 | ' +', ' ', 'g') as reduced_site_name 44 | from repd 45 | ) 46 | select x.repd_id as repd_id1, 47 | y.repd_id as repd_id2 48 | from temp as x cross join temp as y 49 | -- two sites are the same if they are close and have similar names; or if they 50 | -- are so close as to be clearly the same. 51 | where x.repd_id != y.repd_id 52 | and ((x.location::geography <-> y.location::geography < 53 | area_adaptive_threshold(NULL, NULL, x.capacity, y.capacity) 54 | and similarity(x.reduced_site_name, y.reduced_site_name) >= :name_distance) 55 | or x.location::geography <-> y.location::geography < :identical_cluster_distance); 56 | 57 | 58 | -- repd_clusters(repd_id1, repd_id2) 59 | -- Objects that can be reached through a chain of connections 60 | 61 | create temporary view repd_clusters as 62 | with recursive repd_clusters(repd_id1, repd_id2) as ( 63 | select repd_id1, repd_id2 64 | from repd_parts 65 | union 66 | select repd_clusters.repd_id1 as repd_id1, repd_parts.repd_id2 as repd_id2 67 | from repd_clusters cross join repd_parts 68 | where repd_clusters.repd_id2 = repd_parts.repd_id1 69 | ) 70 | select repd_id1, repd_id2 FROM repd_clusters; 71 | 72 | 73 | -- repd_dedup(repd_id, master_repd_id) 74 | -- master_repd_id is the largest repd_id over all objects within the 75 | -- same cluster 76 | 77 | create temporary view repd_dedup as 78 | select repd_id1 as repd_id, max(repd_id2) as master_repd_id 79 | from repd_clusters 80 | group by repd_id1; 81 | 82 | /* 83 | ** Merge the new groupings into the repd table 84 | */ 85 | 86 | alter table repd 87 | add column master_repd_id integer; -- default is NULL 88 | 89 | \echo clustering ... 90 | 91 | update repd 92 | set (master_repd_id) = 93 | (select master_repd_id 94 | from repd_dedup 95 | where repd_dedup.repd_id = repd.repd_id) 96 | where master_repd_id is null; 97 | 98 | -- Add master id identical to id for all singletons, to aid matching 99 | update repd 100 | set master_repd_id = repd_id 101 | where master_repd_id is null; 102 | -------------------------------------------------------------------------------- /db/repd.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Create table containing REPD dataset 3 | */ 4 | 5 | \echo Creating REPD table ... 
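-- Loads ../data/processed/repd.csv (produced by data/processed/pre-process-repd.py) into
-- raw.repd, then keeps only 'Solar Photovoltaics' rows in the final repd table.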
6 | 7 | drop table if exists raw.repd; 8 | drop table if exists repd cascade; 9 | 10 | create table raw.repd ( 11 | old_repd_id varchar(15), 12 | repd_id integer, 13 | record_last_updated date, 14 | operator varchar(100), 15 | site_name varchar(100), 16 | tech_type varchar(40), 17 | storage_type varchar(40), 18 | co_location_repd_id float, -- can't make this int as csv contains NaN, but this is fixed below 19 | capacity varchar(8), 20 | chp_enabled varchar(3), 21 | ro_banding varchar(10), 22 | fit_tariff float, 23 | cfd_capacity varchar(10), 24 | turbine_capacity varchar(10), 25 | num_turbines varchar(10), 26 | height_turbines varchar(10), 27 | mounting_type varchar(10), 28 | dev_status varchar(40), 29 | dev_status_short varchar(30), 30 | address varchar(300), 31 | county varchar(30), 32 | region varchar(20), 33 | country varchar(20), 34 | postcode varchar(15), 35 | x float, 36 | y float, 37 | planning_authority varchar(70), 38 | planning_application_reference varchar(50), 39 | appeal_reference varchar(50), 40 | sec_state_ref varchar(50), 41 | type_sec_state_intervention varchar(20), 42 | judicial_review float, 43 | offshore_wind_round varchar(10), 44 | planning_application_submitted date, 45 | planning_application_withdrawn date, 46 | planning_permission_refused date, 47 | appeal_lodged date, 48 | appeal_withdrawn date, 49 | appeal_refused date, 50 | appeal_granted date, 51 | planning_permission_granted date, 52 | sec_state_intervened date, 53 | sec_state_refused date, 54 | sec_state_granted date, 55 | planning_permission_expired date, 56 | under_construction date, 57 | operational date, 58 | latitude float, 59 | longitude float, 60 | primary key (repd_id) 61 | ); 62 | 63 | \copy raw.repd from '../data/processed/repd.csv' delimiter ',' csv header; 64 | 65 | 66 | /* ----------------------------------------------------------------------------- 67 | ** Edit table as necessary 68 | */ 69 | 70 | -- Change co_location_repd_id to integer 71 | alter table raw.repd 72 | alter column co_location_repd_id 73 | set data type int 74 | using cast (co_location_repd_id as integer); 75 | 76 | alter table raw.repd 77 | alter column capacity 78 | type float using (capacity)::float; 79 | 80 | -- Create geometry columns for geographical comparison/matching 81 | -- NB: Spatial Reference ID 4326 refers to WGS84 82 | 83 | alter table raw.repd 84 | add column location geometry(Point, 4326); 85 | 86 | update raw.repd 87 | set location = ST_SetSRID(ST_MakePoint(longitude, latitude), 4326); 88 | 89 | /* ----------------------------------------------------------------------------- 90 | ** Generate final table 91 | */ 92 | 93 | -- Restrict REPD to operational Solar PV only 94 | select * into repd 95 | from raw.repd 96 | where tech_type = 'Solar Photovoltaics'; 97 | 98 | alter table repd 99 | drop column tech_type; 100 | -------------------------------------------------------------------------------- /db/dedup-osm.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Deduplicate the OSM dataset and add field `master_osm_id` 3 | ** 4 | ** Some OSM rows represent geographical entities that are in reality component parts 5 | ** of a single installation (often a solar "farm"). 6 | ** This script adds a field `master_osm_id` that contains a unique `osm_id` for all 7 | ** members of a single group. 8 | */ 9 | 10 | \echo -n Deduplicating OSM dataset ... 
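-- Illustrative example: if ways A and B lie within the area-adaptive threshold of each
-- other, and B and C do too, the recursive closure below puts A, B and C in one cluster,
-- and all three receive master_osm_id = max(A, B, C).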
11 | 12 | -- PARAMETERS: cluster_distance is the distance (in metres) within which we count 13 | -- two objects as certainly being part of the same cluster. 14 | 15 | \set cluster_distance 300 16 | 17 | drop table if exists osm cascade; 18 | drop table if exists osm_dedup; 19 | 20 | -- The field plantref, if not null, contains a value of the form 'way/123456789'. 21 | -- We extract the part after the "/", which corresponds to another osm_id. 22 | 23 | select objtype, 24 | osm_id, 25 | username, 26 | time_created, 27 | latitude, 28 | longitude, 29 | area, 30 | capacity, 31 | modules, 32 | located, 33 | orientation, 34 | cast(split_part(plantref, '/', 2) as bigint) as master_osm_id, 35 | source_capacity, 36 | source_obj, 37 | tag_power, 38 | repd_id_str, 39 | tag_start_date, 40 | location 41 | into osm 42 | from raw.osm; 43 | 44 | /* 45 | ** Deduplicate objects that are part of the same farm 46 | ** 47 | ** 1. Find groups of objects within 300m of each other; 48 | ** 2. Call these "the same" and close over this equivalence relation 49 | ** 3. Choose one osm_id from each equivalence class 50 | */ 51 | 52 | \echo clustering ... 53 | 54 | -- osm_parts(osm_id1, osm_id2) 55 | -- All pairs of objects that are within 300m of each other 56 | 57 | create temporary view osm_parts as 58 | with maybe_dupes(osm_id, location, area, capacity) as ( 59 | -- ignore nodes, (various misspellings of) rooftop things, 60 | -- and cases where there is already a master_osm_id. 61 | -- NB. "X is not true" is true if X is false or X is null 62 | select osm_id, location, area, capacity from osm 63 | where 64 | objtype != 'node' 65 | and (located in ('roof', 'rood', 'roofq', 'rof', 'roofs')) is not true 66 | and master_osm_id is null 67 | ) 68 | -- find objects within appropriate distance of each other 69 | select md1.osm_id as osm_id1, md2.osm_id as osm_id2 70 | from maybe_dupes as md1, maybe_dupes as md2 71 | where md1.osm_id != md2.osm_id 72 | and md1.location::geography <-> md2.location::geography < 73 | area_adaptive_threshold(md1.area, md2.area, md1.capacity, md2.capacity); 74 | 75 | -- osm_clusters(osm_id1, osm_id2) 76 | -- Objects that can be reached through a chain of connections 77 | 78 | create temporary view osm_clusters as 79 | with recursive osm_clusters(osm_id1, osm_id2) as ( 80 | select osm_id1, osm_id2 81 | from osm_parts 82 | union 83 | select osm_clusters.osm_id1 as osm_id1, osm_parts.osm_id2 as osm_id2 84 | from osm_clusters cross join osm_parts 85 | where osm_clusters.osm_id2 = osm_parts.osm_id1 86 | ) 87 | select osm_id1, osm_id2 FROM osm_clusters; 88 | 89 | -- osm_dedup(osm_id, master_osm_id) 90 | -- master_osm_id is the largest osm_id over all objects within the 91 | -- same cluster 92 | 93 | select osm_id1 as osm_id, max(osm_id2) as master_osm_id 94 | into osm_dedup 95 | from osm_clusters 96 | group by osm_id1; 97 | 98 | /* 99 | ** Merge the new groupings into the osm table 100 | */ 101 | 102 | update osm 103 | set (master_osm_id) = 104 | (select master_osm_id 105 | from osm_dedup 106 | where osm_dedup.osm_id = osm.osm_id) 107 | where master_osm_id is null; 108 | 109 | -- Add master id identical to id for all singletons, to aid matching 110 | update osm 111 | set master_osm_id = osm_id 112 | where master_osm_id is null; 113 | -------------------------------------------------------------------------------- /data/processed/pre-process-repd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ### Process the REPD data to remove oddities 
3 | ### Reads from stdin, writes to stdout 4 | 5 | from bng_to_latlon import OSGB36toWGS84 as convert 6 | import sys 7 | import pandas as pd 8 | import re 9 | 10 | # Remove "carriage returns" and the dagger symbol 11 | def clean_repd_csv(csv_str): 12 | csv_str = csv_str.replace("\r", "").replace("†", "") 13 | rgx_list = ['\s*\s*"'] 14 | for rgx_match in rgx_list: 15 | csv_str = re.sub(rgx_match, '"', csv_str) 16 | return csv_str 17 | 18 | sys.stdin.reconfigure(encoding='iso-8859-1') 19 | repd_df = pd.read_csv(sys.stdin, skiprows=1) 20 | 21 | # Check the file has the columns we expect and order them as we expect 22 | # If the columns don't exist, make the column empty 23 | output_df = pd.DataFrame() 24 | required_columns = ['Old Ref ID', 25 | 'Ref ID', 26 | 'Record Last Updated (dd/mm/yyyy)', 27 | 'Operator (or Applicant)', 28 | 'Site Name', 29 | 'Technology Type', 30 | 'Storage Type', 31 | 'Storage Co-location REPD Ref ID', 32 | 'Installed Capacity (MWelec)', 33 | 'CHP Enabled', 34 | 'RO Banding (ROC/MWh)', 35 | 'FiT Tariff (p/kWh)', 36 | 'CfD Capacity (MW)', 37 | 'Turbine Capacity (MW)', 38 | 'No. of Turbines', 39 | 'Height of Turbines (m)', 40 | 'Mounting Type for Solar', 41 | 'Development Status', 42 | 'Development Status (short)', 43 | 'Address', 44 | 'County', 45 | 'Region', 46 | 'Country', 47 | 'Post Code', 48 | 'X-coordinate', 49 | 'Y-coordinate', 50 | 'Planning Authority', 51 | 'Planning Application Reference', 52 | 'Appeal Reference', 53 | 'Secretary of State Reference', 54 | 'Type of Secretary of State Intervention', 55 | 'Judicial Review', 56 | 'Offshore Wind Round', 57 | 'Planning Application Submitted', 58 | 'Planning Application Withdrawn', 59 | 'Planning Permission Refused', 60 | 'Appeal Lodged', 61 | 'Appeal Withdrawn', 62 | 'Appeal Refused', 63 | 'Appeal Granted', 64 | 'Planning Permission Granted', 65 | 'Secretary of State - Intervened', 66 | 'Secretary of State - Refusal', 67 | 'Secretary of State - Granted', 68 | 'Planning Permission Expired', 69 | 'Under Construction', 70 | 'Operational'] 71 | 72 | for col in required_columns: 73 | try: 74 | output_df[col] = repd_df[col] 75 | except KeyError: 76 | output_df[col] = np.nan 77 | 78 | # Also at this point reduce to PV only (reduces data volumes) 79 | output_df = output_df[output_df['Technology Type']=='Solar Photovoltaics'] 80 | 81 | # Remove thousand-separator commas from number fields 82 | output_df['Storage Co-location REPD Ref ID'] = output_df['Storage Co-location REPD Ref ID'].map(lambda x: float(str(x).replace(',',''))) 83 | output_df['X-coordinate'] = output_df['X-coordinate'].map(lambda x: float(str(x).replace(',',''))) 84 | output_df['Y-coordinate'] = output_df['Y-coordinate'].map(lambda x: float(str(x).replace(',',''))) 85 | 86 | # Remove spaces from postcodes 87 | output_df['Post Code'] = output_df['Post Code'].map(lambda x: str(x).replace(' ','')) 88 | 89 | # Ensure the tariff is numeric 90 | output_df['FiT Tariff (p/kWh)'] = output_df['FiT Tariff (p/kWh)'].map(lambda x: float(x)) 91 | 92 | # Remove line breaks from within certain fields 93 | output_df['Site Name'] = output_df['Site Name'].str.strip() 94 | output_df['Address'] = output_df['Address'].str.replace('\r\n', ', ') 95 | output_df['Appeal Reference'] = output_df['Appeal Reference'].map(lambda x: str(x).replace('\r\n', '')) 96 | 97 | # Convert each BNG coordinate to lat and lon and add new columns 98 | for index, row in output_df.iterrows(): 99 | # Ignore any rows that don't have an X and Y coordinate 100 | if pd.notna(row['X-coordinate']) and 
pd.notna(row['Y-coordinate']): 101 | lat, lon = convert(row['X-coordinate'], row['Y-coordinate']) 102 | output_df.at[index,'latitude'] = lat 103 | output_df.at[index,'longitude'] = lon 104 | 105 | repd_csv_str = output_df.to_csv(index=False) 106 | 107 | # Make generic edits and write out 108 | sys.stdout.write(clean_repd_csv(repd_csv_str)) 109 | -------------------------------------------------------------------------------- /db/export.sql: -------------------------------------------------------------------------------- 1 | /* 2 | ** Solar PV database export to CSV etc 3 | ** April--August 2020 4 | ** Author: Dan Stowell 5 | */ 6 | 7 | -- "Suggested REPD IDs for OSM" 8 | CREATE TEMP TABLE "tmp_export_osm_repd" AS 9 | SELECT match_rule, osm.objtype, osm.osm_id, osm.latitude as "osm_latitude", osm.longitude as "osm_longitude", osm.repd_id_str as "osm_repd_id", repd.* 10 | FROM matches 11 | LEFT JOIN repd ON matches.master_repd_id=repd.repd_id 12 | LEFT JOIN osm ON matches.master_osm_id=osm.osm_id 13 | WHERE osm.repd_id_str IS NULL -- this one excludes all OSM items with no listed REPD 14 | OR (NOT osm.repd_id_str=cast(repd.repd_id as varchar(10))) -- and this one adds in the ones present but mismatching 15 | ORDER BY (match_rule IN ('4', '4a')), repd.repd_id; 16 | 17 | \copy "tmp_export_osm_repd" TO '../data/exported/osm_repd_proposed_matches.csv' WITH DELIMITER ',' CSV HEADER; 18 | 19 | -- "Grand unified [over osm & repd] CSV of PV geolocations" 20 | CREATE TEMP TABLE "tmp_export_pvgeo" AS 21 | SELECT DISTINCT osm.objtype as osm_objtype, osm.osm_id, 22 | repd.repd_id, repd.site_name as repd_site_name, 23 | repd.capacity::float as "capacity_repd_MWp", osm.capacity as "capacity_osm_MWp", 24 | COALESCE(osm.latitude, repd.latitude) as latitude, 25 | COALESCE(osm.longitude, repd.longitude) as longitude, 26 | osm.area as area_sqm, osm.located, osm.orientation, osm.tag_power as osm_power_type, osm.tag_start_date as osm_tag_start_date, 27 | osm.modules as num_modules, -- osm.source_obj as osm_source_obj, osm.source_capacity as osm_source_capacity, 28 | repd.dev_status_short as repd_status, repd.operational as repd_operational_date, repd.old_repd_id, 29 | osm.master_osm_id as osm_cluster_id, repd.master_repd_id as repd_cluster_id, 30 | source_capacity, source_obj, 31 | matches.match_rule 32 | FROM (matches 33 | FULL JOIN repd ON (matches.master_repd_id=repd.master_repd_id 34 | AND repd.dev_status_short NOT IN ('Abandoned', 'Application Refused', 'Application Withdrawn', 'Planning Permission Expired') 35 | AND match_rule NOT IN ('4', '4a')) -- skip matches that were "schemes" 36 | FULL JOIN osm ON (matches.master_osm_id=osm.master_osm_id 37 | )) 38 | ORDER BY repd.repd_id, osm.osm_id; 39 | 40 | -- Delete irrelevant REPD entries (i.e. no OSM ID and status hints nonexistence). 41 | -- (Doing this in the main query was not working correctly for some reason.) 42 | -- Note that some statuses e.g. 'Awaiting Construction', 'Under Construction' imply they are likely to exist soon. 43 | -- Here we try to strike a balance between being conservative (which means, ignore everything not flagged as operational) 44 | -- and accounting for the time-lag in the official data (which means, include items likely to come online very soon). 
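-- For example (illustrative of the rule below): a REPD-only row (no matched OSM object) with
-- status 'Awaiting Construction' is kept, since it is expected online soon, whereas a
-- REPD-only row with status 'Application Submitted', or with no status at all, is deleted.
-- Rows that do have a matched OSM object are never deleted here, whatever their REPD status.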
45 | DELETE FROM tmp_export_pvgeo WHERE (osm_id IS NULL) AND ((repd_status IS NULL) OR 46 | -- (repd_status IN ('Abandoned', 'Application Refused', 'Application Submitted', 'Application Withdrawn', 'Awaiting Construction', 'Planning Permission Expired'))); 47 | (NOT repd_status IN ('No Application Required', 'Operational', 'Under Construction', 'Awaiting Construction'))); 48 | 49 | 50 | -- Copy in, and redistribute, the REPD official capacities: if there are multiple clusters with the same REPD ID, split the capacity equally over them. Any item that's not the cluster representative should not list the repd capacity. This way, the REPD capacity column can be meaningfully summed. 51 | -- (Note: NOT across items with the same cluster ID. We do not want to spread the REPD capacities over every subelement, but when there are multiple "top-level" REPD matches we have no alternative but to spread the capacity to ensure we don't double-count the capacity.) 52 | UPDATE tmp_export_pvgeo SET "capacity_repd_MWp"=NULL WHERE NOT (osm_id=osm_cluster_id or osm_id is null); -- Note, do not null out the REPD cap where not repd-cluster, since each REPD may have its own cap. 53 | UPDATE tmp_export_pvgeo SET "capacity_repd_MWp"=portioned FROM ( 54 | select repd_cluster_id, "capacity_repd_MWp", "capacity_repd_MWp"::float/COUNT(repd_cluster_id) as portioned from tmp_export_pvgeo where repd_cluster_id>0 and repd_id=repd_cluster_id and (osm_id=osm_cluster_id or osm_id is null) group by repd_cluster_id, "capacity_repd_MWp") as portiontable WHERE tmp_export_pvgeo.repd_cluster_id=portiontable.repd_cluster_id and tmp_export_pvgeo.repd_cluster_id=tmp_export_pvgeo.repd_id and (osm_id=osm_cluster_id or osm_id is null); 55 | UPDATE tmp_export_pvgeo SET "capacity_repd_MWp"=NULL WHERE "capacity_repd_MWp"=0; -- proper NA entries. 56 | 57 | -- We do the same for duplicated OSM IDs too. 58 | -- It may seem strange that we have OSM IDs duplicated, but the reason is that they can meaningfully match two REPD entries (eg a farm, plus its extension). 59 | -- In such a case the OSM ID appears in two rows. We don't want the osm-tagged capacity double-counted, so we subdivide it. 60 | UPDATE tmp_export_pvgeo SET "capacity_osm_MWp"=portioned FROM ( 61 | select osm_id, "capacity_osm_MWp", "capacity_osm_MWp"::float/COUNT(osm_id) as portioned from tmp_export_pvgeo where osm_id>0 and "capacity_osm_MWp">0 group by osm_id, "capacity_osm_MWp" 62 | ) as portiontable WHERE tmp_export_pvgeo.osm_id=portiontable.osm_id; 63 | UPDATE tmp_export_pvgeo SET "capacity_osm_MWp"=NULL WHERE "capacity_osm_MWp"=0; -- proper NA entries. 64 | 65 | 66 | \copy "tmp_export_pvgeo" TO '../data/exported/ukpvgeo_points.csv' WITH DELIMITER ',' CSV HEADER; 67 | 68 | -- deactivated: cluster-only export. 69 | -- \copy (SELECT * FROM tmp_export_pvgeo WHERE (osm_id = osm_cluster_id OR osm_cluster_id IS NULL OR osm_id IS NULL) AND (repd_id = repd_cluster_id OR repd_cluster_id IS NULL OR repd_id IS NULL)) TO '../data/exported/ukpvgeo_points_merged_deduped_osm-repd_clusters.csv' WITH DELIMITER ',' CSV HEADER; 70 | 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Solar Power Mapping Data 2 | 3 | This software combines datasets about solar panels (photovoltaic "PV" panels), and combines them into a harmonised data source which can be used for machine vision, short-term solar forecasting and other applications. 
4 | 5 | The goal is to produce a large and rich dataset of PV, which is fully open data. 6 | 7 | This software was developed under the Alan Turing Institute's Climate Action project grant R-SPES-115: "Enabling worldwide solar PV nowcasting via machine vision and open data". 8 | 9 | 10 | ## Overview of the directory structure 11 | 12 | ``` 13 | . 14 | |-- admin -- project process and planning docs 15 | |-- data 16 | | |-- as_received -- downloaded data files 17 | | |-- raw -- manually edited files (replace dummy data) 18 | | |-- processed 19 | |-- db -- database creation 20 | |-- doc -- documentation 21 | |-- explorations -- exploratory work 22 | `-- notebooks 23 | ``` 24 | 25 | 26 | ## Data 27 | 28 | Data is held in three directories: `as_received` contains the data precisely as 29 | downloaded from its original source and in its original format; `raw` contains 30 | data that has been manually restructured or reformatted to be suitable for use by 31 | software in the project (see "Using this repo" header below). `processed` contains data that may have been processed in some way, such as by Python code, but is still thought of as “source” data. 32 | 33 | The following sources of data are used: 34 | 35 | - OpenStreetMap - [Great Britain download (Geofabrik)](https://download.geofabrik.de/europe/great-britain.html). 36 | - [OSM data types](https://wiki.openstreetmap.org/wiki/Elements) 37 | - [Solar PV tagging](https://wiki.openstreetmap.org/wiki/Tag:generator:source%3Dsolar) 38 | - [FiT](https://www.ofgem.gov.uk/environmental-programmes/fit/contacts-guidance-and-resources/public-reports-and-data-fit/installation-reports) - Report of installed PV (and other tech including wind). 100,000s entries. 39 | - [REPD](https://www.gov.uk/government/publications/renewable-energy-planning-database-monthly-extract) - Official UK data from the "renewable energy planning database". It contains large solar farms only. 40 | - Machine Vision dataset - supplied by Descartes labs (Oxford), not publicly available yet. 41 | 42 | ## Project outcome 43 | 44 | This repo includes a set of scripts that will take 45 | input datasets (REPD, OSM, FiT and machine vision – each in diff format), 46 | perform data cleaning/conversion, populate a PostgreSQL database, perform 47 | grouping of data where necessary (there are duplicate entries in REPD, multiple solar farm 48 | components in OSM) and then match entries between the data tables, based on the 49 | matching criteria we have come up with. 50 | 51 | The database creation and matching scripts should work with newer versions of the source data files, or at least do so with minimal changes to the data processing (see "Using this repo" below). 52 | 53 | The result of matching is a table in the database called `matches` that links the unique identifiers of the 54 | data tables. This also contains a column called `match_rule`, which refers to the method by which the match was determined, as documented in [doc/matching](doc/matching.md). 55 | 56 | ## Using this repo 57 | 58 | First, download (or clone) this repository. 59 | 60 | ### Install requirements 61 | 62 | 1. Install [PostgreSQL](https://www.postgresql.org/download/) with [PostGIS](http://postgis.net/install/) 63 | 2. Install Python 3 (version 3.7 or later) and `pip` 64 | 3. Run `pip install -r requirements.txt` 65 | 4. Install [Osmium](https://osmcode.org/osmium-tool/) 66 | 67 | ### Download and prepare data files 68 | 69 | 1. Download the following data files from the internet and store locally. 
We recommend saving these original data files within the directory structure under `data/as_received`: 70 | - OSM PBF file (GB extract): [Download](https://download.geofabrik.de/europe/great-britain-latest.osm.pbf) 71 | - FiT reports: Navigate to [ofgem](https://www.ofgem.gov.uk/environmental-programmes/fit/contacts-guidance-and-resources/public-reports-and-data-fit/installation-reports) and click the link for the latest Installation Report (during the Turing project, 30 September 2019 was used), then download the main document AND subsidiary documents 72 | - REPD CSV file: [Download](https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/879414/renewable-energy-planning-database-march-2020.csv) - this is always the most up to date version 73 | - Machine Vision dataset: supplied by Descartes labs (Oxford), not publicly available yet. 74 | 2. Navigate to `data/raw` and type `make` - this will convert some of the downloads into other file formats ready for further processing. 75 | - Note that the OpenStreetMap data will have been processed into a file `osm.csv`. If you do not need to do any merging/clustering, you could use this file directly, as a simplified extract of OSM solar PV data. 76 | 3. Carry out manual edits to the data files, as described in [doc/preprocessing](doc/preprocessing.md), editing the file copies in `data/raw` under the names suggested by the doc. 77 | 4. Navigate to `data/processed` and type `make` - this will create versions of the data files ready for import to PostgreSQL 78 | 79 | ### Run the database creation and data matching 80 | 81 | 5. Make sure you have PostgreSQL on your machine, then run the command: `createdb hut23-425 "Solar PV database matching"` - this creates the empty database. 82 | 6. Navigate to `db` and run the command `psql -f make-database.sql hut23-425` - this populates the database (see [doc/database](doc/database.md)), carries out some de-duplication of the datasets and performs the matching procedure (see [doc/matching](doc/matching.md)). Note: this may take several minutes. 83 | 84 | Note that the above commands require you to have admin rights on your PostgreSQL server. On standard Debian-based machines you could prepend the commands with `sudo -u postgres`, or you could assign privileges to your own user account. 85 | 86 | ### Export data from the database 87 | 88 | The data tables in PostgreSQL can be used for further analysis. To make a data "snapshot" we export back out again: 89 | 90 | 7. Navigate to `db` and run the command `psql -f export.sql hut23-425` 91 | 8. Navigate to `data/exported` and run `make`. Note: this may take several minutes. 92 | 93 | You can also run the statistical analysis and plotting -- however, this relies on some external data files such as GSP regions and LSOA regions. The file `analyse_exported.py` makes use of some local file paths (in `data/other`, not in the public source code). To do the additional plotting+stats, in `data/exported` run `make all`. 94 | 95 | As a result of this, you should have a CSV and a GeoJSON file representing the harmonised data exported from the local database. 
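For quick reference, the numbered steps above boil down to the following command sequence (a condensed sketch, run from the repository root, assuming the downloads of step 1 are in place and your user has the necessary PostgreSQL privileges):

```bash
(cd data/raw && make)                             # step 2: convert the downloads
# step 3: manual edits, as described in doc/preprocessing.md
(cd data/processed && make)                       # step 4: produce import-ready CSVs
createdb hut23-425 "Solar PV database matching"   # step 5: create the empty database
(cd db && psql -f make-database.sql hut23-425)    # step 6: populate, de-duplicate, match
(cd db && psql -f export.sql hut23-425)           # step 7: export snapshot CSVs
(cd data/exported && make)                        # step 8: build the GeoJSON ("make all" adds plots)
```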
96 | -------------------------------------------------------------------------------- /data/exported/export_geometries.py: -------------------------------------------------------------------------------- 1 | # export_geometries.py 2 | # Dan Stowell April 2020 3 | 4 | # This script takes as input: 5 | # (1) a CSV file as produced by db/export.sql 6 | # (2) a GeoJSON file produced by taking our filtered OSM extract of UK PV 7 | # and unifies them into a GeoJSON tagged with our PV data. 8 | 9 | # How to convert the OSM extract to GeoJSON, using ogr2ogr on a linux commandline: 10 | # rm data/exported/osm_layers_merged.geojson 11 | # for layer in points lines multilinestrings multipolygons other_relations; 12 | # do echo "Extracting layer $layer"; 13 | # ogr2ogr -f GeoJSON -update -append -addfields data/exported/osm_layers_merged.geojson data/exported/osm-gb-solaronly.osm.pbf $layer -nln merged; 14 | # done 15 | 16 | 17 | 18 | import csv, os 19 | import numpy as np 20 | import pandas as pd 21 | import geopandas as gpd 22 | from shapely.geometry import Point, Polygon, MultiPolygon, LineString, GeometryCollection 23 | 24 | pvexportfpath = 'ukpvgeo_points.csv' 25 | geometryfpath = '../raw/osm-gb-solaronly.geojson' 26 | 27 | 28 | # load ukpvgeo_all.csv to df 29 | inttype = pd.Int64Dtype() 30 | df = pd.read_csv(pvexportfpath, dtype={'repd_id':inttype, 'osm_id':inttype, 'repd_cluster_id':inttype, 'osm_cluster_id':inttype, 'num_modules':inttype, 'orientation':inttype}) 31 | # load osm geojson to gdf 32 | gdf = gpd.read_file(geometryfpath) 33 | 34 | # summarise the geometries loaded 35 | print("Loaded GeoJSON source, with the following geometry objects:") 36 | print(gdf.geom_type.value_counts()) 37 | 38 | # delete columns that we don't care about. for example: the other_tags from gdf 39 | # DON'T YET delete the lat lon from csv. 40 | for colname in ['other_tags', 'barrier', 'man_made', 'highway', 'landuse', 'building', 'tourism', 'amenity', 'shop', 'natural', 'leisure', 'sport', 'z_order', 'type']: 41 | if colname in gdf: 42 | del gdf[colname] 43 | 44 | # csv: check stats on osm_id, osm_way_id and their co-occurrence --- then merge the columns 45 | # cute row selection: gdf[~gdf.barrier.isna() & ~gdf.osm_id.isna()] 46 | print("GDF columns:") 47 | print(gdf.columns) 48 | assert ( gdf.osm_id.isna() & gdf.osm_way_id.isna()).sum()==0, "Violated expectation that no GeoJSON object can have BOTH osm_id and osm_way_id" 49 | assert (~gdf.osm_id.isna() & ~gdf.osm_way_id.isna()).sum()==0, "Violated expectation that every GeoJSON object must have osm_id or osm_way_id" 50 | gdf.osm_id.update(gdf.osm_way_id) # coalesce in-place - this impl assumes no overlap 51 | del gdf['osm_way_id'] 52 | # now convert the osm_id to the same data type as in the other dataset 53 | gdf["osm_id"] = pd.to_numeric(gdf["osm_id"]) 54 | gdf = gdf.astype({'osm_id': inttype}) 55 | 56 | # rename some columns to clarify their origin 57 | gdf = gdf.rename(columns={'name':'osm_name'}) 58 | 59 | #################################################################### 60 | # convert lines into polygons 61 | # https://gis.stackexchange.com/questions/321004/iterating-features-and-buffering-using-geopandas 62 | 63 | def convert_line_to_polygon(obj): 64 | "NOTE: returns false if no conversion possible/needed" 65 | coords = obj.coords 66 | closed = coords[0]==coords[-1] 67 | if not closed: 68 | print("Warning, unclosed way (has %i points), consider inspecting it." 
% len(coords)) 69 | return False 70 | else: 71 | return Polygon(coords) 72 | 73 | geomconverted = {'LineString': 0, 'GeometryCollection_LineString': 0, 'nonosm_Point': 0} 74 | 75 | for index, row in gdf[gdf.geom_type=='LineString'].iterrows(): 76 | newgeom = convert_line_to_polygon(row.geometry) 77 | if not newgeom: 78 | print(row.osm_id) 79 | else: 80 | gdf.loc[index, 'geometry'] = newgeom 81 | geomconverted['LineString'] += 1 82 | 83 | # TODO I can't correctly update the geomcoll for some reason 84 | if False: 85 | for index, row in gdf[gdf.geom_type=='GeometryCollection'].iterrows(): 86 | print("%s %i, OSM id %i:" % (gdf.geom_type[index], index, row.osm_id)) 87 | print([item.geom_type for item in row.geometry]) 88 | 89 | for gindex, item in enumerate(row.geometry): 90 | print(item) 91 | bignewgeom = [] 92 | if item.geom_type=='LineString': 93 | newgeom = convert_line_to_polygon(item) 94 | if not newgeom: 95 | print(row.osm_id, "entry %i" % gindex) 96 | bignewgeom.append(item) 97 | else: 98 | bignewgeom.append(newgeom) 99 | geomconverted['GeometryCollection_LineString'] += 1 100 | #gdf.loc[index, 'geometry'] = GeometryCollection(bignewgeom) 101 | row.geometry = GeometryCollection(bignewgeom) 102 | gdf.iloc[index] = row 103 | 104 | print("Converted geometries. Converted: %s. Results:" % str(geomconverted)) 105 | print(gdf.geom_type.value_counts()) 106 | 107 | ######################################################################### 108 | # perform a join -- a right join, to capture the REPD items with no osmid 109 | udf = gdf.merge(df, on='osm_id', how='right') 110 | 111 | print("Items with osmid and no geom (will be dropped, assumed merged into a relation's multipol): %i" % (udf.geometry.isna() & ~udf.osm_id.isna()).sum()) 112 | udf.drop(udf[udf.geometry.isna() & ~udf.osm_id.isna()].index, inplace=True) 113 | print("Items with osmid and no geom (now dropped): %i" % (udf.geometry.isna() & ~udf.osm_id.isna()).sum()) 114 | 115 | print("Items with no geom and no lat-lon (will be dropped, cannot geolocate): %i" % (udf.geometry.isna() & udf.latitude.isna()).sum()) 116 | udf.drop(udf[udf.geometry.isna() & udf.latitude.isna()].index, inplace=True) 117 | 118 | 119 | # the resulting items with no osm_id - are they the ones we expect? is it the same set as the entries with no geom? 120 | print("Merged data.") 121 | print("Items with geometry: %i" % (~udf.geometry.isna()).sum()) 122 | print("Items without geometry: %i" % udf.geometry.isna().sum()) 123 | print("Items with osmid : %i" % (~udf.osm_id.isna()).sum()) 124 | print("Items without osmid : %i" % udf.osm_id.isna().sum()) 125 | 126 | # the resulting items with no osm_id - assert no geom, add point geom from lat+lon 127 | assert (~udf[udf.osm_id.isna()].geometry.isna()).sum()==0, "Rows with no osm_id should also have no geometry" 128 | 129 | for index, row in udf[udf.osm_id.isna()].iterrows(): 130 | udf.loc[index, 'geometry'] = Point(row['longitude'], row['latitude']) 131 | geomconverted['nonosm_Point'] += 1 132 | 133 | print("Created geometries for non-osm points. Converted: %i. 
Results:" % geomconverted['nonosm_Point']) 134 | print(gdf.geom_type.value_counts()) 135 | 136 | assert udf.geometry.isna().sum()==0, "After creating lat-lon points, no-one should have null geometry: we have %i" % udf.geometry.isna().sum() 137 | 138 | # near end: delete columns latitude, longitude -- these were derived from the geometry anyway 139 | for colname in ['latitude', 'longitude']: 140 | if colname in udf: 141 | del udf[colname] 142 | 143 | ######################################################################### 144 | print("===========================================================================") 145 | print("Finished filtering and merging CSV and GeoJSON data.") 146 | print(udf.describe(exclude=gpd.array.GeometryDtype)) 147 | 148 | # write file out 149 | udf.to_file("ukpvgeo_geometries.geojson", driver='GeoJSON') 150 | 151 | -------------------------------------------------------------------------------- /doc/database.md: -------------------------------------------------------------------------------- 1 | % The hut23-425 database 2 | 3 | This note documents the creation of the `hut23-425` database and the initial 4 | deduplication of the data. 5 | 6 | We use an "Extract-Load-Transform" methodology: Tables of the source datasets 7 | are first uploaded (from the `data/processed` directory) into the schema `raw` 8 | in the database, then post-processed in the database and saved as tables in the 9 | default schema. (Some tables do not require post-processing and are uploaded 10 | directly into the default schema.) In this note, we assume that the 11 | pre-processing has already taken place. 12 | 13 | In summary, the final tables created, and row counts as of commit `0d8d38b` are: 14 | 15 | | Table | Row count | Master entity field | Master entity count | 16 | |-----------------------|-----------|---------------------|---------------------| 17 | | `osm` | 126,939| `master_osm_id` | 119,427| 18 | | `repd` | 1,986| `master_repd_id` | 1,736 | 19 | | `fit` | 863,079| | | 20 | | `machine_vision` | 2,221| | | 21 | | `osm_repd_id_mapping` | 933 | | | 22 | 23 | (In the above table, the master entity count is the number of the distinct 24 | entries in the named column) 25 | 26 | # 1. Database creation scripts 27 | 28 | These scripts assume the existence of a local Postgres installation containing a 29 | database called `hut23-425`. To create the database, run: 30 | 31 | ```bash 32 | createdb hut23-425 "Solar PV database matching" 33 | ``` 34 | 35 | ## Upload of source data 36 | 37 | SQL code to create the tables and populate the database is in the `db/` 38 | directory. To create the complete database, change to that directory and run: 39 | 40 | ```bash 41 | psql -f make-database.sql hut23-425 42 | ``` 43 | 44 | This script will in turn call `osm.sql`, `repd.sql`, `fit.sql`, `mv.sql`, 45 | `match-osm-repd.sql`, and `dedup-osm.sql` which create and populate the data 46 | tables from the respective source data in `../data/processed/` (adding a small 47 | number of additional columns) and then postprocessing. Note that the schema 48 | `raw` is used as a staging area for certain tables where it is necessary to do 49 | some postprocessing. After postprocessing the working tables will be in the 50 | default schema. 51 | 52 | - `raw.osm`: The raw OSM data. 53 | - `raw.repd`: The raw REPD data. 54 | - `repd`: The REPD data, restricted to solar PV technologies. 55 | - `osm`: The OSM data, de-duplicated. 56 | - `fit`: The FiT data. 57 | - `machine_vision`: The machine vision data. 
58 | 59 | A field, `area`, is added to the `fit` table, containing an estimate of the area 60 | of the solar panel(s) based on the declared net capacity. 61 | 62 | The tables `osm`, `repd`, and `machine_vision` include a latitude and longitude 63 | for each installation. An additional field `location` is added to these tables 64 | containing these coordinates converted to a Postgis point. 65 | 66 | The `repd` table has been restricted to those installations whose technology 67 | type is “solar Photovoltaics” and whose development status is “Operational.” 68 | 69 | ## Primary keys for the uploaded data 70 | 71 | ### FiT: `row_id` 72 | 73 | We presume each row of the FiT data denotes an individual installation. However, 74 | there is no defined primary key for this dataset. To allow us to reference the 75 | original rows later an index is added to the dataset between `raw` and 76 | `processed`. 77 | 78 | ### REPD: `repd_id` 79 | 80 | The source data contains a unique identifier, `Ref ID`, for each 81 | installation. This field has been renamed to `repd_id` and used as the primary 82 | key. 83 | 84 | ### OSM: `osm_id` 85 | 86 | The OSM data has a unique identifier, `id`, for each row. This field has been 87 | renamed `osm_id` and used as the primary key but note that it does not 88 | necessarily represent a unique installation. 89 | 90 | ### Machine Vision: `mv_id` 91 | 92 | We have added a row identifier, `mv_id`, to the pre-processed machine vision dataset. 93 | 94 | # 2. Preliminary matching between OSM and REPD 95 | 96 | The table `osm_repd_id_mapping(osm_id, repd_id)` maps OSM identifiers to REPD 97 | identifiers. 98 | 99 | The entries in the OSM dataset were tagged (in the original data) with zero 100 | or more REPD identifiers. These are present in the field `repd_id_str` as a 101 | semicolon-separated list. The script `match-osm-repd.sql` “un-nests” these 102 | identifiers as a set of rows matched to the corresponding `osm_id`. 103 | 104 | # 3. Deduplication of the OSM dataset 105 | 106 | An OSM entry `objtype` can be one of `relation`, `way`, or `node`. In the case 107 | of a `relation`, there may be several other entries classified as `way` that are 108 | actually the components of the `relation`, all of which refer to a single PV 109 | installation. There may also be several `way`s that are part of the same 110 | installation even though there is no unifying `relation`. 111 | 112 | The script `dedup-osm.sql` identifies groups of objects in the OSM data that are 113 | likely part of the same installation. An extra column, `master_osm_id` is added 114 | to the `osm` table; this column contains a unique `osm_id` for each object in 115 | a single cluster. (The particular `osm_id` used has no significance.) 116 | 117 | ## Using the `plantref` field 118 | 119 | Some of the OSM objects have already been tagged as being part of the same 120 | installation. These are indicated by an entry in the field `plantref` of the 121 | form `way/123456789` where the digits indicate another `osm_id`. If this field 122 | is non-`NULL`, the number is copied across to `osm_master_id`. 123 | 124 | ## Using proximity 125 | 126 | The remainder of the script identifies pairs of installations that are within 127 | 300 metres of each other; it then extends this relation to an equivalence 128 | relation and tags objects that are equivalent to each other with a common 129 | `master_osm_id`. (In fact, the tag is the greatest `osm_id` from the group but 130 | this choice is simply for convenience.) 
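The core of this step looks roughly like the following (a simplified sketch of `db/dedup-osm.sql`; the real script uses an area-adaptive distance threshold rather than a flat 300 m, and first filters out nodes, rooftop objects, and objects that already have a `master_osm_id`):

```sql
WITH RECURSIVE parts AS (
  -- pairs of objects within the distance threshold of each other
  SELECT a.osm_id AS osm_id1, b.osm_id AS osm_id2
  FROM osm a, osm b
  WHERE a.osm_id <> b.osm_id
    AND a.location::geography <-> b.location::geography < 300
),
clusters AS (
  -- transitive closure: everything reachable through a chain of nearby pairs
  SELECT osm_id1, osm_id2 FROM parts
  UNION
  SELECT c.osm_id1, p.osm_id2
  FROM clusters c JOIN parts p ON c.osm_id2 = p.osm_id1
)
SELECT osm_id1 AS osm_id, max(osm_id2) AS master_osm_id
FROM clusters
GROUP BY osm_id1;
```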
131 | 132 | ### Technical note 133 | 134 | The relation that contains parts of objects within 300m of each other is clearly 135 | a symmetric relation but it is not necessarily transitive. (It may be the case 136 | that A and B are within 300m of each other and B and C are within 300m of each 137 | other but A is more than 300m from C.) To extend the proximity relation to an 138 | equivalence relation we form the transitive closure of the proximity relation. 139 | 140 | Taking the transitive closure is acheived in SQL through the use of a "recursive 141 | common table expression" (recursive CTE). In the script, it is the query that 142 | begins “`WITH RECURSIVE ...`”. The primary use of recursive CTEs is, in fact, to 143 | compute transitive closures. 144 | 145 | 146 | # 4. Deduplication of the REPD dataset 147 | 148 | The REPD dataset also contains objects that are close enough in both proximity 149 | and name that we believe they are likely to be part of the same installation. 150 | 151 | Deduplication proceeds in a similar manner to the OSM database. We use a 152 | slightly larger distance threshold (1380 m) but include a measure of similarity 153 | between the installation names using Postgres' `similarity` function. In 154 | addition, prior to computing the similarity of names, we “normalise” the names 155 | to remove certain common words (such as “farm”). 156 | 157 | As with the OSM data, a new field, `master_repd_id` is added to the `repd` table 158 | that is non-`NULL` and unique for sites that are believed to be the same site. 159 | -------------------------------------------------------------------------------- /data/raw/compile_osm_solar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Script to parse an OSM XML extract for solar PV data. 4 | # Dan Stowell, 2019-2020. 5 | 6 | import os, sys, csv, subprocess, re 7 | from functools import reduce 8 | from xml import sax 9 | import numpy as np 10 | 11 | from matplotlib.path import Path 12 | from scipy.spatial import KDTree as kdtree 13 | 14 | import pandas as pd 15 | from matplotlib.backends.backend_pdf import PdfPages 16 | import matplotlib.pyplot as plt 17 | 18 | ############################################ 19 | # User configuration: 20 | 21 | # do_osmium = True # if False, it won't try to filter the input OSM file 22 | # osmsourcefpath = os.path.expanduser('~/osm/great-britain/great-britain-200729.osm.pbf') 23 | 24 | do_osmium = False 25 | osmsourcefpath = "osm-gb-solaronly.xml" 26 | 27 | # The extract must ALREADY have been processed by osmium to filter down to just the generator:method=photovoltaic items. 28 | # Here's what I do: 29 | # osmium tags-filter ~/osm/great-britain/great-britain-200729.osm.pbf generator:method=photovoltaic plant:method=photovoltaic plant:source=solar -o ~/osm/solarsearch/gb-solarextracts/gb-200729-solar-withreferenced.xml 30 | # In the public scripts, this is all included in the work done by the makefile 31 | 32 | 33 | ############################################ 34 | # Helper functions: 35 | 36 | earthradius = 6364380.0 # earth radius (m) at Manchester 37 | degrees_to_metres = 2 * np.pi * earthradius / 360 38 | def angular_area_to_sqm(anga, latitude_deg): 39 | """Use simple angular calculation to approximately convert angular areas (calculated from GPS coords) to areas in square metres. 40 | WARNING: APPROXIMATE. 
Uses spherical assumptions, and does not know about tilt of objects.""" 41 | latitude_correction = np.cos(np.radians(latitude_deg)) # 53.4427 42 | return degrees_to_metres * degrees_to_metres * latitude_correction * anga 43 | 44 | def PolyArea(x,y): 45 | "Calculate area of a polygon from its coordinates (shoelace formula)" 46 | return 0.5*np.abs(np.dot(x,np.roll(y,1))-np.dot(y,np.roll(x,1))) # https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates 47 | 48 | def guess_kilowattage(anobj): 49 | "This should NOT NORMALLY BE USED since it is really only a rule of thumb." 50 | if not anobj['calc_area']: 51 | return 1. 52 | else: 53 | return anobj['calc_area'] * 0.15 54 | 55 | regex_numbers_semicolon_start = re.compile('^([0-9;]*).*') 56 | 57 | compasspoints = { 58 | 'N': 0, 59 | 'NNE': 22.5, 60 | 'NE': 45, 61 | 'ENE': 67.5, 62 | 'E': 90, 63 | 'ESE': 112.5, 64 | 'SE': 135, 65 | 'SSE': 157.5, 66 | 'S': 180, 67 | 'SSW': 202.5, 68 | 'SW': 225, 69 | 'WSW': 247.5, 70 | 'W': 270, 71 | 'WNW': 292.2, 72 | 'NW': 315, 73 | 'NNW': 337.5, 74 | # unconventional but seen in data: 75 | 'NORTH': 0, 76 | 'NORTHEAST': 45, 77 | 'NORTH_EAST': 45, 78 | 'EAS': 90, # TEMPORARY fix for wonky entry 79 | 'EAST': 90, 80 | 'SOUTHEAST': 135, 81 | 'SOUTH_EAST': 135, 82 | 'SOUTH': 180, 83 | 'SOUTHWEST': 225, 84 | 'SOUTH_WEST': 225, 85 | 'WEST': 270, 86 | 'NORTHWEST': 315, 87 | 'NORTH_WEST': 315, 88 | } 89 | 90 | ############################################################################## 91 | # The main routine, which progressively reacts to XML content as it is loaded: 92 | 93 | class SolarXMLHandler(sax.handler.ContentHandler): 94 | """Parses solar PV data from OSM XML. After this has finished, the 'objs' member is a list of processed PV objects (panels as well as plants). 95 | Note that after the initial parse of the XML, you then need to call postprocess() which will propagate information down from relation-containment and geographic-containment.""" 96 | def __init__ (self): 97 | sax.handler.ContentHandler.__init__(self) 98 | self.curitem = None 99 | self.objs = [] 100 | # these value-stores are for intermediate processing of relationships (e.g. the parent way/rel to know their own accumulated contents) - they are not used as the output data. 
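        # (Illustrative shape of these caches once parsing is done: an entry such as
        #  self.waydata['123'] holds {'lat': ..., 'lon': ..., 'outlinepath': Path(...),
        #  'calc_area': ...}, filled in by endElement() below; keys are the OSM id strings.)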
101 | self.nodedata = {} 102 | self.waydata = {} 103 | self.reldata = {} 104 | 105 | def startElement (self, name, attrs): 106 | if name in ['node', 'way', 'relation']: # start a new "object" with empty tags 107 | self.curitem = {'id': attrs['id'], 'timestamp': attrs['timestamp'], 'user': attrs.get('user', ''), 'tags':{}, 'objtype': name} 108 | if name == 'node': 109 | self.curitem['lat'] = float(attrs['lat']) 110 | self.curitem['lon'] = float(attrs['lon']) 111 | elif name == 'way': 112 | self.curitem['nodes'] = [] 113 | elif name == 'relation': 114 | self.curitem['nodes'] = [] 115 | self.curitem['ways'] = [] 116 | self.curitem['relations'] = [] 117 | elif name == 'tag': 118 | self.curitem['tags'][attrs['k']] = attrs['v'] 119 | elif name == 'nd': # This is a node reference from within a way 120 | self.curitem['nodes'].append(attrs['ref']) 121 | elif name == 'member': # This is a node/way/rel reference from within a relation 122 | if attrs['type'] == 'way': 123 | self.curitem['ways'].append({'ref':attrs['ref'], 'role':attrs['role']}) 124 | elif attrs['type'] == 'relation': 125 | self.curitem['relations'].append({'ref':attrs['ref'], 'role':attrs['role']}) 126 | elif attrs['type'] == 'node': 127 | self.curitem['nodes'].append({'ref':attrs['ref'], 'role':attrs['role']}) 128 | else: 129 | raise ValueError("This script does not know how to handle the following member-type found in relation %s: '%s'" % (self.curitem['id'], attrs['type'])) 130 | 131 | def endElement(self, name): 132 | if name in ['node', 'way', 'relation']: # finish off, and store, the "object" 133 | curitem = self.curitem # for speed 134 | 135 | if ('highway' in curitem['tags']) and (curitem['tags']['highway']=='turning_circle'): 136 | raise ValueError("Data contains highway=turning_circle items. Are you sure this OSM data has been preprocessed to strip it down only to the photovoltaics?") 137 | 138 | # Cache the object data for use in lookups etc (NOT for output, these data structures - actually "curitem" is where the data for output live.) 
139 | datacache = { 140 | 'node': self.nodedata, 141 | 'way': self.waydata, 142 | 'relation': self.reldata, 143 | }[name] 144 | if curitem['id'] in datacache: 145 | raise ValueError("datacache seems to encounter a duplicate item: type %s, id %i" % (name, curitem['id'])) 146 | datacache[curitem['id']] = {} 147 | datacacheitem = datacache[curitem['id']] 148 | 149 | if curitem['objtype']=='node': 150 | datacacheitem['lat'] = curitem['lat'] 151 | datacacheitem['lon'] = curitem['lon'] 152 | curitem['calc_area'] = 0 # TODO there may be a tag telling you the area; else, as a node, it's useful to make clear we have no area estimate 153 | 154 | elif curitem['objtype']=='way': 155 | latlist = [self.nodedata[nodeid]['lat'] for nodeid in curitem['nodes']] 156 | lonlist = [self.nodedata[nodeid]['lon'] for nodeid in curitem['nodes']] 157 | datacacheitem['outlinepath'] = Path([_ for _ in zip(latlist, lonlist)]) 158 | # calculate its centroid as mean(nodes) 159 | curitem['lat'] = np.mean(latlist) 160 | curitem['lon'] = np.mean(lonlist) 161 | datacacheitem['lat'] = curitem['lat'] 162 | datacacheitem['lon'] = curitem['lon'] 163 | # calculate its area using PolyArea 164 | curitem['calc_area'] = PolyArea(latlist, lonlist) 165 | # convert to square metres 166 | curitem['calc_area'] = round(angular_area_to_sqm(curitem['calc_area'], curitem['lat']), 1) 167 | datacacheitem['calc_area'] = curitem['calc_area'] 168 | 169 | elif curitem['objtype']=='relation': 170 | datacacheitem['relations'] = curitem['relations'] 171 | datacacheitem['ways'] = curitem['ways'] 172 | datacacheitem['nodes'] = curitem['nodes'] 173 | datacacheitem['calc_area'] = 0 174 | # NB most of the relationship-handling comes at the end in postprocess() 175 | 176 | #################################### 177 | # Processing tags to log a PV object 178 | if (curitem['tags'].get('power')=='generator' and curitem['tags'].get('generator:method')=='photovoltaic') \ 179 | or (curitem['tags'].get('power')=='plant' and (curitem['tags'].get('plant:method')=='photovoltaic' or curitem['tags'].get('plant:source')=='solar')): 180 | 181 | for k,v in curitem['tags'].items(): 182 | # In here we aim to process ALL KNOWN tags, including irrelevant ones, so that no information is lost 183 | ok = True 184 | if k=='power': 185 | curitem['tag_power'] = v 186 | elif k in ['generator:output:electricity', 'plant:output:electricity']: 187 | v = v.replace(",", ".").upper() 188 | if v in ['YES', 'SMALL_INSTALLATION']: 189 | pass 190 | elif v.endswith(" W"): 191 | curitem['calc_capacity'] = float(v[:-2]) * 0.001 192 | elif v.endswith(" KW"): 193 | curitem['calc_capacity'] = float(v[:-3]) 194 | elif v.endswith(" MW"): 195 | curitem['calc_capacity'] = float(v[:-3]) * 1000 196 | elif v.endswith(" MWP"): # NB probably not possible to make use of difference between MW and MWp 197 | curitem['calc_capacity'] = float(v[:-3]) * 1000 198 | elif v.endswith("KW"): 199 | curitem['calc_capacity'] = float(v[:-2]) 200 | elif v.endswith("MW"): 201 | curitem['calc_capacity'] = float(v[:-2]) * 1000 202 | else: 203 | ok = False 204 | elif k in ['location', 'generator:place', 'generator:location']: 205 | if v=='rooftop': 206 | v = 'roof' 207 | curitem['location'] = v 208 | elif k in ['source', 'source:geometry']: 209 | curitem['source_obj'] = v.replace(",", ";") 210 | elif k in ['source:generator:output:electricity', 'source:plant:output:electricity', "source:power:output", "source:output", "source:power"]: 211 | if v=='REPD Open Data': 212 | v = 'repd' 213 | curitem['source_capacity'] = v.replace(",", 
";") 214 | elif k=='note': 215 | if v=='roof household': 216 | curitem['location'] = 'roof' 217 | else: 218 | pass # TODO check all unhandled "note", see if they're OK 219 | elif k=='notional_area': 220 | if curitem['calc_area'] != 0: 221 | #TODO consider: print(" WARNING: skipping notional_area for %s %s because calc_area already filled in" % (curitem['objtype'], curitem['id'])) 222 | pass 223 | if v.endswith(' sq m'): 224 | try: 225 | curitem['calc_area'] = float(v[:-5].replace(',', '.', )) 226 | except: 227 | print("Couldn't handle this notional_area: " + v) 228 | else: 229 | ok = False 230 | elif k in ['direction', 'generator:orientation', 'orientation']: 231 | v = v.replace('`', '').upper() # some people write it this way 232 | if v in compasspoints: 233 | curitem['orientation'] = compasspoints[v] 234 | elif v in ['ESW']: 235 | pass # uninterpretable case seen in data... so skip 236 | elif v in ['FLAT']: 237 | pass # flat-mounted items have no orientation 238 | else: 239 | try: 240 | curitem['orientation'] = int(v) # NB there could of course be parse failures here 241 | except ValueError: 242 | print("Un-parseable orientation value in %s %s: %s=%s" % (curitem['objtype'], curitem['id'], k, v)) 243 | elif k=='pv_module_array': 244 | splitvals = v.split(" by ") 245 | if len(splitvals)==2: 246 | curitem['generator:solar:modules'] = int(splitvals[0]) * int(splitvals[1]) 247 | else: 248 | ok = False 249 | elif k in ['modules', 'generator:solar:modules', 'generator:modules']: 250 | if v=='unknown': 251 | v='' 252 | else: 253 | # It's quite common (for some UI reason) to get typos in this field with characters pasted after the initial digits. We trim them off. 254 | v = regex_numbers_semicolon_start.sub('\\1', v) 255 | if ';' in v: 256 | v = str(reduce(lambda a, b: a+b, map(int, v.split(';')))) # entries could be e.g. 
"7;5;2" and here we reduce them to a single integer sum 257 | curitem['generator:solar:modules'] = v 258 | #elif k=='': 259 | #elif k=='': 260 | #elif k=='': 261 | #elif k=='': 262 | #elif k=='': 263 | #elif k=='': 264 | #elif k=='': 265 | elif k in [ 266 | # Here are ALL the ones we copy into the output data unedited 267 | 'start_date', 268 | 'repd:id', 269 | ]: 270 | curitem['tag_%s' % k] = v 271 | elif k in [ 272 | # Here are ALL the ones we don't need information from: 273 | 'generator', 274 | 'generator:source', 275 | 'generator:method', 276 | 'generator:type', 277 | 'generator:output', 278 | 'generator:note', 279 | 'generator:strings', 280 | 'generator:output:biogas', 281 | 'generator:output:hot_water', 282 | 'generator:plant', 283 | 'plant:source', 284 | 'plant:method', 285 | 'plant:type', 286 | 'power_source', 287 | 'note:generator:output:electricity', 288 | 'voltage', 289 | 'fixme', 290 | 'earliest_start_date', 291 | 'latest_start_date', 292 | 'amenity', 293 | 'capacity', 294 | 'floating', 295 | 'area', 296 | 'ref', 297 | 'website', 298 | 'alt_name', 299 | 'url', 300 | 'landcover', 301 | 'email', 302 | 'fax', 303 | 'postal_code', 304 | 'phone', 305 | 'site', 306 | 'surface', 307 | 'notes', 308 | 'industrial', 309 | 'listed_status', 310 | 'survey_date', 311 | 'man_made', 312 | 'barrier', 313 | 'fence_type', 314 | 'height', 315 | 'shop', 316 | 'wheelchair', 317 | 'sport', 318 | 'brand', 319 | 'leisure', 320 | 'frequency', 321 | 'manufacturer', 322 | 'architect', 323 | 'HE_ref', 324 | 'note_2', 325 | 'survey:date', 326 | 'mapillary', 327 | 'natural', 328 | 'construction', 329 | 'farmland', 330 | 'opening_hours', 331 | ] or k.split(':')[0] in [ 332 | 'source', # NB there are lots of different source tags. Here we're suppressing any leftovers, even though we may use specific source tags already. 333 | 'operator', 334 | 'owner', 335 | 'description', 336 | 'contact', 337 | 'name', 338 | 'fence', 339 | 'social_facility', 340 | 'flickr', 341 | 'highway', 342 | 'landuse', 343 | 'fhrs', 344 | 'layer', 345 | 'ref', 346 | 'level', 347 | 'wikimedia_commons', 348 | 'wikidata', 349 | 'wikipedia', 350 | 'geograph', 351 | 'type', # rel multipolygon 352 | 'tourism', 353 | 'building', 354 | 'addr', 355 | 'roof', 356 | 'demolished', 357 | 'indoor', 358 | ]: 359 | pass 360 | else: 361 | ok = False 362 | 363 | if not ok: 364 | astr = "Un-recognised tag in %s %s: %s=%s" % (curitem['objtype'], curitem['id'], k, v) 365 | print(astr) 366 | #raise ValueError(astr) 367 | 368 | # TODO also try to calc_type: rooftop or infarm (location=roof; large size) or unknown 369 | 370 | if curitem['tag_power']=='plant': # We store the power-plant's ID info in a form which will be easy to propagate down to child members 371 | curitem['plantref'] = (curitem['objtype'], curitem['id']) 372 | 373 | # OK now store it - we only need to store top-level PV items, child-nodes etc are not needed except for the data stored elsewhere 374 | self.objs.append(self.curitem) 375 | 376 | 377 | self.curitem = None # NB we need to clear "curitem" in ALL cases where it was a node/way/item, NOT just if it's a PV item processed. 378 | 379 | def postprocess(self): 380 | """ 381 | This MUST be called, once, after the XML has been loaded. 382 | It: 383 | - ensures relation-members know about their parent object (incl. the plantref if parent is plant); 384 | - calculates the total area of each relation; 385 | - calculates the centroid of each relation; 386 | - performs spatial containment queries to label solar panels as members of a plant (i.e. 
plantref) if they're geographically inside them. 387 | """ 388 | # Find all objects that are relations and also plants. Then push down the metadata through their children (only for the temporary data), and also calculate the area for the parents. 389 | rels_postprocessed = 0 390 | self.plantoutlines = [] # {lat, lon, outlinepath, plantref} a list of ways that are either plants themselves, or non-inner members of plant relations; i.e. potential geo containers for panels 391 | for curitem in self.objs: 392 | if curitem['tag_power']=='plant' and curitem['objtype']=='way': 393 | self.plantoutlines.append({'lat': curitem['lat'], 'lon': curitem['lon'], 'outlinepath': self.waydata[curitem['id']]['outlinepath'], 'plantref': curitem['plantref']}) 394 | if curitem['tag_power'] in ['plant', 'generator'] and curitem['objtype']=='relation': 395 | if curitem['tag_power']=='plant': 396 | plantitem = curitem 397 | plantref = ('relation', curitem['id']) 398 | else: 399 | plantitem = None 400 | plantref = None 401 | curitem['calc_area'] = 0 402 | self._recurse_relation_info(curitem, plantitem, plantref) 403 | rels_postprocessed += 1 404 | 405 | print("Postprocessed %i power=* relations" % rels_postprocessed) 406 | print("Plant outlines for geo containment search: %i" % len(self.plantoutlines)) 407 | plantoutlines_kdtree = kdtree([[item['lat'], item['lon']] for item in self.plantoutlines]) 408 | # Now, for every generator object that DOESN'T have a plantref, we find its nearest-neighbour potential-containers and check for containment 409 | for curitem in self.objs: 410 | if curitem['tag_power']=='generator' and not curitem.get('plantref', None): 411 | for distance, arrayposition in zip(*plantoutlines_kdtree.query([curitem['lat'], curitem['lon']], 3, distance_upper_bound=1)): 412 | if distance != np.inf: 413 | if self.plantoutlines[arrayposition]['outlinepath'].contains_point([curitem['lat'], curitem['lon']]): 414 | curitem['plantref'] = self.plantoutlines[arrayposition]['plantref'] 415 | #print(" spatially inferred generator %s/%s belongs to plant %s" % (curitem['objtype'], curitem['id'], curitem['plantref'])) 416 | 417 | if False: # This should NOT NORMALLY be activated. It inserts "guesstimate" power capacities for small-scale solar PV 418 | for curitem in self.objs: 419 | if curitem['tag_power']=='generator' and curitem.get('calc_capacity', 0)==0 and not curitem.get('plantref', None): 420 | curitem['calc_capacity'] = guess_kilowattage(curitem) 421 | 422 | 423 | def _recurse_relation_info(self, curitem, plantitem, plantref): 424 | """Pushes down through relations' members, for two reasons: to compile their areas onto the parent, and to propagate the parent plant reference down to all. 
425 | You will call it with curitem==plantitem for plants, and plantitem=None for gens; then the recursion keeps plantitem fixed and alters the immediate curitem.""" 426 | # first we recurse into the child relations - the ways and rels will then add their area to our plantitem 427 | #print("") 428 | #print("_recurse_relation_info(curitem=%s, plantitem=%s, plantref=%s)" % (curitem, plantitem, plantref)) 429 | for childinfo in curitem['relations']: 430 | therel = self.reldata[childinfo['ref']] 431 | if 'plantref' in therel: 432 | raise ValueError("Suspicious recursion: while analysing a plant relation (%i) we found a child rel (%i) which already has plantref set: %s" % (plantitem['id'], childid, str(therel['plantref']))) 433 | else: 434 | self._recurse_relation_info(therel, plantitem, plantref) 435 | # now we grab all area info from one-level-down, and also push the plantref down one level 436 | latslist = [] 437 | lonslist = [] 438 | for childtype, childlist, childdatacache in [ 439 | ('node', curitem['nodes'], self.nodedata), 440 | ('way', curitem['ways'], self.waydata), 441 | ('relation', curitem['relations'], self.reldata), 442 | ]: 443 | for childinfo in childlist: # each is a dict with 'ref' and 'role' 444 | childobj = childdatacache[childinfo['ref']] 445 | latslist.append(childobj['lat']) 446 | lonslist.append(childobj['lon']) 447 | multiplier = [1, -1][childinfo['role']=='inner'] # how to subtract inner-areas 448 | curitem['calc_area'] += multiplier * childobj.get('calc_area', 0) 449 | if plantref: 450 | childobj['plantref'] = plantref 451 | #if childtype=='way': 452 | # print(" from way %s we add area %g" % (childinfo['ref'], multiplier * childobj['calc_area'])) 453 | if childtype=='way' and childinfo['role']!='inner' and plantref: 454 | self.plantoutlines.append({'lat': childobj['lat'], 'lon': childobj['lon'], 'outlinepath': self.waydata[childinfo['ref']]['outlinepath'], 'plantref': plantref}) 455 | curitem['lat'] = np.mean(latslist) 456 | curitem['lon'] = np.mean(lonslist) 457 | 458 | 459 | ############## 460 | # let's go! 
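# Main flow: open the OSM input (either a pre-filtered XML extract, or filtered on the fly
# via an osmium subprocess), SAX-parse it with SolarXMLHandler, call postprocess() to
# propagate plant membership, then print summary statistics and write the objects to osm.csv.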
461 | 462 | if do_osmium: 463 | infp = subprocess.Popen(["osmium", "tags-filter", osmsourcefpath, "generator:method=photovoltaic", "plant:method=photovoltaic", "plant:source=solar", "-o", "-", "-f", "xml"], 464 | stdout=subprocess.PIPE).stdout 465 | else: 466 | infp = open(osmsourcefpath, 'rb') 467 | 468 | parser = sax.make_parser() 469 | handler = SolarXMLHandler() 470 | parser.setContentHandler(handler) 471 | parser.parse(infp) 472 | infp.close() 473 | handler.postprocess() 474 | 475 | # find all attribs in use 476 | allattribs = set() 477 | for obj in handler.objs: 478 | allattribs = allattribs.union(set(obj)) 479 | allattribs = allattribs.difference(set(['nodes', 'ways', 'relations', 'tags'])) 480 | 481 | # some overcomplex coding to sort attributes in the way I want 482 | attribstarters = ['objtype', 'id', 'user', 'timestamp', 'lat', 'lon'] 483 | def attribsorter(a): 484 | if a in attribstarters: 485 | return "a_%i" % attribstarters.index(a) 486 | else: 487 | return "b_%s" % a 488 | allattribs = sorted(list(allattribs), key=attribsorter) 489 | 490 | if False: 491 | print() 492 | print("All object attributes in play:") 493 | for anattrib in allattribs: 494 | print(anattrib) 495 | print() 496 | 497 | # output happy stats 498 | osmtotalobjs = len(handler.objs) 499 | print("") 500 | print("####################################################################") 501 | print(os.path.basename(osmsourcefpath)) 502 | print("parsed %i OSM objects (%i nodes, %i ways, %i relations)" % (osmtotalobjs, 503 | len([_ for _ in handler.objs if _['objtype']=='node']), 504 | len([_ for _ in handler.objs if _['objtype']=='way']), 505 | len([_ for _ in handler.objs if _['objtype']=='relation']) 506 | )) 507 | print("") 508 | 509 | 510 | # collect the unique REPD identifiers 511 | repds_used = [] 512 | for item in handler.objs: 513 | if item.get('tag_repd:id', False): 514 | repds_used.extend(item['tag_repd:id'].split(';')) 515 | 516 | readable = "standalone" 517 | subset = [_ for _ in handler.objs if _['tag_power']=='generator' and not _.get('plantref', None)] 518 | print("Solar PV panel items (power=generator) (%s):" % readable) 519 | print(" %i in total" % len([_ for _ in subset])) 520 | print(" %g sq km total surface area" % (1e-6 * np.sum([_['calc_area'] for _ in subset]))) 521 | print(" %g MW total generating capacity (NB metadata will be v incomplete for this)" % (1e-3 * np.sum([_.get('calc_capacity',0) for _ in subset]))) 522 | print(" %i nodes with no sqm tagged (could presume 'domestic', but needs more tagging)" % len([_ for _ in subset if _['calc_area']==0])) 523 | print(" %i areas <= 30 sqm (could presume 'domestic')" % len([_ for _ in subset if 0<_['calc_area']<=30])) 524 | print(" %i areas 30--2000 sqm (could presume 'commercial' or part of array)" % len([_ for _ in subset if 30<_['calc_area']<=2000])) 525 | print(" %i areas > 2000 sqm (inspect to see if should really be tagged 'solar farm')" % len([_ for _ in subset if _['calc_area']>2000])) 526 | 527 | readable = "within a farm" 528 | subset = [_ for _ in handler.objs if _['tag_power']=='generator' and _.get('plantref', None)] 529 | print("Solar PV panel items (power=generator) (%s):" % readable) 530 | print(" %i in total" % len([_ for _ in subset])) 531 | print(" %g sq km total surface area" % (1e-6 * np.sum([_['calc_area'] for _ in subset]))) 532 | print(" %i nodes with no sqm tagged" % len([_ for _ in subset if _['calc_area']==0])) 533 | print(" %i areas <= 30 sqm" % len([_ for _ in subset if 0<_['calc_area']<=30])) 534 | print(" %i areas 
30--2000 sqm" % len([_ for _ in subset if 30<_['calc_area']<=2000])) 535 | print(" %i areas > 2000 sqm (inspect to see if should really be tagged 'solar farm')" % len([_ for _ in subset if _['calc_area']>2000])) 536 | 537 | print("Solar PV farm items (power=plant):") 538 | print(" %i in total" % len([_ for _ in handler.objs if _['tag_power']=='plant'])) 539 | print(" %i have REPD identifier tagged" % len([_ for _ in handler.objs if _['tag_power']=='plant' and _.get('tag_repd:id', False)])) 540 | print(" (%i REPD identifiers encountered)" % len(repds_used)) 541 | print(" %g sq km total surface area" % (1e-6 * np.sum([_['calc_area'] for _ in handler.objs if _['tag_power']=='plant']))) 542 | print(" %g MW total generating capacity" % (1e-3 * np.sum([_.get('calc_capacity',0) for _ in handler.objs if _['tag_power']=='plant']))) 543 | print(" %i nodes with no sqm tagged (needs more tagging)" % len([_ for _ in handler.objs if _['tag_power']=='plant' and _['calc_area']==0])) 544 | print(" %i areas <= 30 sqm - not including nodes" % len([_ for _ in handler.objs if _['tag_power']=='plant' and 0<_['calc_area']<=30])) 545 | print(" %i areas 30--2000 sqm" % len([_ for _ in handler.objs if _['tag_power']=='plant' and 30<_['calc_area']<=2000])) 546 | print(" %i areas > 2000 sqm" % len([_ for _ in handler.objs if _['tag_power']=='plant' and _['calc_area']>2000])) 547 | 548 | 549 | def csvformatspecialfields(k, v): 550 | "special formatting sometimes needed" 551 | if k=='plantref': 552 | if not v: return '' 553 | return "%s/%s" % v 554 | return v 555 | 556 | try: 557 | with open("osm.csv", 'w', buffering=1) as outfp: 558 | outfp.write(",".join(allattribs) + "\n") 559 | for obj in handler.objs: 560 | outfp.write(",".join(map(str, [csvformatspecialfields(anattrib, obj.get(anattrib, '')) for anattrib in allattribs])) + "\n") 561 | except: 562 | os.rename("osm.csv", "osm_ERROR.csv") 563 | raise 564 | 565 | print("==========================================================") 566 | print("Finished creating initial OSM PV solar extract spreadsheet") 567 | print("==========================================================") 568 | 569 | -------------------------------------------------------------------------------- /data/exported/analyse_exported.py: -------------------------------------------------------------------------------- 1 | 2 | # script to take the main exported pv data, and produce some descriptive statistics and plots 3 | # by Dan Stowell 2020 4 | 5 | # note that as well as our data, you will need geojson/shapefiles defining the regions that we plot/aggregate over: 6 | # LSOAs (specified by the UK statistical authority), GSP regions (specified by the UK National Grid ESO) 7 | 8 | import csv, os 9 | import numpy as np 10 | import pandas as pd 11 | import geopandas as gpd 12 | from shapely.geometry import Point 13 | 14 | import rtree # not used directly, but if you don't have it, geopandas will fail to perform sjoin 15 | import pdfpages # not used directly, but I needed it for mpl to work 16 | import descartes # not used directly, but gpd uses it for choropleths 17 | 18 | import seaborn as sns 19 | import matplotlib.pyplot as plt 20 | from matplotlib.backends.backend_pdf import PdfPages 21 | import matplotlib.pyplot as plt 22 | sns.set(style="whitegrid") 23 | 24 | from sklearn import linear_model 25 | 26 | ############################################################################## 27 | # config 28 | 29 | # input paths: 30 | pvexportfpath = os.path.expanduser("ukpvgeo_points.csv") 31 | geometryfpath = 
'../raw/osm-gb-solaronly.geojson' 32 | gspregionsfpath = os.path.expanduser("../other/gsp_regions_20181031.geojson") # GSP regions from NG ESO 33 | lsoaregionsfpath = os.path.expanduser("../other/Lower_Layer_Super_Output_Areas_December_2011_Full_Clipped__Boundaries_in_England_and_Wales.shp") 34 | 35 | # out paths: 36 | gsp_est_outfpath = os.path.expanduser("ukpvgeo_subtotals_gsp_capacity.csv") 37 | lsoa_est_outfpath = os.path.expanduser("ukpvgeo_subtotals_lsoa_capacity.csv") 38 | 39 | # if you have access to the Sheffield Solar data for validation, activate this and set the paths appropriately: 40 | got_sheff = True 41 | auxdocs_gdrive_path = os.path.expanduser("~/Documents/turing/turing-climate-call/Turing_OCF_OSM_gdrive/") 42 | sheff_cap_by_gsp_path = "%s/PV_capacity_by_GSP_and_LLSOA/capacity_by_llsoa_and_gsp_20200617T165804/20200617T165804_capacity_by_GSP_region.csv" % auxdocs_gdrive_path 43 | sheff_cap_by_lsoa_path = "%s/PV_capacity_by_GSP_and_LLSOA/capacity_by_llsoa_and_gsp_20200617T165804/20200617T165804_capacity_by_llsoa.csv" % auxdocs_gdrive_path 44 | 45 | # other: 46 | inttype = pd.Int64Dtype() 47 | 48 | 49 | ############################################################################## 50 | # Load data 51 | 52 | gspdf = gpd.read_file(gspregionsfpath) 53 | gspdf = gspdf.to_crs("EPSG:3857").sort_values("RegionID") 54 | 55 | lsoas = gpd.read_file(lsoaregionsfpath).to_crs("EPSG:3857") 56 | lsoas = lsoas.drop(['objectid', 'st_areasha', 'st_lengths', 'lsoa11nmw'], axis=1) 57 | 58 | df = pd.read_csv(pvexportfpath, dtype={'repd_id':inttype, 'osm_id':inttype, 'repd_cluster_id':inttype, 'osm_cluster_id':inttype, 'num_modules':inttype, 'orientation':inttype}) 59 | # convert the main CSV to a GeoDataFrame of points, so we can perform geo queries 60 | df = gpd.GeoDataFrame(df, crs="epsg:4326", geometry=[Point(xy) for xy in zip(df.longitude, df.latitude)]) 61 | df = df.to_crs("EPSG:3857") 62 | df = df.drop(['longitude', 'latitude'], axis=1) 63 | 64 | gdf = gpd.read_file(geometryfpath) # we're going to use this merely to check for containment. If an OSM item is contained entirely within another polygon, we shouldn't double-count its area. 65 | 66 | ############################################################################## 67 | # Simple subtotals 68 | 69 | print("ukpvgeo_points.csv file contains %i data rows." % len(df)) 70 | print("Simple subtotals, number of installations/clusters:") 71 | 72 | asubset = df[df['osm_id']>0] 73 | numinst = len(asubset[['osm_objtype', 'osm_id']].drop_duplicates()) 74 | numclus = len(asubset[['osm_cluster_id']].drop_duplicates()) 75 | print(f"From OSM: {numinst} / {numclus}") 76 | 77 | asubset = df[df['repd_id']>0] 78 | numinst = len(asubset[['repd_id']].drop_duplicates()) 79 | numclus = len(asubset[['repd_cluster_id']].drop_duplicates()) 80 | print(f"From REPD: {numinst} / {numclus}") 81 | 82 | asubset = df 83 | numinst = len(asubset[['osm_objtype', 'osm_id', 'repd_id']].drop_duplicates()) 84 | numclus = len(asubset[['osm_cluster_id', 'repd_id']].drop_duplicates()) 85 | print(f"Harmonised: {numinst} / {numclus}") 86 | print() 87 | 88 | ############################################################################## 89 | # Preprocessing 90 | 91 | df['centroid'] = df.centroid 92 | 93 | # Make a coalesced "capacity" column -- LATER add other sources e.g. 
estimates 94 | df['capacity_merged_MWp'] = df['capacity_osm_MWp'].combine_first(df['capacity_repd_MWp']) 95 | 96 | 97 | # categorise units into small/medium/large 98 | def categorise_entry(row): 99 | if row['capacity_merged_MWp'] > 0.1: 100 | return "large" 101 | elif row['capacity_merged_MWp'] > 0.01: 102 | return "medium" 103 | elif row['capacity_merged_MWp'] > 0: 104 | return "small" 105 | elif row['area_sqm'] > 2000: 106 | return "large" 107 | elif row['area_sqm'] > 30: 108 | return "medium" 109 | else: 110 | return "small" 111 | 112 | df['category'] = df.apply(categorise_entry, axis=1).astype(pd.CategoricalDtype(categories=["small", "medium", "large"], ordered=True)) 113 | 114 | 115 | # find fully-contained OSM items, and flag them in a special column so that we don't use them in capacity estimates 116 | gdf = gdf[~gdf.osm_id.isna()] 117 | gdf_within = gpd.sjoin(gdf, gdf, how='inner', op="within") 118 | containified = gdf_within[(gdf_within['osm_id_left']!=gdf_within['osm_id_right'])]['osm_id_left'] # the left-hand item is the contained one. 119 | containified = pd.to_numeric(containified.values) 120 | df['area_is_contained'] = df['osm_id'].isin(containified) 121 | print("Found %i OSM items that are entirely-contained within others --- and hence we won't use them for inferring capacity from area" % df['area_is_contained'].sum()) 122 | del gdf_within, containified 123 | 124 | def really_count_nonzero(ser): 125 | return ser.fillna(0, inplace=False).astype(bool).sum() 126 | 127 | ############################################################################## 128 | pdf = PdfPages("plot_analyse_exported.pdf") 129 | 130 | ############################################################################## 131 | # Simple stats about metadata presence/absence: 132 | def format_num_entries(colname, df): 133 | "Convenience function for counting positive entries in data column" 134 | return "%i (==%.1f %% of rows)" % (df[colname].count(), 135 | 100 * df[colname].count()/len(df)) 136 | 137 | print("") 138 | print("METADATA STATS:") 139 | for colname in ['capacity_merged_MWp', 'orientation', 'located', 'num_modules']: 140 | print("%s, num entries: %s" % (colname, format_num_entries(colname, df))) 141 | print("area_sqm, num entries: %i (==%.1f %% of rows)" % ((df['area_sqm'] > 0).sum(), 142 | 100 * (df['area_sqm']>0).sum()/len(df))) 143 | 144 | # capacity - num present, median, simple histogram of these 145 | print("Capacity, sum: %g MWp" % df['capacity_merged_MWp'].sum()) 146 | print("Capacity, median: %g MWp" % df['capacity_merged_MWp'].median()) 147 | fig, ax = plt.subplots(figsize=(10, 6)) 148 | notches = np.geomspace(1, 1e6, 49) * 0.001 149 | sns.distplot(df['capacity_merged_MWp'], norm_hist=False, kde=False, bins=(np.hstack(([0], notches)))) 150 | ax.set_xscale("log") 151 | ax.set_yscale("log") 152 | ax.set_xticks([1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]) 153 | ax.set_xticklabels(['0.1 kWp', '1 kWp', '10 kWp', '100 kWp', '1 MWp', '10 MWp', '100 MWp']) 154 | ax.set_ylabel("Number of installations") 155 | plt.title("Distribution of PV installation capacities (%i tagged)" % (1-df['capacity_merged_MWp'].isna()).sum()) 156 | pdf.savefig(fig) 157 | plt.close() 158 | 159 | # orientation (7800) - also plot a circular histogram of these 160 | fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(projection='polar')) 161 | anglebins = np.arange(0.03125 * np.pi, 2 * np.pi + 1e-3, 0.0625 * np.pi) 162 | sns.distplot(df['orientation'].astype('float') * np.pi / 180, bins=anglebins, norm_hist=False, kde=False) 163 
| ax.set_theta_zero_location('N') 164 | plt.title("Distribution of PV installation orientations (%i tagged)" % (1-df['orientation'].isna()).sum()) 165 | pdf.savefig(fig) 166 | plt.close() 167 | 168 | # located (136,000) - give frequency of roof etc 169 | print("Values of 'located':") 170 | print(df.located.value_counts()) 171 | 172 | # num_modules (6000) 173 | fig, ax = plt.subplots(figsize=(10, 6)) 174 | notches = np.array([0, 1, 3, 10, 30, 100, 300, 1000, 3000, 10000, 30000, 100000]) 175 | sns.distplot(df['num_modules'].astype('float'), norm_hist=False, kde=False, bins=notches) 176 | ax.set_xscale("log") 177 | ax.set_yscale("log") 178 | ax.set_ylabel("Number of installations") 179 | plt.title("Distribution of PV installation num_modules (%i tagged)" % (1-df['num_modules'].isna()).sum()) 180 | pdf.savefig(fig) 181 | plt.close() 182 | 183 | # areas (11000) 184 | fig, ax = plt.subplots(figsize=(10, 6)) 185 | notches = np.array([0, 1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4, 1e5, 3e5, 1e6]) 186 | sns.distplot(df[df.area_sqm>0]['area_sqm'], norm_hist=False, kde=False, bins=notches) 187 | ax.set_xscale("log") 188 | ax.set_yscale("log") 189 | ax.set_ylabel("Number of installations") 190 | ax.set_xlabel("Surface area of polygon (m^2)") 191 | plt.title("Distribution of PV installation polygon areas (%i tagged)" % (df.area_sqm>0).sum()) 192 | pdf.savefig(fig) 193 | plt.close() 194 | 195 | 196 | ############################################################################## 197 | # Capacity regress against area. Also capacity regress against num_modules, and even the joint version. 198 | print("") 199 | print("CAPACITY LINEAR REGRESSIONS:") 200 | 201 | #for whichcat in df['category'].values.categories: 202 | if True: 203 | 204 | fig, ax = plt.subplots(figsize=(10, 6)) 205 | 206 | whichcat = 'all' 207 | subset = df[(df.area_sqm>0) & (df.capacity_merged_MWp>0)] 208 | 209 | if False: # enable this to grab a specific subset 210 | whichcat = 'small' 211 | subset = subset[subset.category==whichcat] 212 | 213 | print(" Num items of type '%s' with area+capacity to regress: %i" % (whichcat, len(subset))) 214 | 215 | xvals = subset['area_sqm'] 216 | yvals = subset['capacity_merged_MWp'] 217 | regr = linear_model.LinearRegression(fit_intercept=False) # force line to pass through zero 218 | data_toregress = np.array(xvals).reshape(-1, 1) 219 | regr.fit(data_toregress, yvals) 220 | linregpredict = regr.predict(data_toregress) 221 | rsq = regr.score(data_toregress, yvals) 222 | 223 | print(" Slope: %.2f W / sq m R^2: %.3f" % (regr.coef_[0] * 1000000, rsq)) 224 | 225 | plt.plot(sorted(xvals * 1e-6), sorted(linregpredict), 'b-', alpha=0.4) 226 | plt.scatter(xvals * 1e-6, yvals, marker='+', alpha=0.4) 227 | plt.annotate("Slope: %.2f W / sq m\nR^2: %.3f" % (regr.coef_[0] * 1000000, rsq), xy=(0.8, 0.1), xycoords='axes fraction', color=(0.5, 0.5, 0.5)) 228 | 229 | #plt.xlim(1, 1000) 230 | plt.ylabel('Capacity (MWp)') 231 | plt.xlabel('Calculated area of PV object (sq km)') 232 | plt.title("Area vs capacity in OSM&REPD (UK)") 233 | 234 | pdf.savefig(fig) 235 | plt.close() 236 | 237 | area_regressor = regr.coef_[0] 238 | 239 | #for whichcat in df['category'].values.categories: 240 | if True: 241 | 242 | fig, ax = plt.subplots(figsize=(10, 6)) 243 | 244 | whichcat = 'all' 245 | subset = df[(df.num_modules>0) & (df.capacity_merged_MWp>0)] 246 | 247 | if True: # enable this to grab a specific subset 248 | whichcat = 'small' 249 | subset = subset[subset.category==whichcat] 250 | 251 | print(" Num items of type '%s' with 
nummod+capacity to regress: %i" % (whichcat, len(subset))) 252 | 253 | xvals = subset['num_modules'] 254 | yvals = subset['capacity_merged_MWp'] 255 | regr = linear_model.LinearRegression(fit_intercept=False) # force line to pass through zero 256 | data_toregress = np.array(xvals).reshape(-1, 1) 257 | regr.fit(data_toregress, yvals) 258 | linregpredict = regr.predict(data_toregress) 259 | rsq = regr.score(data_toregress, yvals) 260 | 261 | print(" Slope: %.2f W / unit R^2: %.3f" % (regr.coef_[0] * 1000000, rsq)) 262 | 263 | if False: 264 | plt.plot(sorted(xvals), sorted(linregpredict), 'b-', alpha=0.4) 265 | plt.scatter(xvals, yvals, marker='+', alpha=0.4) 266 | plt.annotate("Slope: %.2f W / unit\nR^2: %.3f" % (regr.coef_[0] * 1000000, rsq), xy=(0.8, 0.1), xycoords='axes fraction', color=(0.5, 0.5, 0.5)) 267 | 268 | #plt.xlim(1, 1000) 269 | plt.ylabel('Capacity (MWp)') 270 | plt.xlabel('num_modules of PV object') 271 | plt.title("num_modules vs capacity in OSM&REPD (UK) (type: %s)" % whichcat) 272 | 273 | pdf.savefig(fig) 274 | plt.close() 275 | 276 | 277 | 278 | ############################################################################## 279 | # Our estimate of UK's MW capacity - for each of the 3 types, and the total 280 | 281 | # calc this progressively with estimates too: pure-OSM, pure-REPD, +OSM, +regress_area, +guesstimate_points_as_3. 282 | 283 | print("") 284 | print("TOTAL MERGED CAPACITY ESTIMATES:") 285 | # merged2: +regress_area 286 | df['capacity_merged2_MWp'] = df['capacity_merged_MWp'].combine_first(df['area_sqm'] * area_regressor * ~df['area_is_contained']) 287 | # merged3: missing values as 3 kW 288 | df['capacity_merged3_MWp'] = df['capacity_merged2_MWp'] 289 | df.loc[(df.capacity_merged2_MWp==0) & (~df['area_is_contained']), 'capacity_merged3_MWp']=0.003 290 | 291 | # explicitly tag the source of each capacity 292 | def calc_sourceof_capacity(row): 293 | if (row['capacity_osm_MWp']>0) and (row['capacity_osm_MWp'] != row['capacity_repd_MWp']): 294 | return 'osm' 295 | elif row['capacity_repd_MWp']>0: 296 | return 'repd' 297 | elif (row['capacity_merged2_MWp']>0 and not row['capacity_merged_MWp']>0): 298 | return "area_regress" 299 | elif (row['capacity_merged3_MWp']>0 and not row['capacity_merged2_MWp']>0): 300 | return "point" 301 | else: 302 | return "HUH" 303 | 304 | df['sourceof_capacity'] = df.apply(calc_sourceof_capacity, axis=1).astype( 305 | pd.CategoricalDtype(categories=["repd", "osm"#, "area_regress"#, "point" #, "HUH" 306 | ], ordered=True)) 307 | 308 | cols = ['capacity_osm_MWp', #'capacity_repd_MWp', 309 | 'capacity_merged_MWp', 'capacity_merged2_MWp', 'capacity_merged3_MWp'] 310 | cols_lbls = ['OSM', 311 | 'OSM&REPD', '...+areas_infer', '...+points_est'] 312 | cols_lbls_long = ['capacity_osm_MWp', 313 | 'capacity_osmrepd_MWp', 'capacity_osmrepdareas_MWp', 'capacity_osmrepdareaspoints_MWp'] 314 | 315 | cols_notplotted = ['capacity_repd_MWp'] 316 | cols_all = cols + cols_notplotted 317 | cols_lbls_all = cols_lbls + ['REPD'] 318 | cols_lbls_long_all = cols_lbls_long + ['capacity_repd_MWp'] 319 | 320 | piv_mw_cat = pd.pivot_table(df, values=cols_all, index='category', aggfunc='sum') 321 | print(piv_mw_cat.T) 322 | print("Totals:") 323 | print(piv_mw_cat.sum()) # totals 324 | 325 | # output as a stacked plot, with y-axis as MWp, x-axis as these steps, the 3 categories. 
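# (Clarifying the steps plotted here, as defined above: 'OSM' = capacity tagged in OSM only;
#  'OSM&REPD' = OSM capacity with REPD values used as a fallback; '...+areas_infer' = remaining gaps
#  filled via the area-vs-capacity regression slope fitted earlier, ignoring polygons flagged as
#  contained within others; '...+points_est' = items still at zero capacity, and not flagged as
#  contained, assumed to be 3 kWp (0.003 MWp) each.)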
326 | fig, ax = plt.subplots(figsize=(10, 6)) 327 | ax.stackplot(range(len(cols)), np.array([piv_mw_cat[col].values for col in cols]).T, 328 | labels=piv_mw_cat.index.categories.values, linewidth=0) 329 | ax.set_xticks(range(len(cols))) 330 | ax.set_xticklabels(cols_lbls) 331 | #ax.set_yscale('log') 332 | ax.set_ylabel("Total MWp") 333 | plt.legend(loc="lower left") 334 | 335 | plt.title("Total capacity, at various steps of merging/inference") 336 | pdf.savefig(fig) 337 | plt.close() 338 | 339 | 340 | # now the subtotals of num items, not of MW: 341 | piv_count_cat = pd.pivot_table(df, values=cols_all, index='category', aggfunc=really_count_nonzero).astype(int) 342 | print("Number of items with nonzero capacity:") 343 | print(piv_count_cat.T) 344 | print("Totals:") 345 | print(piv_count_cat.sum()) # totals 346 | 347 | fig, ax = plt.subplots(figsize=(10, 6)) 348 | ax.stackplot(range(len(cols)), np.array([piv_count_cat[col].values for col in cols]).T, 349 | labels=piv_count_cat.index.categories.values, linewidth=0) 350 | ax.set_xticks(range(len(cols))) 351 | ax.set_xticklabels(cols_lbls) 352 | #ax.set_yscale('log') 353 | ax.set_ylabel("Total number") 354 | plt.legend(loc="lower left") 355 | 356 | plt.title("Items with tagged capacity, at various steps of merging/inference") 357 | pdf.savefig(fig) 358 | plt.close() 359 | 360 | 361 | 362 | if True: 363 | # Let's do a log-log plot of the long-tail distribution: 364 | # rank position on the x-axis, value on the y-axis, sourceof as the category 365 | loglogvariable = 'capacity_merged2_MWp' 366 | loglogcols = ["osm", "repd"] #"area_regress"#, "point" 367 | loglogpalette = ['b', 'r'] #'y'#, 'k' 368 | dflt = df[[loglogvariable, 'sourceof_capacity']].sort_values(inplace=False, axis=0, by=loglogvariable, ascending=False) 369 | dflt = dflt[dflt['sourceof_capacity'].isin(loglogcols)] 370 | dflt['rank'] = dflt[loglogvariable].rank(ascending=False) 371 | # Now, to reduce plot kb bulk, we aggregate the data by counting 372 | gcount = dflt.groupby([loglogvariable, 'sourceof_capacity', 'rank'], 373 | observed=True).agg(count=('rank', 'count')) 374 | gcount.reset_index(inplace=True) 375 | g = sns.relplot(x="rank", y=loglogvariable, hue="sourceof_capacity", data=gcount, 376 | height=6, aspect=10/6, #size='count', 377 | alpha=0.5, marker='o', linewidths=0, edgecolor='none', 378 | hue_order=loglogcols, palette=loglogpalette) 379 | ax = g.facet_axis(0,0) 380 | ax.set_xscale('log') 381 | ax.set_yscale('log') 382 | ax.set_yticks([1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]) 383 | ax.set_yticklabels(['0.1 kWp', '1 kWp', '10 kWp', '100 kWp', '1 MWp', '10 MWp', '100 MWp']) 384 | ax.set_ylabel("Installation capacity") 385 | ax.set_xlabel("Rank position of capacity") 386 | plt.legend(loc="lower left") 387 | plt.title("log-log plot (for checking power-law behaviour)") 388 | 389 | pdf.savefig(g.fig) 390 | plt.close() 391 | 392 | ############################################################################## 393 | ############################################################################## 394 | ############################################################################## 395 | # Subtotals per-region (GSP, LSOA) --- choropleths and summary CSVs 396 | 397 | # dfc is just the centroids, in the same order as the main data 398 | dfc = df[['centroid']].set_geometry('centroid', inplace=False, crs=df.crs) 399 | 400 | df[['RegionID', 'RegionName']] = gpd.sjoin(dfc, gspdf, how='left', op='intersects' 401 | ).drop(['index_right'], axis=1)[['RegionID', 'RegionName']] 402 | df.RegionID = 
df.RegionID.astype(inttype) 403 | 404 | def statstr(df, col): 405 | themedian = df[col].median() 406 | themax = df[col].max() 407 | thesum = df[col].sum() 408 | theempty = (df[col]==0).sum() 409 | return "median %.1f\nmax %.1f\nsum %.0f\nempty %i" % (themedian, themax, thesum, theempty) 410 | 411 | def plot_choropleth(dataframe_per_region, col, vmax, plottitle, cmap='hot', legend=True, vmin=0, show_statstr=True): 412 | fig, ax = plt.subplots(figsize=(8, 10)) 413 | plot_choropleth_onax(ax, dataframe_per_region, col, vmax, plottitle, cmap, legend, vmin, show_statstr) 414 | pdf.savefig(fig) 415 | plt.close() 416 | 417 | def plot_choropleth_onax(ax, dataframe_per_region, col, vmax, plottitle, cmap='hot', legend=True, vmin=0, show_statstr=True): 418 | dataframe_per_region.plot(column=col, ax=ax, linewidth=0.01, edgecolor=(0.7, 0.7, 0.7), vmin=vmin, vmax=vmax, cmap=cmap, legend=legend) 419 | if plottitle: 420 | ax.set_title(plottitle) 421 | if show_statstr: 422 | plt.annotate(statstr(dataframe_per_region, col), xy=(0.7, 0.9), xycoords='axes fraction', color=(0.5, 0.5, 0.5), fontsize=8) 423 | ax.set_xlim(left=-0.75e6) 424 | ax.set_ylim(top=8.0e6) 425 | ax.set_xticks([]) 426 | ax.set_yticks([]) 427 | 428 | if False: 429 | # just the regions 430 | fig, ax = plt.subplots(figsize=(8, 10)) 431 | gspdf.plot(ax=ax, linewidth=0.01) 432 | plt.title("GSP regions") 433 | plt.xlim(xmin=-0.75e6) 434 | plt.ylim(ymax=8.0e6) 435 | plt.xticks([]) 436 | plt.yticks([]) 437 | pdf.savefig(fig) 438 | plt.close() 439 | 440 | # These "pivot tables" (and more below) are our basic subtotals per-region summaries 441 | piv_count_gsp = pd.pivot_table(df, values='geometry', index='RegionID', aggfunc=really_count_nonzero).rename(columns={'geometry':'num'}) 442 | piv_mw_gsp = pd.pivot_table(df, values=cols_all, index='RegionID', aggfunc='sum') 443 | 444 | # NOTE about "pergsp": this has one row per GSP, and as we go through we will merge new columns on to it when we want to plot them. 445 | # This just means: avoid clashing column names. 
446 | # same for "perlsoa" 447 | 448 | # num items per region 449 | fig, ax = plt.subplots(figsize=(8, 10)) 450 | pergsp = gspdf.merge(piv_count_gsp, how='left', on='RegionID').sort_values("RegionID") 451 | pergsp['num'].fillna(0, inplace=True) 452 | plot_choropleth(pergsp, "num", None, "Number of items (clustered) in each GSP region", cmap='copper') 453 | 454 | 455 | pergsp = pergsp.merge(piv_mw_gsp, how='left', on='RegionID').sort_values("RegionID") 456 | for col in cols_all: 457 | pergsp[col].fillna(0, inplace=True) 458 | 459 | vmax = max([pergsp[col].max() for col in cols]) 460 | for col, col_lbl in zip(cols, cols_lbls): 461 | plot_choropleth(pergsp, col, vmax, "Capacity in each GSP region (MWp): %s" % col_lbl) 462 | 463 | # csv 464 | with open(gsp_est_outfpath, 'w') as outfp: 465 | #header 466 | outfp.write("RegionID,RegionName," + ','.join(map(str, cols_lbls_long_all)) + "\n") 467 | for index, row in pergsp.iterrows(): 468 | outfp.write(("%i,%s," % (row['RegionID'], row['RegionName'])) + ','.join([("%.3f" % row[col]) for col in cols_all]) + "\n") 469 | 470 | ############################################################################## 471 | # subtotals (heatmap) again, but for LSOA 472 | 473 | # Add LSOA to our main data 474 | df[['lsoa11cd', 'lsoa11nm']] = gpd.sjoin(dfc, lsoas, how='left', op='intersects' 475 | ).drop(['index_right'], axis=1)[['lsoa11cd', 'lsoa11nm']] 476 | 477 | if False: 478 | # just the regions 479 | fig, ax = plt.subplots(figsize=(8, 10)) 480 | lsoas.plot(ax=ax, linewidth=0.01) 481 | plt.title("LSOA regions") 482 | plt.xlim(xmin=-0.75e6) 483 | plt.ylim(ymax=8.0e6) 484 | plt.xticks([]) 485 | plt.yticks([]) 486 | pdf.savefig(fig) 487 | plt.close() 488 | 489 | perlsoa = lsoas # will be merged 490 | 491 | if False: 492 | # num items per region 493 | piv_count_lsoa = pd.pivot_table(df, values='geometry', index='lsoa11cd', aggfunc=really_count_nonzero).rename(columns={'geometry':'num'}) 494 | perlsoa = perlsoa.merge(piv_count_lsoa, how='left', on='lsoa11cd').sort_values("lsoa11cd") 495 | perlsoa['num'].fillna(0, inplace=True) 496 | plot_choropleth(perlsoa, "num", None, "Number of items (clustered) in each LSOA region", cmap='copper') 497 | 498 | piv_mw_lsoa = pd.pivot_table(df, values=cols_all, index='lsoa11cd', aggfunc='sum') 499 | 500 | perlsoa = perlsoa.merge(piv_mw_lsoa, how='left', on='lsoa11cd').sort_values("lsoa11cd") 501 | for col in cols: 502 | perlsoa[col].fillna(0, inplace=True) 503 | 504 | for col, col_lbl in zip(cols, cols_lbls): 505 | print("Capacity in each LSOA region (MWp): %s" % col_lbl) 506 | print(statstr(perlsoa, col)) 507 | 508 | if False: 509 | vmax = max([piv_mw_lsoa[col].max() for col in cols]) 510 | for col, col_lbl in zip(cols, cols_lbls): 511 | plot_choropleth(perlsoa, col, vmax, "Capacity in each LSOA region (MWp): %s" % col_lbl) 512 | 513 | # csv 514 | with open(lsoa_est_outfpath, 'w') as outfp: 515 | #header 516 | outfp.write("lsoa11cd,lsoa11nm," + ','.join(map(str, cols_lbls_long_all)) + "\n") 517 | for index, row in perlsoa.iterrows(): 518 | outfp.write(("%s,%s," % (row['lsoa11cd'], row['lsoa11nm'])) + ','.join([("%.3f" % row[col]) for col in cols_all]) + "\n") 519 | 520 | ############################################################################## 521 | # Next: plot the estimates from Sheffield/SolarMedia data, and correlate them against ours 522 | 523 | if got_sheff: 524 | 525 | sheff_cap_by_gsp = pd.read_csv(sheff_cap_by_gsp_path) # NB! 
Use RegionID 526 | 527 | pergsp = pergsp.merge(sheff_cap_by_gsp, how='left', on='RegionID').sort_values("RegionID") 528 | 529 | col = 'dc_capacity' 530 | pergsp[col].fillna(0, inplace=True) 531 | 532 | vmax = sheff_cap_by_gsp[col].max() 533 | 534 | fig, ax = plt.subplots(figsize=(8, 10)) 535 | 536 | pergsp.plot(column=col, ax=ax, linewidth=0.01, edgecolor=(0.7, 0.7, 0.7), vmax=vmax, cmap='hot', legend=True) 537 | plt.title("Capacity in each GSP region (MWp): from Sheffield/SolarMedia") 538 | plt.annotate(statstr(pergsp, col), xy=(0.7, 0.9), xycoords='axes fraction', color=(0.5, 0.5, 0.5), fontsize=8) 539 | plt.xlim(xmin=-0.75e6) 540 | plt.ylim(ymax=8.0e6) 541 | plt.xticks([]) 542 | plt.yticks([]) 543 | 544 | pdf.savefig(fig) 545 | plt.close() 546 | 547 | # create table with ours and theirs; measure correlation; scatterplot. 548 | gspcorreltab = sheff_cap_by_gsp.merge(piv_mw_gsp, how='outer', on='RegionID').fillna(0, inplace=False).sort_values("RegionID") 549 | gsprsq = gspcorreltab[['dc_capacity', 'capacity_merged3_MWp']].corr().iloc[0,1] ** 2 550 | print("Correlation between our and Sheffield's MW subtotals per-GSP: Rsq = %.3f" % gsprsq) 551 | 552 | fig, ax = plt.subplots(figsize=(8, 8)) 553 | ax = sns.regplot(x="dc_capacity", y="capacity_merged3_MWp", data=gspcorreltab, ax=ax, 554 | scatter_kws={"s": 50, "alpha": 0.5} 555 | ) 556 | ax.set_xlabel("DC capacity from Sheffield/SolarMedia (MW)") 557 | ax.set_ylabel("Capacity from OSM+REPD+inferred (MWp)") 558 | ax.set_title("Comparison of capacity subtotals per GSP region") 559 | ax.set_aspect(1) 560 | ax.set_xlim(-10, 500) 561 | ax.set_ylim(-10, 500) 562 | 563 | plt.annotate("R^2: %.3f" % gsprsq, xy=(0.05, 0.95), xycoords='axes fraction', color=(0.5, 0.5, 0.5), fontsize=12) 564 | 565 | pdf.savefig(fig) 566 | plt.close() 567 | 568 | ### same for LSOA (NB no choro, too dense) 569 | 570 | sheff_cap_by_lsoa = pd.read_csv(sheff_cap_by_lsoa_path) # NB! 
Use lsoa11cd 571 | sheff_cap_by_lsoa = sheff_cap_by_lsoa.rename(columns={'LLSOACD': 'lsoa11cd'}) 572 | 573 | lsoacorreltab = sheff_cap_by_lsoa.merge(piv_mw_lsoa, how='outer', on='lsoa11cd').fillna(0, inplace=False).sort_values("lsoa11cd") 574 | lsoarsq = lsoacorreltab[['dc_capacity', 'capacity_merged3_MWp']].corr().iloc[0,1] ** 2 575 | print("Correlation between our and Sheffield's MW subtotals per-LSOA: Rsq = %.3f" % lsoarsq) 576 | 577 | fig, ax = plt.subplots(figsize=(8, 8)) 578 | ax = sns.regplot(x="dc_capacity", y="capacity_merged3_MWp", data=lsoacorreltab, ax=ax, 579 | scatter_kws={"s": 50, "alpha": 0.5} 580 | ) 581 | ax.set_xlabel("DC capacity from Sheffield/SolarMedia (MW)") 582 | ax.set_ylabel("Capacity from OSM+REPD+inferred (MWp)") 583 | ax.set_title("Comparison of capacity subtotals per LSOA region") 584 | ax.set_aspect(1) 585 | ax.set_xlim(-2, 80) 586 | ax.set_ylim(-2, 80) 587 | 588 | plt.annotate("R^2: %.3f" % lsoarsq, xy=(0.05, 0.95), xycoords='axes fraction', color=(0.5, 0.5, 0.5), fontsize=12) 589 | 590 | pdf.savefig(fig) 591 | plt.close() 592 | 593 | ################################################################################# 594 | # choropleth but showing only the deltas between pure-REPD, our merged outcome, and SolarMedia outcome: 595 | if True: 596 | 597 | diffchoros = [ 598 | ('capacity_delta_ours_repd_MWp', 'DELTA ours from REPD', 'capacity_merged3_MWp', 'capacity_repd_MWp'), 599 | ('capacity_delta_SM_repd_MWp', 'DELTA SolarMedia from REPD', 'dc_capacity', 'capacity_repd_MWp'), 600 | ] 601 | 602 | for col, col_lbl, cola, colb in diffchoros: 603 | pergsp[col] = pergsp[cola] - pergsp[colb] 604 | 605 | vmax = max([pergsp[col].max() for col, _, _, _ in diffchoros]) 606 | 607 | for col, col_lbl, cola, colb in diffchoros: 608 | plot_choropleth(pergsp, col, vmax, "Capacity in each GSP region (MWp): %s" % col_lbl, vmin=-vmax, cmap="RdBu", show_statstr=False) 609 | 610 | ############################################################################## 611 | # Next: capacity choropleths, for a selection of high-contributing users 612 | 613 | # NOTE: according to OSM's GDPR policy we must not publish user ids. 614 | # That's why we use a list of users which is not stored in github, 615 | # and load their associations from the input file rather than our output. 616 | # Then we anonymise them to simple transitory integer values. 
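# Illustration only (the usernames below are invented placeholders, not real mappers):
# users_to_plot.csv is expected to contain one OSM username per row in the first CSV column,
# with lines starting with '#' treated as comments, e.g.
#   # keep this file local -- do not commit it to git
#   example_mapper_one
#   example_mapper_two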
617 | 618 | if True: 619 | with open('users_to_plot.csv', 'rt') as fp: 620 | rdr = csv.reader(row for row in fp if not row.startswith('#')) 621 | users_to_plot = [line[0].strip() for line in rdr if len(line)] 622 | 623 | rawosmfpath = '../raw/osm.csv' 624 | 625 | df_userids = pd.read_csv(rawosmfpath, usecols=['objtype', 'id', 'user'], dtype={'id':inttype}).rename(columns={'objtype':'osm_objtype', 'id':'osm_id'}) 626 | users_to_plot_lookup = {u:k for k,u in enumerate(users_to_plot)} 627 | df_userids['user'] = df_userids['user'].map(users_to_plot_lookup).astype(inttype) 628 | df = df.merge(df_userids, how='left', on=['osm_objtype', 'osm_id']) 629 | num_persons = len(users_to_plot) 630 | 631 | # TMP [(personindex, sum(df[df['user']==personindex]['capacity_merged3_MWp'])) for personindex in range(len(users_to_plot))] 632 | 633 | # num items per person 634 | fig, axes = plt.subplots(1, num_persons, figsize=(8 * num_persons, 10)) 635 | for personindex, ax in enumerate(axes): 636 | piv_count_gsp_forthisperson = pd.pivot_table(df[df['user']==personindex], values='geometry', index='RegionID', aggfunc=really_count_nonzero).rename(columns={'geometry':'num'}) 637 | pergsp_forthisperson = gspdf.merge(piv_count_gsp_forthisperson, how='left', on='RegionID').sort_values("RegionID") 638 | pergsp_forthisperson['num'].fillna(0, inplace=True) 639 | plot_choropleth_onax(ax, pergsp_forthisperson, "num", None, cmap='copper', plottitle=None, show_statstr=False) 640 | pdf.savefig(fig) 641 | plt.close() 642 | 643 | # capacity per person 644 | col = 'capacity_merged3_MWp' 645 | fig, axes = plt.subplots(1, num_persons, figsize=(8 * num_persons, 10)) 646 | for personindex, ax in enumerate(axes): 647 | #print(f" Plotting for user #{personindex}") 648 | piv_mw_gsp_forthisperson = pd.pivot_table(df[df['user']==personindex], values=col, index='RegionID', aggfunc='sum') 649 | pergsp_forthisperson = gspdf.merge(piv_mw_gsp_forthisperson, how='left', on='RegionID').sort_values("RegionID") 650 | pergsp_forthisperson[col].fillna(0, inplace=True) 651 | vmax = pergsp_forthisperson[col].max() 652 | #print(" peak value: %f" % vmax) 653 | plottitle = "Capacity in each GSP region (MWp) edited by user #%i" % (personindex+1) 654 | plot_choropleth_onax(ax, pergsp_forthisperson, col, np.ceil(vmax/10)*10., plottitle=None, show_statstr=False) 655 | 656 | pdf.savefig(fig) 657 | plt.close() 658 | 659 | 660 | 661 | 662 | ############################################################################## 663 | pdf.close() 664 | 665 | --------------------------------------------------------------------------------