├── .gitignore
├── README.md
├── data
│   └── graphs
│       ├── cloud_cover_hist.html
│       ├── facilities_map.html
│       ├── operation_status_hist.html
│       └── operation_status_per_power_plant_ts.html
├── notebooks
│   ├── 01_data_matching.ipynb
│   ├── 02_satellite_imagery.ipynb
│   ├── 03_split_data_for_ml.ipynb
│   ├── 04_dataset_loading.ipynb
│   ├── 05_images_download.ipynb
│   ├── 06_model_training.ipynb
│   └── google_cooling_tower_on_off_data.ipynb
├── requirements.txt
├── scripts
│   └── download_images.py
├── setup.cfg
├── setup.py
└── src
    └── coal_emissions_monitoring
        ├── __init__.py
        ├── constants.py
        ├── data_cleaning.py
        ├── data_viz.py
        ├── dataset.py
        ├── ml_utils.py
        ├── model.py
        ├── satellite_imagery.py
        └── transforms.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 | .vscode/
131 | notebooks/lightning_logs/*
132 | data/campd/
133 | data/google/
134 | data/models/
135 | .DS_Store
136 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ccai-ss23-ai-monitoring-tutorial
2 | Experiments for the Climate Change AI summer school 2023 tutorial on "AI for Monitoring, Reporting, and Verification"
3 |
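4 | ## Setup
5 | 
6 | A minimal setup sketch (assuming Python 3.10, the version used by the notebooks' kernel):
7 | 
8 | ```bash
9 | # install pinned dependencies, then the coal_emissions_monitoring package itself
10 | pip install -r requirements.txt
11 | pip install -e .
12 | ```
13 | 
14 | The notebooks in `notebooks/` walk through the pipeline in order, from data matching (`01_data_matching.ipynb`) to model training (`06_model_training.ipynb`).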
--------------------------------------------------------------------------------
/data/graphs/facilities_map.html:
--------------------------------------------------------------------------------
(truncated HTML export: an interactive folium/Leaflet map of facilities; only a "data_set" legend with "train" and "val" entries is recoverable from the original file)
--------------------------------------------------------------------------------
/notebooks/01_data_matching.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Data matching\n",
9 | "---\n",
10 | "\n",
11 | "Experimenting with matching data from:\n",
12 | "- Global Energy Monitor (GEM)'s [Global Coal Plant Tracker](https://www.globalenergymonitor.org/coal.html)\n",
13 | "- USA's [CAMPD emissions data](https://campd.epa.gov/data)\n",
14 | "- OSM's [cooling_tower](https://wiki.openstreetmap.org/wiki/Tag:man_made%3Dcooling_tower) tag"
15 | ]
16 | },
17 | {
18 | "attachments": {},
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Setup"
23 | ]
24 | },
25 | {
26 | "attachments": {},
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "### Imports"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import pandas as pd\n",
40 | "import geopandas as gpd\n",
41 | "import plotly.express as px"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "from coal_emissions_monitoring.data_cleaning import (\n",
51 | " load_clean_gcpt_gdf,\n",
52 | " load_clean_campd_facilities_gdf,\n",
53 | " load_clean_campd_emissions_df,\n",
54 | " load_osm_data,\n",
55 | ")"
56 | ]
57 | },
58 | {
59 | "attachments": {},
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "### Parameters"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "# show all columns in pandas\n",
73 | "pd.set_option(\"display.max_columns\", None)"
74 | ]
75 | },
76 | {
77 | "attachments": {},
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "## Load data"
82 | ]
83 | },
84 | {
85 | "attachments": {},
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "### GEM Global Coal Plant Tracker"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "gcpt_df = load_clean_gcpt_gdf(\"/Users/adminuser/Downloads/Global-Coal-Plant-Tracker-January-2023.xlsx\")\n",
99 | "gcpt_df"
100 | ]
101 | },
102 | {
103 | "attachments": {},
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "### CAMPD facilities metadata"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "campd_facilities_df = load_clean_campd_facilities_gdf(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv\")\n",
117 | "campd_facilities_df"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "campd_facilities_df.capacity_mw.describe()"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "# find distance to the nearest facility\n",
136 | "for facility_id in campd_facilities_df.facility_id:\n",
137 | " campd_facilities_df.loc[\n",
138 | " campd_facilities_df.facility_id == facility_id,\n",
139 | " \"dist_to_nearest_facility\"\n",
140 | " ] = gpd.sjoin_nearest(\n",
141 | " campd_facilities_df.loc[campd_facilities_df.facility_id == facility_id],\n",
142 | " campd_facilities_df.loc[campd_facilities_df.facility_id != facility_id],\n",
143 | " distance_col=\"dist\",\n",
144 | " ).dist.min()\n",
145 | "campd_facilities_df.groupby(\"facility_id\").dist_to_nearest_facility.min().sort_values()"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "campd_facilities_df[campd_facilities_df.year == 2023].explore()"
155 | ]
156 | },
157 | {
158 | "attachments": {},
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "### CAMPD emissions data"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "campd_emissions_df = load_clean_campd_emissions_df(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/daily_emissions_facility_aggregation.csv\")\n",
172 | "campd_emissions_df"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "campd_emissions_df[\"year\"] = campd_emissions_df[\"date\"].dt.year\n",
182 | "yearly_emissions = campd_emissions_df.groupby(\"year\").co2_mass_short_tons.mean()\n",
183 | "yearly_emissions"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "px.line(campd_emissions_df, x=\"date\", y=\"co2_mass_short_tons\", color=\"facility_name\")"
193 | ]
194 | },
195 | {
196 | "attachments": {},
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "### OSM cooling_tower tag"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "osm_gdf = load_osm_data()\n",
210 | "osm_gdf"
211 | ]
212 | },
213 | {
214 | "attachments": {},
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "## Match data"
219 | ]
220 | },
221 | {
222 | "attachments": {},
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "### CAMPD facilities metadata and emissions"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {},
233 | "outputs": [],
234 | "source": [
235 | "campd_emissions_df[\"year\"] = pd.to_datetime(campd_emissions_df[\"date\"].dt.year, format=\"%Y\")\n",
236 | "campd_gdf = pd.merge(\n",
237 | " campd_facilities_df,\n",
238 | " campd_emissions_df,\n",
239 | " on=[\"facility_id\", \"year\"],\n",
240 | " how=\"inner\",\n",
241 | " suffixes=(\"_delete\", \"\"),\n",
242 | ")\n",
243 | "campd_gdf = campd_gdf.drop(columns=[col for col in campd_gdf.columns if \"_delete\" in col])\n",
244 | "campd_gdf"
245 | ]
246 | },
247 | {
248 | "attachments": {},
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "### CAMPD data and OSM cooling_tower tag"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "campd_ndt_gdf = gpd.sjoin_nearest(campd_gdf, osm_gdf, how=\"inner\", distance_col=\"distances\", max_distance=0.01)\n",
262 | "campd_ndt_gdf"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "campd_ndt_gdf.distances.describe()"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": [
280 | "ndt_plants = campd_ndt_gdf.facility_id.nunique()\n",
281 | "ndt_plants"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "metadata": {},
288 | "outputs": [],
289 | "source": []
290 | }
291 | ],
292 | "metadata": {
293 | "kernelspec": {
294 | "display_name": "ccai_ss23",
295 | "language": "python",
296 | "name": "python3"
297 | },
298 | "language_info": {
299 | "codemirror_mode": {
300 | "name": "ipython",
301 | "version": 3
302 | },
303 | "file_extension": ".py",
304 | "mimetype": "text/x-python",
305 | "name": "python",
306 | "nbconvert_exporter": "python",
307 | "pygments_lexer": "ipython3",
308 | "version": "3.10.9"
309 | },
310 | "orig_nbformat": 4
311 | },
312 | "nbformat": 4,
313 | "nbformat_minor": 2
314 | }
315 |
--------------------------------------------------------------------------------
/notebooks/02_satellite_imagery.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Satellite imagery collection and processing\n",
9 | "---\n",
10 | "\n",
11 | "Experimenting with filtering, downloading and displaying Sentinel 2 images from the [AWS STAC of Cloud-Optimized GeoTIFFs](https://registry.opendata.aws/sentinel-2-l2a-cogs/)"
12 | ]
13 | },
14 | {
15 | "attachments": {},
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## Setup"
20 | ]
21 | },
22 | {
23 | "attachments": {},
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "### Imports"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "from coal_emissions_monitoring.data_cleaning import (\n",
37 | " load_clean_campd_facilities_gdf,\n",
38 | " load_clean_campd_emissions_df,\n",
39 | " load_clean_image_metadata_df,\n",
40 | " get_final_dataset\n",
41 | ")\n",
42 | "from coal_emissions_monitoring.satellite_imagery import (\n",
43 | " create_aoi_for_plants,\n",
44 | " get_image_metadata_for_plants,\n",
45 | " get_image_from_cog\n",
46 | ")\n",
47 | "from coal_emissions_monitoring.data_viz import view_satellite_image"
48 | ]
49 | },
50 | {
51 | "attachments": {},
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "### Parameters"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "cloud_filter_percent = 25"
65 | ]
66 | },
67 | {
68 | "attachments": {},
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "## Load CAMPD data"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "campd_facilities_gdf = load_clean_campd_facilities_gdf(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv\")\n",
82 | "campd_facilities_gdf"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "campd_facilities_gdf = create_aoi_for_plants(campd_facilities_gdf)\n",
92 | "campd_facilities_gdf"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "campd_facilities_gdf.geometry.explore()"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "campd_emissions_df = load_clean_campd_emissions_df(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/daily_emissions_facility_aggregation.csv\")\n",
111 | "campd_emissions_df"
112 | ]
113 | },
114 | {
115 | "attachments": {},
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "## Filter emissions data to days when a cloudless image is available"
120 | ]
121 | },
122 | {
123 | "attachments": {},
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "### Get image metadata for every power plant"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "image_metadata_df = get_image_metadata_for_plants(campd_facilities_gdf, max_cloud_cover_prct=cloud_filter_percent)\n",
137 | "image_metadata_df.to_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/image_metadata.csv\", index=False)\n",
138 | "image_metadata_df"
139 | ]
140 | },
141 | {
142 | "attachments": {},
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "### Join with emissions data"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "df = get_final_dataset(\n",
156 | " image_metadata_path=\"/home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/image_metadata.csv\",\n",
157 | " campd_facilities_path=\"https://drive.google.com/file/d/1b-5BriZUiiv2r0wFLubccLQpd2xb5ysl/view?usp=share_link\",\n",
158 | " campd_emissions_path=\"https://drive.google.com/file/d/1oxZXR7GDcSXwwVoIjp66iS179cFVA5dP/view?usp=share_link\",\n",
159 | " cog_type=\"all\",\n",
160 | ")\n",
161 | "df"
162 | ]
163 | },
164 | {
165 | "attachments": {},
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "## Download and display images"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "image = get_image_from_cog(cog_url=image_metadata_df.cog_url.iloc[0], geometry=campd_facilities_gdf.geometry.iloc[0])"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "image.shape, image.min(), image.mean(), image.max()"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "view_satellite_image(image)"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": []
205 | }
206 | ],
207 | "metadata": {
208 | "kernelspec": {
209 | "display_name": "ccai_ss23",
210 | "language": "python",
211 | "name": "python3"
212 | },
213 | "language_info": {
214 | "codemirror_mode": {
215 | "name": "ipython",
216 | "version": 3
217 | },
218 | "file_extension": ".py",
219 | "mimetype": "text/x-python",
220 | "name": "python",
221 | "nbconvert_exporter": "python",
222 | "pygments_lexer": "ipython3",
223 | "version": "3.10.9"
224 | },
225 | "orig_nbformat": 4
226 | },
227 | "nbformat": 4,
228 | "nbformat_minor": 2
229 | }
--------------------------------------------------------------------------------
/notebooks/03_split_data_for_ml.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Split data for machine learning\n",
9 | "---\n",
10 | "\n",
11 | "Experimenting with splitting the data for machine learning model training."
12 | ]
13 | },
14 | {
15 | "attachments": {},
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## Setup"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Imports"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "import numpy as np\n",
36 | "import geopandas as gpd"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "from coal_emissions_monitoring.data_cleaning import get_final_dataset\n",
46 | "from coal_emissions_monitoring.ml_utils import get_facility_set_mapper, split_data_in_sets"
47 | ]
48 | },
49 | {
50 | "attachments": {},
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "### Parameters"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "train_val_ratio = 0.8\n",
64 | "test_data_year = 2023"
65 | ]
66 | },
67 | {
68 | "attachments": {},
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "## Load data"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "df = get_final_dataset(\n",
82 | " image_metadata_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/image_metadata.csv\",\n",
83 | " campd_facilities_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv\",\n",
84 | " campd_emissions_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/daily_emissions_facility_aggregation.csv\",\n",
85 | ")\n",
86 | "df"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "df.to_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/final_dataset.csv\", index=False)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "df.co2_mass_short_tons.value_counts()"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "df.isna().sum()"
114 | ]
115 | },
116 | {
117 | "attachments": {},
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "## Split data"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "facility_set_mapper = get_facility_set_mapper(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv\")\n",
131 | "df[\"data_set\"] = df.apply(lambda row: split_data_in_sets(row=row, data_set_mapper=facility_set_mapper, test_year=test_data_year), axis=1)\n",
132 | "df"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "df.data_set.value_counts()"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "df.data_set.value_counts() / df.shape[0]"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "for data_set in df.data_set.unique():\n",
160 | " print(data_set)\n",
161 | " print(df[df.data_set == data_set].ts.dt.year.value_counts())\n",
162 | " print()"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": []
171 | }
172 | ],
173 | "metadata": {
174 | "kernelspec": {
175 | "display_name": "ccai_ss23",
176 | "language": "python",
177 | "name": "python3"
178 | },
179 | "language_info": {
180 | "codemirror_mode": {
181 | "name": "ipython",
182 | "version": 3
183 | },
184 | "file_extension": ".py",
185 | "mimetype": "text/x-python",
186 | "name": "python",
187 | "nbconvert_exporter": "python",
188 | "pygments_lexer": "ipython3",
189 | "version": "3.10.9"
190 | },
191 | "orig_nbformat": 4
192 | },
193 | "nbformat": 4,
194 | "nbformat_minor": 2
195 | }
--------------------------------------------------------------------------------
/notebooks/04_dataset_loading.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Dataset loading\n",
9 | "---\n",
10 | "Experimenting with loading the PyTorch Lightning dataset and visualising its outputs."
11 | ]
12 | },
13 | {
14 | "attachments": {},
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Setup"
19 | ]
20 | },
21 | {
22 | "attachments": {},
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Imports"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "from coal_emissions_monitoring.dataset import CoalEmissionsDataModule\n",
36 | "from coal_emissions_monitoring.data_viz import view_satellite_image"
37 | ]
38 | },
39 | {
40 | "attachments": {},
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "## Create the dataset"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "data = CoalEmissionsDataModule(\n",
54 | " image_metadata_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/image_metadata.csv\",\n",
55 | " campd_facilities_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv\",\n",
56 | " campd_emissions_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/daily_emissions_facility_aggregation.csv\",\n",
57 | " batch_size=2,\n",
58 | " predownload_images=True,\n",
59 | " images_dir=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/images\",\n",
60 | ")\n",
61 | "data.setup(stage=\"fit\")"
62 | ]
63 | },
64 | {
65 | "attachments": {},
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "## Load some batches"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "for batch in data.train_dataloader():\n",
79 | " break\n",
80 | "print(f\"Keys in batch: {batch.keys()}\")\n",
81 | "print(f\"Image shape: {batch['image'].shape}\")"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "idx = 0\n",
91 | "print(f\"Target: {batch['target'][idx]}\")\n",
92 | "print(f\"Facility name: {batch['metadata']['facility_name'][idx]}\")\n",
93 | "print(f\"Timestamp: {batch['metadata']['ts'][idx]}\")\n",
94 | "view_satellite_image(batch[\"image\"][idx])"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "((batch[\"image\"][idx] <= 1) | (batch[\"image\"][idx].isnan())).sum() / batch[\"image\"][idx].numel()"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": []
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "ccai_ss23",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.10.9"
131 | },
132 | "orig_nbformat": 4
133 | },
134 | "nbformat": 4,
135 | "nbformat_minor": 2
136 | }
137 |
--------------------------------------------------------------------------------
/notebooks/05_images_download.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Images download\n",
9 | "---\n",
10 | "\n",
11 | "Download all images before training models."
12 | ]
13 | },
14 | {
15 | "attachments": {},
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## Setup"
20 | ]
21 | },
22 | {
23 | "attachments": {},
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "### Imports"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "from tqdm.auto import tqdm"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "from coal_emissions_monitoring.constants import ALL_BANDS\n",
46 | "from coal_emissions_monitoring.data_cleaning import get_final_dataset\n",
47 | "from coal_emissions_monitoring.satellite_imagery import fetch_image_path_from_cog"
48 | ]
49 | },
50 | {
51 | "attachments": {},
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "## Get final datase"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "df = get_final_dataset(\n",
65 | " image_metadata_path=\"/home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/image_metadata.csv\",\n",
66 | " campd_facilities_path=\"https://drive.google.com/file/d/1b-5BriZUiiv2r0wFLubccLQpd2xb5ysl/view?usp=share_link\",\n",
67 | " campd_emissions_path=\"https://drive.google.com/file/d/1oxZXR7GDcSXwwVoIjp66iS179cFVA5dP/view?usp=share_link\",\n",
68 | " cog_type=\"all\",\n",
69 | ")\n",
70 | "df"
71 | ]
72 | },
73 | {
74 | "attachments": {},
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "## Download images"
79 | ]
80 | },
81 | {
82 | "attachments": {},
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "### TCI (True Color Image)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "tqdm.pandas(desc=\"Downloading visual images\")\n",
96 | "df[\"local_image_path\"] = df.progress_apply(\n",
97 | " lambda row: fetch_image_path_from_cog(\n",
98 | " cog_url=row.cog_url,\n",
99 | " geometry=row.geometry,\n",
100 | " images_dir=\"/home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/visual/\",\n",
101 | " download_missing_images=True,\n",
102 | " ),\n",
103 | " axis=1,\n",
104 | ")"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "# compress all images into one file\n",
114 | "!tar -czvf /home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/visual_images.tar.gz /home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/visual"
115 | ]
116 | },
117 | {
118 | "attachments": {},
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "### All bands"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "tqdm.pandas(desc=\"Downloading all bands images\")\n",
132 | "df[\"local_image_all_bands_path\"] = df.progress_apply(\n",
133 | " lambda row: fetch_image_path_from_cog(\n",
134 | " cog_url=[row[band] for band in ALL_BANDS],\n",
135 | " geometry=row.geometry,\n",
136 | " cog_type=\"all\",\n",
137 | " images_dir=\"/home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/all_bands/\",\n",
138 | " download_missing_images=True,\n",
139 | " ),\n",
140 | " axis=1,\n",
141 | ")"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "# compress all images into one file\n",
151 | "!tar -czvf /home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/all_bands_images.tar.gz /home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/all_bands"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": []
160 | }
161 | ],
162 | "metadata": {
163 | "kernelspec": {
164 | "display_name": "ccai_ss23",
165 | "language": "python",
166 | "name": "python3"
167 | },
168 | "language_info": {
169 | "codemirror_mode": {
170 | "name": "ipython",
171 | "version": 3
172 | },
173 | "file_extension": ".py",
174 | "mimetype": "text/x-python",
175 | "name": "python",
176 | "nbconvert_exporter": "python",
177 | "pygments_lexer": "ipython3",
178 | "version": "3.10.10"
179 | },
180 | "orig_nbformat": 4
181 | },
182 | "nbformat": 4,
183 | "nbformat_minor": 2
184 | }
185 |
--------------------------------------------------------------------------------
/notebooks/06_model_training.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Model training\n",
9 | "---\n",
10 | "\n",
11 | "Experimenting with training some models over the dataset."
12 | ]
13 | },
14 | {
15 | "attachments": {},
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## Setup"
20 | ]
21 | },
22 | {
23 | "attachments": {},
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "### Imports"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "import timm\n",
37 | "from lightning import Trainer\n",
38 | "from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "from coal_emissions_monitoring.dataset import CoalEmissionsDataModule\n",
48 | "from coal_emissions_monitoring.model import CoalEmissionsModel, SmallCNN\n",
49 | "from coal_emissions_monitoring.transforms import efficientnet_transform"
50 | ]
51 | },
52 | {
53 | "attachments": {},
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "### Parameters"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "batch_size = 128\n",
67 | "crop_size = 52\n",
68 | "num_workers = 0\n",
69 | "learning_rate = 1e-3"
70 | ]
71 | },
72 | {
73 | "attachments": {},
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "## Create the dataset"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "data = CoalEmissionsDataModule(\n",
87 | " final_dataset_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/final_dataset.csv\",\n",
88 | " batch_size=batch_size,\n",
89 | " num_workers=num_workers,\n",
90 | " predownload_images=True,\n",
91 | " download_missing_images=False,\n",
92 | " images_dir=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/visual\",\n",
93 | " crop_size=crop_size,\n",
94 | ")"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "data.setup(\"fit\")"
104 | ]
105 | },
106 | {
107 | "attachments": {},
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "## Create the model"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "# model = timm.create_model(\"efficientnet_b0\", pretrained=True, num_classes=1)\n",
121 | "model = SmallCNN(num_input_channels=3, num_classes=1)"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "model = model.float().to(\"cpu\")"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "lit_model = CoalEmissionsModel(\n",
140 | " model=model,\n",
141 | " learning_rate=learning_rate,\n",
142 | " pos_weight=data.pos_weight,\n",
143 | ")"
144 | ]
145 | },
146 | {
147 | "attachments": {},
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "## Confirm that the model can be run on a batch of data"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "data.setup(stage=\"fit\")\n",
161 | "for batch in data.train_dataloader():\n",
162 | " break\n",
163 | "print(f\"Keys in batch: {batch.keys()}\")\n",
164 | "print(f\"Image shape: {batch['image'].shape}\")"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "y_pred = lit_model(batch[\"image\"])\n",
174 | "y_pred"
175 | ]
176 | },
177 | {
178 | "attachments": {},
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "## Check that the model can overfit a single batch"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "trainer = Trainer(\n",
192 | " max_epochs=1,\n",
193 | " callbacks=[\n",
194 | " EarlyStopping(monitor=\"val_loss\", mode=\"min\", patience=10),\n",
195 | " ModelCheckpoint(\n",
196 | " monitor=\"val_loss\",\n",
197 | " mode=\"min\",\n",
198 | " filename=\"{val_loss:2f}-{val_balanced_accuracy:.2f}-{epoch}-64crop_full_data\",\n",
199 | " save_top_k=1,\n",
200 | " dirpath=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/models/\",\n",
201 | " )\n",
202 | " ],\n",
203 | " limit_train_batches=round(0.1 * len(data.train_dataset.gdf) / batch_size),\n",
204 | " limit_val_batches=round(0.4 * len(data.val_dataset.gdf) / batch_size),\n",
205 | " reload_dataloaders_every_n_epochs=1,\n",
206 | " precision=\"16-mixed\",\n",
207 | " accelerator=\"cpu\",\n",
208 | " devices=1,\n",
209 | " log_every_n_steps=5,\n",
210 | " # overfit_batches=1,\n",
211 | ")\n",
212 | "trainer.fit(lit_model, data)"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "_ = trainer.test(\n",
222 | " model=lit_model,\n",
223 | " datamodule=data,\n",
224 | " ckpt_path=\"best\",\n",
225 | " verbose=True,\n",
226 | ")"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {},
233 | "outputs": [],
234 | "source": []
235 | }
236 | ],
237 | "metadata": {
238 | "kernelspec": {
239 | "display_name": "ccai_ss23",
240 | "language": "python",
241 | "name": "python3"
242 | },
243 | "language_info": {
244 | "codemirror_mode": {
245 | "name": "ipython",
246 | "version": 3
247 | },
248 | "file_extension": ".py",
249 | "mimetype": "text/x-python",
250 | "name": "python",
251 | "nbconvert_exporter": "python",
252 | "pygments_lexer": "ipython3",
253 | "version": "3.10.9"
254 | },
255 | "orig_nbformat": 4
256 | },
257 | "nbformat": 4,
258 | "nbformat_minor": 2
259 | }
260 |
--------------------------------------------------------------------------------
/notebooks/google_cooling_tower_on_off_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import geopandas as gpd\n",
11 | "from tqdm.auto import tqdm"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from coal_emissions_monitoring.satellite_imagery import (\n",
21 | " create_aoi_for_plants,\n",
22 | " get_image_metadata_for_plants,\n",
23 | " get_image_from_cog\n",
24 | ")\n",
25 | "from coal_emissions_monitoring.data_viz import view_satellite_image"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "df = pd.read_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/labeled_geospatial_data.csv\")\n",
35 | "df"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "df.is_powered_on.value_counts()"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# get unique combinations of lat/lon\n",
54 | "unique_coords = df[[\"lat\", \"lon\"]].drop_duplicates().reset_index(drop=True)\n",
55 | "unique_coords.reset_index(inplace=True)\n",
56 | "unique_coords.set_index([\"lat\", \"lon\"], inplace=True)\n",
57 | "unique_coords = unique_coords[\"index\"].to_dict()\n",
58 | "unique_coords"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "# set an epsg code for each unique lat/lon\n",
68 | "df[\"facility_id\"] = df.apply(\n",
69 | " lambda x: unique_coords[(x[\"lat\"], x[\"lon\"])], axis=1\n",
70 | ")\n",
71 | "df"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "df.facility_id.value_counts()"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "df.rename(columns={\"lat\": \"latitude\", \"lon\": \"longitude\"}, inplace=True)\n",
90 | "df"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "df.rename(columns={\"timestamp\": \"ts\"}, inplace=True)\n",
100 | "df.ts = pd.to_datetime(df.ts)\n",
101 | "df.dtypes"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs=\"EPSG:4326\")\n",
111 | "gdf"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "gdf = create_aoi_for_plants(gdf)\n",
121 | "gdf"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "gdf.geometry.explore()"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "# image_metadata_df = get_image_metadata_for_plants(\n",
140 | "# gdf,\n",
141 | "# start_date=gdf.ts.min(),\n",
142 | "# end_date=gdf.ts.max(),\n",
143 | "# max_cloud_cover_prct=50,\n",
144 | "# )\n",
145 | "image_metadata_df = pd.read_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/image_metadata.csv\")\n",
146 | "image_metadata_df.ts = pd.to_datetime(image_metadata_df.ts)\n",
147 | "image_metadata_df"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "# filter the image metadata to match the day of each row of gdf\n",
157 | "image_metadata_df[\"date\"] = image_metadata_df.ts.dt.date\n",
158 | "gdf[\"date\"] = gdf.ts.dt.date\n",
159 | "image_metadata_df = image_metadata_df.merge(\n",
160 | " gdf[[\"facility_id\", \"date\"]], on=[\"facility_id\", \"date\"]\n",
161 | ")\n",
162 | "image_metadata_df"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "image_metadata_df.to_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/image_metadata.csv\", index=False)"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "gdf.merge(\n",
181 | " image_metadata_df.drop(columns=[\"ts\"]),\n",
182 | " on=[\"facility_id\", \"date\"]\n",
183 | ").to_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/all_urls_dataset.csv\", index=False)"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "gdf = gdf.merge(\n",
193 | " image_metadata_df[[\"facility_id\", \"date\", \"cloud_cover\", \"visual\"]],\n",
194 | " on=[\"facility_id\", \"date\"]\n",
195 | ")\n",
196 | "gdf.rename(columns={\"visual\": \"cog_url\"}, inplace=True)\n",
197 | "gdf.drop(columns=[\"date\"], inplace=True)\n",
198 | "gdf"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "gdf.sort_values(by=[\"facility_id\", \"ts\"], inplace=True)\n",
208 | "gdf.to_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/final_dataset.csv\", index=False)"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": null,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "row = gdf.iloc[0]\n",
218 | "image = get_image_from_cog(\n",
219 | " cog_url=row.cog_url,\n",
220 | " geometry=row.geometry,\n",
221 | " size=64,\n",
222 | ")\n",
223 | "image.shape"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "view_satellite_image(image)"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {},
239 | "outputs": [],
240 | "source": []
241 | }
242 | ],
243 | "metadata": {
244 | "kernelspec": {
245 | "display_name": "ccai_ss23",
246 | "language": "python",
247 | "name": "python3"
248 | },
249 | "language_info": {
250 | "codemirror_mode": {
251 | "name": "ipython",
252 | "version": 3
253 | },
254 | "file_extension": ".py",
255 | "mimetype": "text/x-python",
256 | "name": "python",
257 | "nbconvert_exporter": "python",
258 | "pygments_lexer": "ipython3",
259 | "version": "3.10.9"
260 | },
261 | "orig_nbformat": 4
262 | },
263 | "nbformat": 4,
264 | "nbformat_minor": 2
265 | }
266 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | overpy==0.6
2 | pandas==1.5.3
3 | geopandas==0.12.2
4 | openpyxl==3.1.2
5 | requests==2.28.2
6 | folium==0.14.0
7 | mapclassify==2.5.0
8 | matplotlib==3.7.1
9 | plotly==5.14.1
10 | nbformat==5.8.0
11 | pystac-client==0.6.1
12 | rasterio==1.3.6
13 | loguru==0.6.0
14 | tqdm==4.65.0
15 | black==23.3.0
16 | flake8==6.0.0
17 | torch==2.0.0
18 | lightning==2.0.1.post0
19 | kornia==0.6.11
20 | timm==0.6.13
21 | backoff==2.2.1
--------------------------------------------------------------------------------
/scripts/download_images.py:
--------------------------------------------------------------------------------
1 | # %% [markdown]
2 | # # Images download
3 | # ---
4 | #
5 | # Download all images before training models.
6 |
7 | # %% [markdown]
8 | # ## Setup
9 |
10 | # %% [markdown]
11 | # ### Imports
12 |
13 | # %%
14 | import os
15 | from tqdm.auto import tqdm
16 |
17 | # %%
18 | from coal_emissions_monitoring.constants import ALL_BANDS, MAIN_COLUMNS
19 | from coal_emissions_monitoring.data_cleaning import load_final_dataset
20 | from coal_emissions_monitoring.satellite_imagery import fetch_image_path_from_cog
21 |
22 | # %% [markdown]
23 | # ## Get final dataset
24 |
25 | # %%
26 | gdf = load_final_dataset(
27 | "/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/all_urls_dataset.csv"
28 | )
29 |
30 | # %% [markdown]
31 | # ## Download images
32 |
33 | # %% [markdown]
34 | # ### TCI (True Color Image)
35 |
36 | # %%
37 | tqdm.pandas(desc="Downloading visual images")
38 | gdf["local_image_path"] = gdf.progress_apply(
39 | lambda row: fetch_image_path_from_cog(
40 | cog_url=row.visual,
41 | geometry=row.geometry,
42 | cog_type="visual",
43 | images_dir="/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/visual/",
44 | download_missing_images=True,
45 | ),
46 | axis=1,
47 | )
48 |
49 | # %%
50 | path = "/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/"
51 | os.makedirs(path, exist_ok=True)
52 | gdf.rename(columns={"visual": "cog_url"})[MAIN_COLUMNS + ["local_image_path"]].to_csv(
53 | f"{path}final_dataset.csv",
54 | index=False,
55 | )
56 |
57 | # %%
58 | # compress all images into one file
59 | os.system(
60 | "tar -czvf /Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/visual_images.tar.gz /Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/visual"
61 | )
62 |
63 | # %% [markdown]
64 | # ### All bands
65 |
66 | # %%
67 | tqdm.pandas(desc="Downloading all bands images")
68 | gdf["local_image_all_bands_path"] = gdf.progress_apply(
69 | lambda row: fetch_image_path_from_cog(
70 | cog_url=[row[band] for band in ALL_BANDS],
71 | geometry=row.geometry,
72 | size=32, # smaller images to make the download faster
73 | cog_type="all",
74 | images_dir="/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/all_bands/",
75 | download_missing_images=True,
76 | ),
77 | axis=1,
78 | )
79 |
80 | # %%
81 | # compress all images into one file
82 | os.system(
83 | "tar -czvf /Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/all_bands_images.tar.gz /Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/all_bands"
84 | )
85 |
86 | # %%
87 | path = "/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/"
88 | os.makedirs(path, exist_ok=True)
89 | gdf.rename(columns={"visual": "cog_url"})[
90 | MAIN_COLUMNS + ["local_image_path", "local_image_all_bands_path"]
91 | ].to_csv(
92 | f"{path}final_dataset.csv",
93 | index=False,
94 | )
95 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = coal-emissions-monitoring
3 | version = 0.0.1
4 | description = A data science project to monitor coal power emissions
5 | long_description = file: README.md
6 | long_description_content_type = text/markdown
7 | url = https://github.com/AndreCNF/ccai-ss23-ai-monitoring-tutorial
8 |
9 | [options]
10 | package_dir =
11 | = src
12 | packages = find:
13 | python_requires = >=3.7
14 | install_requires =
15 | overpy
16 | pandas
17 | geopandas
18 | openpyxl
19 | requests
20 | folium
21 | mapclassify
22 | matplotlib
23 | plotly
24 | nbformat>=4.2.0
25 | pystac-client
26 | rasterio
27 | loguru
28 | tqdm
29 | torch
30 | lightning
31 | kornia
32 | timm
33 | backoff
34 |
35 | [options.packages.find]
36 | where=src
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | use_scm_version=True,
5 | setup_requires=["setuptools_scm"],
6 | )
7 |
--------------------------------------------------------------------------------
/src/coal_emissions_monitoring/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreCNF/ccai-ss23-ai-monitoring-tutorial/8b7ba2d2b11175c8f12f87b22a7d18acb6c8628e/src/coal_emissions_monitoring/__init__.py
--------------------------------------------------------------------------------
/src/coal_emissions_monitoring/constants.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | GLOBAL_EPSG = 4326
4 | API_URL = "https://earth-search.aws.element84.com/v0"
5 | COLLECTION = "sentinel-s2-l2a-cogs" # Sentinel-2, Level 2A, COGs
6 | AOI_SIZE_METERS = 640  # a 640 m square AOI is 64 px at Sentinel-2's 10 m/pixel resolution
7 | IMAGE_SIZE_PX = 64
8 | CROP_SIZE_PX = 52
9 | START_DATE = datetime(year=2016, month=1, day=1)
10 | END_DATE = datetime(year=2019, month=12, day=31)
11 | MAX_DARK_FRAC = 0.5
12 | MAX_BRIGHT_MEAN = 250
13 | MAX_CLOUD_COVER_PRCT = 50
14 | TRAIN_VAL_RATIO = 0.8
15 | TEST_YEAR = 2020
16 | BATCH_SIZE = 32
17 | MAIN_COLUMNS = [
18 | "facility_id",
19 | "latitude",
20 | "longitude",
21 | "ts",
22 | "is_powered_on",
23 | "cloud_cover",
24 | "cog_url",
25 | "geometry",
26 | ]
27 | ALL_BANDS = [
28 | "b01",
29 | "b02",
30 | "b03",
31 | "b04",
32 | "b05",
33 | "b06",
34 | "b07",
35 | "b08",
36 | "b8a",
37 | "b09",
38 | "b11",
39 | "b12",
40 | ]
41 | EMISSIONS_TARGET = "is_powered_on"
42 | EMISSIONS_CATEGORIES = {
43 | 0: "no_emissions",
44 | 1: "low",
45 | 2: "medium",
46 | 3: "high",
47 | 4: "very_high",
48 | }
49 | RANDOM_TRANSFORM_PROB = 0.5
50 | POSITIVE_THRESHOLD = 0.5
51 |
--------------------------------------------------------------------------------
/src/coal_emissions_monitoring/data_cleaning.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import warnings
3 | from typing import Callable, Optional, Union
4 |
5 | import pandas as pd
6 | import geopandas as gpd
7 | import overpy
8 |
9 | from coal_emissions_monitoring.constants import ALL_BANDS, GLOBAL_EPSG, MAIN_COLUMNS
10 | from coal_emissions_monitoring.satellite_imagery import create_aoi_for_plants
11 |
12 | OSM_API = overpy.Overpass()
13 |
14 | # suppress geopandas CRS warning as we don't need to worry too much about
15 | # the precision of distances
16 | warnings.filterwarnings("ignore", message="Geometry is in a geographic CRS*")
17 | # suppress pandas warning of setting value in copy
18 | warnings.filterwarnings("ignore", message="A value is trying to be set on a copy*")
19 | # suppress pandas warning on regex
20 | warnings.filterwarnings("ignore", message="The default value of regex will change*")
21 |
22 |
23 | def clean_column_names(
24 | df: Union[pd.DataFrame, gpd.GeoDataFrame]
25 | ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
26 | """
27 | Clean column names in a data frame.
28 |
29 | Args:
30 | df (Union[pd.DataFrame, gpd.GeoDataFrame]):
31 | Data frame to clean
32 |
33 | Returns:
34 | df (Union[pd.DataFrame, gpd.GeoDataFrame]):
35 | Cleaned data frame
36 | """
37 | df.columns = (
38 | df.columns.str.lower()
39 | .str.replace(" ", "_")
40 | .str.replace("(", "")
41 | .str.replace(")", "")
42 | .str.replace("/", "_")
43 | .str.replace("-", "_")
44 | .str.replace(",", "_")
45 | )
46 | return df
47 |
48 |
49 | def fix_google_drive_url(url: str) -> str:
50 | """
51 | Fix a Google Drive URL.
52 |
53 | Args:
54 | url (str):
55 | URL to fix
56 |
57 | Returns:
58 | url (str):
59 | Fixed URL
60 | """
61 | assert url.startswith(
62 | "https://drive.google.com/file/d/"
63 | ), "URL must start with https://drive.google.com/file/d/"
64 | return "https://drive.google.com/uc?id=" + url.split("/")[-2]
65 |
66 |
67 | def load_csv(path: str) -> pd.DataFrame:
68 | """
69 | Load a CSV file.
70 |
71 | Args:
72 | path (str):
73 | Path to CSV file
74 |
75 | Returns:
76 | df (pd.DataFrame):
77 | Data frame
78 | """
79 | if path.startswith("https://drive.google.com/file/d/"):
80 | return pd.read_csv(fix_google_drive_url(path))
81 | else:
82 | return pd.read_csv(path)
83 |
84 |
85 | def load_clean_data_df(
86 | data_path: Union[str, Path],
87 | load_func: Optional[Callable] = load_csv,
88 | clean_func: Optional[Callable] = clean_column_names,
89 | ) -> pd.DataFrame:
90 | """
91 | Load and clean a data frame.
92 |
93 | Args:
94 | data_path (Union[str, Path]):
95 | Path to data
96 | load_func (Optional[Callable]):
97 | Function to load data
98 | clean_func (Optional[Callable]):
99 | Function to clean data
100 |
101 | Returns:
102 | df (pd.DataFrame):
103 | Cleaned data frame
104 | """
105 | df = load_func(data_path)
106 | df = clean_func(df)
107 | return df
108 |
109 |
110 | def load_clean_data_gdf(
111 | data_path: Union[str, Path],
112 | load_func: Optional[Callable] = load_csv,
113 | clean_func: Optional[Callable] = clean_column_names,
114 | ) -> gpd.GeoDataFrame:
115 | """
116 | Load and clean a data frame, outputting it as a GeoDataFrame.
117 |
118 | Args:
119 | data_path (Union[str, Path]):
120 | Path to data
121 | load_func (Optional[Callable]):
122 | Function to load data
123 | clean_func (Optional[Callable]):
124 | Function to clean data
125 |
126 | Returns:
127 | gdf (gpd.GeoDataFrame):
128 | Cleaned data frame
129 | """
130 | df = load_clean_data_df(
131 | data_path=data_path, load_func=load_func, clean_func=clean_func
132 | )
133 | gdf = gpd.GeoDataFrame(
134 | df,
135 | geometry=gpd.points_from_xy(
136 | df["longitude"],
137 | df["latitude"],
138 | ),
139 | crs=f"EPSG:{GLOBAL_EPSG}",
140 | )
141 | return gdf
142 |
143 |
144 | def load_raw_gcpt_data(gcpt_path: Union[str, Path]) -> pd.DataFrame:
145 | """
146 |     Load GCPT data in its raw Excel format from a local path or URL.
147 |
148 | Returns:
149 | df (pd.DataFrame):
150 | GCPT data frame
151 | """
152 | df = pd.read_excel(
153 | gcpt_path,
154 | sheet_name="Units",
155 | )
156 | return df
157 |
158 |
159 | def clean_gcpt(df: pd.DataFrame) -> pd.DataFrame:
160 | """
161 | Clean the GCPT data frame, setting better column names.
162 |
163 | Args:
164 | df (pd.DataFrame):
165 | GCPT data frame
166 |
167 | Returns:
168 | df (pd.DataFrame):
169 | Cleaned GCPT data frame
170 | """
171 | df = clean_column_names(df)
172 | df.rename(columns={"parentid": "parent_id"}, inplace=True)
173 | df.rename(columns={"trackerloc": "tracker_loc"}, inplace=True)
174 | return df
175 |
176 |
177 | def load_clean_gcpt_gdf(gcpt_path: Union[str, Path]) -> gpd.GeoDataFrame:
178 | """
179 | Load and clean the GCPT data frame.
180 |
181 | Args:
182 | gcpt_path (Union[str, Path]):
183 | Path to GCPT data
184 |
185 | Returns:
186 | gdf (gpd.GeoDataFrame):
187 | Cleaned GCPT data frame
188 | """
189 | return load_clean_data_gdf(
190 | data_path=gcpt_path, load_func=load_raw_gcpt_data, clean_func=clean_gcpt
191 | )
192 |
193 |
194 | def clean_campd_facilities(df: pd.DataFrame) -> pd.DataFrame:
195 | """
196 | Clean the CAMPD facilities data frame.
197 |
198 | Args:
199 | df (pd.DataFrame):
200 | CAMPD facilities data frame
201 |
202 | Returns:
203 | df (pd.DataFrame):
204 | Cleaned CAMPD facilities data frame
205 | """
206 | df = clean_column_names(df)
207 | # get the capacity
208 | df["capacity_mw"] = (
209 | df["associated_generators_&_nameplate_capacity_mwe"]
210 | .str.split(" ")
211 | .str[-1]
212 | .str.replace("(", "")
213 | .str.replace(")", "")
214 | .astype(float)
215 | )
216 | # filter to operating units
217 | df = df[(df.operating_status == "Operating") & (df.capacity_mw > 0)]
218 | # aggregate by facility
219 | df = df.groupby(["facility_id", "year"]).agg(
220 | {
221 | "capacity_mw": "sum",
222 | "facility_name": "first",
223 | "latitude": "mean",
224 | "longitude": "mean",
225 | }
226 | )
227 | # rearrange columns
228 | df = df.reset_index()[
229 | ["facility_id", "facility_name", "year", "capacity_mw", "latitude", "longitude"]
230 | ]
231 | # fix datetime column data type
232 | df.year = pd.to_datetime(df.year, format="%Y")
233 | return df
234 |
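# Worked example (editor's addition) of the capacity parsing above, using a
# made-up cell value: "GEN1 (605.4)" is split on spaces, the last token
# "(605.4)" has its parentheses stripped, and the result is cast to float,
# yielding 605.4 MW for that unit before the per-facility sum.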
235 |
236 | def load_clean_campd_facilities_gdf(
237 | campd_facilities_path: Union[str, Path]
238 | ) -> gpd.GeoDataFrame:
239 | """
240 | Load and clean the CAMPD facilities data frame.
241 |
242 | Args:
243 | campd_facilities_path (Union[str, Path]):
244 | Path to CAMPD facilities data
245 |
246 | Returns:
247 | gdf (gpd.GeoDataFrame):
248 | Cleaned CAMPD facilities data frame
249 | """
250 | return load_clean_data_gdf(
251 | data_path=campd_facilities_path,
252 | load_func=load_csv,
253 | clean_func=clean_campd_facilities,
254 | )
255 |
256 |
257 | def clean_campd_emissions(df: pd.DataFrame) -> pd.DataFrame:
258 | """
259 | Clean the CAMPD emissions data frame.
260 |
261 | Args:
262 | df (pd.DataFrame):
263 | CAMPD emissions data frame
264 |
265 | Returns:
266 | df (pd.DataFrame):
267 | Cleaned CAMPD emissions data frame
268 | """
269 | df = clean_column_names(df)
270 | # fix datetime column data type
271 | df.date = pd.to_datetime(df.date)
272 | # fill missing values (missing emissions appear to correspond to a value of 0)
273 | df = df.fillna(0)
274 | return df
275 |
276 |
277 | def load_clean_campd_emissions_df(
278 | campd_emissions_path: Union[str, Path]
279 | ) -> pd.DataFrame:
280 | """
281 | Load and clean the CAMPD emissions data frame.
282 |
283 | Args:
284 | campd_emissions_path (Union[str, Path]):
285 | Path to CAMPD emissions data
286 |
287 | Returns:
288 | df (pd.DataFrame):
289 | Cleaned CAMPD emissions data frame
290 | """
291 | return load_clean_data_df(
292 | data_path=campd_emissions_path,
293 | load_func=load_csv,
294 | clean_func=clean_campd_emissions,
295 | )
296 |
297 |
298 | def load_osm_data(
299 | country: str = "United States", tag: str = "man_made", value: str = "cooling_tower"
300 | ) -> gpd.GeoDataFrame:
301 | """
302 | Load OSM data.
303 |
304 | Args:
305 | country (str):
306 | Country to filter to
307 | tag (str):
308 | OSM tag to filter to
309 | value (str):
310 | OSM value to filter to
311 |
312 | Returns:
313 | gdf (gpd.GeoDataFrame):
314 | OSM features data frame (cooling towers by default)
315 | """
316 | # load the data
317 | osm_results = OSM_API.query(
318 | query=f"""
319 | area[name="{country}"]->.searchArea;
320 | (
321 | node["{tag}"="{value}"](area.searchArea);
322 | way["{tag}"="{value}"](area.searchArea);
323 | relation["{tag}"="{value}"](area.searchArea);
324 | );
325 | out body;
326 | >;
327 | out skel qt;
328 | """
329 | )
330 | df = pd.DataFrame(
331 | [
332 | {
333 | "osm_id": element.id,
334 | "latitude": element.lat,
335 | "longitude": element.lon,
336 | }
337 | for element in osm_results.nodes
338 | ]
339 | )
340 | # convert to geodataframe
341 | gdf = gpd.GeoDataFrame(
342 | df,
343 | geometry=gpd.points_from_xy(df.longitude, df.latitude),
344 | crs="EPSG:4326",
345 | )
346 | return gdf
347 |
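# Usage sketch (editor's addition): this queries the Overpass API through
# the module-level OSM_API client, so it needs network access and can be
# slow for large countries. The country name below is just an example.
#
# >>> towers = load_osm_data(country="Germany")
# >>> towers.columns.tolist()
# ['osm_id', 'latitude', 'longitude', 'geometry']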
348 |
349 | def filter_to_cooling_tower_plants(
350 | gdf: gpd.GeoDataFrame,
351 | campd_facilities_path: Union[str, Path],
352 | ) -> gpd.GeoDataFrame:
353 | """
354 | Filter data to plants with cooling towers.
355 |
356 | Args:
357 | gdf (gpd.GeoDataFrame):
358 | Data to be filtered
359 | campd_facilities_path (Union[str, Path]):
360 | Path to CAMPD facilities data
361 |
362 | Returns:
363 | gdf (gpd.GeoDataFrame):
364 | Filtered data
365 | """
366 | # load the CAMPD facilities data
367 | campd_facilities_gdf = load_clean_campd_facilities_gdf(campd_facilities_path)
368 | # load the OSM data
369 | osm_gdf = load_osm_data()
370 | # spatial join
371 | campd_ndt_gdf = gpd.sjoin_nearest(
372 | campd_facilities_gdf,
373 | osm_gdf,
374 | how="inner",
375 | distance_col="distances",
376 | max_distance=0.01,
377 | )
378 | # filter to plants with cooling towers
379 | gdf = gdf[gdf.facility_id.isin(campd_ndt_gdf.facility_id)]
380 | return gdf
381 |
382 |
383 | def clean_image_metadata(df: pd.DataFrame, cog_type: str = "visual") -> pd.DataFrame:
384 | """
385 | Clean the image metadata data frame.
386 |
387 | Args:
388 | df (pd.DataFrame):
389 | Image metadata data frame
390 | cog_type (str):
391 | Type of COG to filter to. If "all", no filtering is done.
392 |
393 | Returns:
394 | df (pd.DataFrame):
395 | Cleaned image metadata data frame
396 | """
397 | df = clean_column_names(df)
398 | # fix datetime column data type
399 | df.ts = pd.to_datetime(df.ts)
400 | # filter to most relevant columns
401 | if cog_type != "all":
402 | df.rename(columns={cog_type: "cog_url"}, inplace=True)
403 | df = df[["facility_id", "ts", "cloud_cover", "cog_url"]]
404 | else:
405 | df = df[
406 | [
407 | "facility_id",
408 | "ts",
409 | "cloud_cover",
410 | "visual",
411 | ]
412 | + ALL_BANDS
413 | ]
414 | return df
415 |
416 |
417 | def load_clean_image_metadata_df(
418 | image_metadata_path: Union[str, Path], cog_type: str = "visual"
419 | ) -> pd.DataFrame:
420 | """
421 | Load and clean the image metadata data frame.
422 |
423 | Args:
424 | image_metadata_path (Union[str, Path]):
425 | Path to image metadata data
426 | cog_type (str):
427 | Type of COG to filter to
428 |
429 | Returns:
430 | df (pd.DataFrame):
431 | Cleaned image metadata data frame
432 | """
433 | return load_clean_data_df(
434 | data_path=image_metadata_path,
435 | load_func=load_csv,
436 | clean_func=lambda df: clean_image_metadata(df, cog_type=cog_type),
437 | )
438 |
439 |
440 | def get_final_dataset(
441 | image_metadata_path: Union[str, Path],
442 | campd_facilities_path: Union[str, Path],
443 | campd_emissions_path: Union[str, Path],
444 | cog_type: str = "visual",
445 | ) -> gpd.GeoDataFrame:
446 | """
447 | Get the final dataset that has the facility and image metadata, as well as
448 | the emissions data that we'll train models on.
449 |
450 | Args:
451 | image_metadata_path (Union[str, Path]):
452 | Path to image metadata data
453 | campd_facilities_path (Union[str, Path]):
454 | Path to CAMPD facilities data
455 | campd_emissions_path (Union[str, Path]):
456 | Path to CAMPD emissions data
457 | cog_type (str):
458 | Type of COG to filter to. If "all", no filtering is done.
459 |
460 | Returns:
461 | gdf (gpd.GeoDataFrame):
462 | Final dataset that has the facility and image metadata, as well as
463 | the emissions data that we'll train models on
464 | """
465 | # load all data
466 | image_metadata_df = load_clean_image_metadata_df(
467 | image_metadata_path=image_metadata_path, cog_type=cog_type
468 | )
469 | campd_facilities_gdf = load_clean_campd_facilities_gdf(
470 | campd_facilities_path=campd_facilities_path
471 | )
472 | campd_facilities_gdf = create_aoi_for_plants(campd_facilities_gdf)
473 | campd_emissions_df = load_clean_campd_emissions_df(
474 | campd_emissions_path=campd_emissions_path
475 | )
476 | # drop the time-of-day component so the two tables can be joined by calendar date
477 | image_metadata_df["date_without_time"] = image_metadata_df["ts"].dt.date
478 | campd_emissions_df["date_without_time"] = campd_emissions_df["date"].dt.date
479 | # merge the emissions with image metadata
480 | merged_df = pd.merge(
481 | left=campd_emissions_df,
482 | right=image_metadata_df,
483 | how="inner",
484 | on=["facility_id", "date_without_time"],
485 | )
486 | # merge the facilities with the merged emissions and image metadata
487 | merged_df = pd.merge(
488 | left=merged_df,
489 | right=campd_facilities_gdf,
490 | how="inner",
491 | on="facility_id",
492 | suffixes=("", "_to_delete"),
493 | )
494 | # filter to the columns that we care about for model training
495 | if cog_type != "all":
496 | final_columns = MAIN_COLUMNS + ["cog_url"]
497 | else:
498 | final_columns = MAIN_COLUMNS + ["visual"] + ALL_BANDS
499 | merged_df = merged_df[final_columns]
500 | merged_df.drop_duplicates(["facility_id", "ts"], inplace=True)
501 | # make sure that it's in geopandas format
502 | merged_df = gpd.GeoDataFrame(
503 | merged_df,
504 | geometry=merged_df.geometry,
505 | crs=f"EPSG:{GLOBAL_EPSG}",
506 | )
507 | return merged_df
508 |
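# Usage sketch (editor's addition): the three paths below are hypothetical;
# each may be a local file or a Google Drive sharing link (see load_csv).
#
# >>> gdf = get_final_dataset(
# ...     image_metadata_path="image_metadata.csv",
# ...     campd_facilities_path="campd_facilities.csv",
# ...     campd_emissions_path="campd_emissions.csv",
# ... )
# >>> gdf.head()  # one row per (facility_id, ts) pair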
509 |
510 | def clean_final_dataset(df: pd.DataFrame) -> gpd.GeoDataFrame:
511 | """
512 | Clean the final dataset that has the facility and image metadata, as well as
513 | the emissions data that we'll train models on.
514 |
515 | Args:
516 | df (pd.DataFrame):
517 | Final dataset that has the facility and image metadata, as well as
518 | the emissions data that we'll train models on
519 |
520 | Returns:
521 | gdf (gpd.GeoDataFrame):
522 | Cleaned final dataset that has the facility and image metadata, as
523 | well as the emissions data that we'll train models on
524 | """
525 | # fix datetime column data type
526 | df.ts = pd.to_datetime(df.ts)
527 | # fix geometry column data type
528 | df.geometry = gpd.GeoSeries.from_wkt(df.geometry)
529 | gdf = gpd.GeoDataFrame(df, geometry=df.geometry, crs=f"EPSG:{GLOBAL_EPSG}")
530 | return gdf
531 |
532 |
533 | def load_final_dataset(final_dataset_path: Union[str, Path]) -> gpd.GeoDataFrame:
534 | """
535 | Load the final dataset that has the facility and image metadata, as well as
536 | the emissions data that we'll train models on.
537 |
538 | Args:
539 | final_dataset_path (Union[str, Path]):
540 | Path to the final dataset
541 |
542 | Returns:
543 | gdf (gpd.GeoDataFrame):
544 | Final dataset that has the facility and image metadata, as well as
545 | the emissions data that we'll train models on
546 | """
547 | return load_clean_data_df(
548 | data_path=final_dataset_path,
549 | load_func=load_csv,
550 | clean_func=clean_final_dataset,
551 | )
552 |
--------------------------------------------------------------------------------
/src/coal_emissions_monitoring/data_viz.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | import numpy as np
3 | import torch
4 | from plotly.graph_objs import Figure
5 | import plotly.express as px
6 |
7 |
8 | def view_satellite_image(image: Union[np.ndarray, torch.Tensor]) -> Figure:
9 | """
10 | View a satellite image using plotly
11 |
12 | Args:
13 | image (Union[np.ndarray, torch.Tensor]):
14 | The satellite image, with dimensions (C, H, W)
15 |
16 | Returns:
17 | Figure:
18 | The plotly figure
19 | """
20 | if isinstance(image, torch.Tensor):
21 | image = image.numpy()
22 | fig = px.imshow(image.transpose(1, 2, 0), zmin=0, zmax=255)
23 | # remove padding
24 | fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
25 | return fig
26 |
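# Usage sketch (editor's addition): a random (C, H, W) uint8 array stands in
# for a real satellite image here.
#
# >>> img = np.random.randint(0, 256, size=(3, 64, 64), dtype=np.uint8)
# >>> view_satellite_image(img).show()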
--------------------------------------------------------------------------------
/src/coal_emissions_monitoring/dataset.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Optional, Union
3 | from loguru import logger
4 | import warnings
5 | import numpy as np
6 | import torch
7 | from torch.utils.data import IterableDataset, DataLoader
8 | from lightning import LightningDataModule
9 | import geopandas as gpd
10 | from tqdm.auto import tqdm
11 |
12 | # suppress batch size warning
13 | warnings.filterwarnings(
14 | "ignore", message="Trying to infer the `batch_size` from an ambiguous*"
15 | )
16 |
17 | from coal_emissions_monitoring.constants import (
18 | BATCH_SIZE,
19 | CROP_SIZE_PX,
20 | EMISSIONS_TARGET,
21 | MAIN_COLUMNS,
22 | IMAGE_SIZE_PX,
23 | MAX_BRIGHT_MEAN,
24 | MAX_CLOUD_COVER_PRCT,
25 | MAX_DARK_FRAC,
26 | TEST_YEAR,
27 | TRAIN_VAL_RATIO,
28 | )
29 | from coal_emissions_monitoring.satellite_imagery import (
30 | fetch_image_path_from_cog,
31 | get_image_from_cog,
32 | is_image_too_bright,
33 | is_image_too_dark,
34 | )
35 | from coal_emissions_monitoring.data_cleaning import (
36 | filter_to_cooling_tower_plants,
37 | get_final_dataset,
38 | load_final_dataset,
39 | )
40 | from coal_emissions_monitoring.ml_utils import (
41 | emissions_to_category,
42 | get_facility_set_mapper,
43 | split_data_in_sets,
44 | )
45 | from coal_emissions_monitoring.transforms import get_transform
46 |
47 |
48 | class CoalEmissionsDataset(IterableDataset):
49 | def __init__(
50 | self,
51 | gdf: gpd.GeoDataFrame,
52 | target: str = EMISSIONS_TARGET,
53 | image_size: int = IMAGE_SIZE_PX,
54 | max_dark_frac: float = MAX_DARK_FRAC,
55 | max_mean_val: float = MAX_BRIGHT_MEAN,
56 | transforms: Optional[torch.nn.Module] = None,
57 | use_local_images: bool = False,
58 | ):
59 | """
60 | Dataset that gets images of coal power plants, their emissions
61 | and metadata.
62 |
63 | Args:
64 | gdf (gpd.GeoDataFrame):
65 | A GeoDataFrame with the following columns:
66 | - facility_id
67 | - latitude
68 | - longitude
69 | - ts
70 | - is_powered_on
71 | - cloud_cover
72 | - cog_url
73 | - geometry
74 | target (str):
75 | The target column to predict
76 | image_size (int):
77 | The size of the image in pixels
78 | max_dark_frac (float):
79 | The maximum fraction of dark pixels allowed for an image;
80 | if the image has more dark pixels than this, it is skipped
81 | max_mean_val (float):
82 | The maximum mean value allowed for an image; if the image
83 | has a higher mean value than this, it is skipped
84 | transforms (Optional[torch.nn.Module]):
85 | A PyTorch module that transforms the image
86 | use_local_images (bool):
87 | Whether to use local images instead of downloading them
88 | from the cloud
89 | """
90 | assert len(set(MAIN_COLUMNS) - set(gdf.columns)) == 0, (
91 | "gdf must have all columns of the following list:\n"
92 | f"{MAIN_COLUMNS}\n"
93 | f"Instead, gdf has the following columns:\n"
94 | f"{gdf.columns}"
95 | )
96 | self.gdf = gdf
97 | self.target = target
98 | self.image_size = image_size
99 | self.max_dark_frac = max_dark_frac
100 | self.max_mean_val = max_mean_val
101 | self.transforms = transforms
102 | self.use_local_images = use_local_images
103 | if self.use_local_images:
104 | assert "local_image_path" in self.gdf.columns, (
105 | "If use_local_images is True, gdf must have a "
106 | "local_image_path column"
107 | )
108 |
109 | def __iter__(self):
110 | if torch.utils.data.get_worker_info():
111 | worker_total_num = torch.utils.data.get_worker_info().num_workers
112 | worker_id = torch.utils.data.get_worker_info().id
113 | else:
114 | worker_total_num = 1
115 | worker_id = 0
116 | for idx in range(worker_id, len(self.gdf), worker_total_num):
117 | row = self.gdf.iloc[idx]
118 | if self.use_local_images:
119 | try:
120 | image = np.load(row.local_image_path)
121 | except TypeError as e:
122 | logger.warning(
123 | f"Could not load local image at {row.local_image_path}. "
124 | f"Original error: {e}"
125 | )
126 | continue
127 | else:
128 | image = get_image_from_cog(
129 | cog_url=row.cog_url, geometry=row.geometry, size=self.image_size
130 | )
131 | image = torch.from_numpy(image).float()
132 | if is_image_too_dark(
133 | image, max_dark_frac=self.max_dark_frac
134 | ) or is_image_too_bright(image, max_mean_val=self.max_mean_val):
135 | continue
136 | if self.transforms is not None:
137 | try:
138 | image = self.transforms(image).squeeze(0)
139 | except AssertionError as e:
140 | logger.warning(
141 | f"Could not transform image at {row.local_image_path}. "
142 | f"Original error: {e}"
143 | )
144 | continue
145 |
146 | target = torch.tensor(row[self.target]).float()
147 | metadata = row.drop([self.target, "geometry", "data_set"]).to_dict()
148 | metadata["ts"] = str(metadata["ts"])
149 | yield {
150 | "image": image,
151 | "target": target,
152 | "metadata": metadata,
153 | }
154 |
155 |
156 | class CoalEmissionsDataModule(LightningDataModule):
157 | def __init__(
158 | self,
159 | final_dataset_path: Optional[Union[str, Path]] = None,
160 | image_metadata_path: Optional[Union[str, Path]] = None,
161 | campd_facilities_path: Optional[Union[str, Path]] = None,
162 | campd_emissions_path: Optional[Union[str, Path]] = None,
163 | target: str = EMISSIONS_TARGET,
164 | image_size: int = IMAGE_SIZE_PX,
165 | crop_size: int = CROP_SIZE_PX,
166 | train_val_ratio: float = TRAIN_VAL_RATIO,
167 | test_year: int = TEST_YEAR,
168 | batch_size: int = BATCH_SIZE,
169 | max_dark_frac: float = MAX_DARK_FRAC,
170 | max_mean_val: float = MAX_BRIGHT_MEAN,
171 | max_cloud_cover_prct: int = MAX_CLOUD_COVER_PRCT,
172 | predownload_images: bool = False,
173 | download_missing_images: bool = False,
174 | images_dir: str = "images/",
175 | num_workers: int = 0,
176 | ):
177 | """
178 | Lightning Data Module that gets images of coal power plants,
179 | their emissions and metadata, and splits them into train,
180 | validation and test sets.
181 |
182 | Args:
183 | image_metadata_path (Union[str, Path]):
184 | Path to image metadata data
185 | campd_facilities_path (Union[str, Path]):
186 | Path to CAMPD facilities data
187 | campd_emissions_path (Union[str, Path]):
188 | Path to CAMPD emissions data
189 | target (str):
190 | The target column to predict
191 | image_size (int):
192 | The size of the image in pixels
193 | crop_size (int):
194 | The size of the crop in pixels
195 | train_val_ratio (float):
196 | The ratio of train to validation data
197 | test_year (int):
198 | The year to use for testing
199 | batch_size (int):
200 | The batch size, i.e. the number of samples to load at once
201 | max_dark_frac (float):
202 | The maximum fraction of dark pixels allowed for an image;
203 | if the image has more dark pixels than this, it is skipped
204 | max_mean_val (float):
205 | The maximum mean value allowed for an image; if the image
206 | has a higher mean value than this, it is skipped
207 | max_cloud_cover_prct (int):
208 | The maximum cloud cover percentage allowed for an image;
209 | if the image has more cloud cover than this, it is skipped
210 | predownload_images (bool):
211 | Whether to pre-download images from the cloud or load each
212 | one on the fly
213 | download_missing_images (bool):
214 | Whether to download images that are missing from the
215 | images_dir path
216 | images_dir (str):
217 | The directory to save images to if predownload_images is True
218 | num_workers (int):
219 | The number of workers to use for loading data
220 | """
221 | super().__init__()
222 | self.final_dataset_path = final_dataset_path
223 | self.image_metadata_path = image_metadata_path
224 | self.campd_facilities_path = campd_facilities_path
225 | self.campd_emissions_path = campd_emissions_path
226 | self.target = target
227 | self.image_size = image_size
228 | self.crop_size = crop_size
229 | self.train_val_ratio = train_val_ratio
230 | self.test_year = test_year
231 | self.batch_size = batch_size
232 | self.max_dark_frac = max_dark_frac
233 | self.max_mean_val = max_mean_val
234 | self.max_cloud_cover_prct = max_cloud_cover_prct
235 | self.predownload_images = predownload_images
236 | self.download_missing_images = download_missing_images
237 | self.images_dir = images_dir
238 | self.num_workers = num_workers
239 | self.emissions_quantiles = None
240 |
241 | def setup(self, stage: str):
242 | """
243 | Split the data into train, validation and test sets.
244 |
245 | Args:
246 | stage (str):
247 | The stage of the setup
248 | """
249 | # load the final dataset
250 | if self.final_dataset_path is not None:
251 | self.gdf = load_final_dataset(self.final_dataset_path)
252 | else:
253 | self.gdf = get_final_dataset(
254 | image_metadata_path=self.image_metadata_path,
255 | campd_facilities_path=self.campd_facilities_path,
256 | campd_emissions_path=self.campd_emissions_path,
257 | )
258 | # filter out rows with too much cloud cover
259 | self.gdf = self.gdf[self.gdf.cloud_cover <= self.max_cloud_cover_prct]
260 | if self.predownload_images:
261 | # make sure that images are already downloaded
262 | if "local_image_path" not in self.gdf.columns:
263 | tqdm.pandas(desc="Downloading images")
264 | self.gdf["local_image_path"] = self.gdf.progress_apply(
265 | lambda row: fetch_image_path_from_cog(
266 | cog_url=row.cog_url,
267 | geometry=row.geometry,
268 | size=self.image_size,
269 | images_dir=self.images_dir,
270 | download_missing_images=self.download_missing_images,
271 | ),
272 | axis=1,
273 | )
274 | # skip rows where the image could not be downloaded
275 | self.gdf = self.gdf[~self.gdf.local_image_path.isna()]
276 | else:
277 | # make sure that the image paths are in the right directory
278 | current_image_path = (
279 | self.gdf.local_image_path.str.split("/")
280 | .str[:-1]
281 | .str.join("/")
282 | .iloc[0]
283 | )
284 | if current_image_path != self.images_dir:
285 | self.gdf.local_image_path = self.gdf.local_image_path.str.replace(
286 | current_image_path, self.images_dir
287 | )
288 | # split the data into train, validation and test sets
289 | facility_set_mapper = get_facility_set_mapper(
290 | self.gdf,
291 | train_val_ratio=self.train_val_ratio,
292 | )
293 | self.gdf["data_set"] = self.gdf.apply(
294 | lambda row: split_data_in_sets(
295 | row=row, data_set_mapper=facility_set_mapper, test_year=self.test_year
296 | ),
297 | axis=1,
298 | )
299 | self.pos_weight = self.get_pos_weight(self.gdf)
300 | if stage == "fit":
301 | self.train_dataset = CoalEmissionsDataset(
302 | gdf=self.gdf[self.gdf.data_set == "train"].sample(frac=1),
303 | target=self.target,
304 | image_size=self.image_size,
305 | transforms=get_transform(data_group="train", crop_size=self.crop_size),
306 | use_local_images=self.predownload_images,
307 | max_dark_frac=self.max_dark_frac,
308 | max_mean_val=self.max_mean_val,
309 | )
310 | self.val_dataset = CoalEmissionsDataset(
311 | gdf=self.gdf[self.gdf.data_set == "val"].sample(frac=1),
312 | target=self.target,
313 | image_size=self.image_size,
314 | transforms=get_transform(data_group="val", crop_size=self.crop_size),
315 | use_local_images=self.predownload_images,
316 | max_dark_frac=self.max_dark_frac,
317 | max_mean_val=self.max_mean_val,
318 | )
319 | elif stage == "test":
320 | self.test_dataset = CoalEmissionsDataset(
321 | gdf=self.gdf[self.gdf.data_set == "test"].sample(frac=1),
322 | target=self.target,
323 | image_size=self.image_size,
324 | transforms=get_transform(data_group="test", crop_size=self.crop_size),
325 | use_local_images=self.predownload_images,
326 | max_dark_frac=self.max_dark_frac,
327 | max_mean_val=self.max_mean_val,
328 | )
329 |
330 | def get_dataloader(self, data_group: str):
331 | # reshuffle the dataset
332 | getattr(self, f"{data_group}_dataset").gdf = getattr(
333 | self, f"{data_group}_dataset"
334 | ).gdf.sample(frac=1)
335 | # reset the dataloader
336 | return DataLoader(
337 | getattr(self, f"{data_group}_dataset"),
338 | batch_size=self.batch_size,
339 | num_workers=self.num_workers,
340 | pin_memory=True if torch.cuda.is_available() else False,
341 | )
342 |
343 | def train_dataloader(self):
344 | return self.get_dataloader("train")
345 |
346 | def val_dataloader(self):
347 | return self.get_dataloader("val")
348 |
349 | def test_dataloader(self):
350 | return self.get_dataloader("test")
351 |
352 | def get_pos_weight(self, gdf: Optional[gpd.GeoDataFrame] = None) -> float:
353 | """
354 | Get the positive weight for the dataset, based on class imbalance.
355 |
356 | Args:
357 | gdf (Optional[gpd.GeoDataFrame]):
358 | The dataset to use for calculating the positive weight.
359 | If None, the dataset used for training will be used.
360 |
361 | Returns:
362 | float:
363 | The positive weight
364 | """
365 | if gdf is None:
366 | gdf = self.gdf
367 | num_positives = gdf[self.target].sum()
368 | num_negatives = len(gdf) - num_positives
369 | return num_negatives / num_positives
370 |
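# Usage sketch (editor's addition): "final_dataset.csv" is a hypothetical
# export of get_final_dataset; the batch shape assumes the default
# transforms from transforms.py.
#
# >>> dm = CoalEmissionsDataModule(final_dataset_path="final_dataset.csv", batch_size=8)
# >>> dm.setup(stage="fit")
# >>> batch = next(iter(dm.train_dataloader()))
# >>> batch["image"].shape  # (8, 3, CROP_SIZE_PX, CROP_SIZE_PX)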
--------------------------------------------------------------------------------
/src/coal_emissions_monitoring/ml_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Tuple
2 | import geopandas as gpd
3 | import numpy as np
4 | import pandas as pd
5 | import torch
6 |
7 | from coal_emissions_monitoring.data_cleaning import load_clean_campd_facilities_gdf
8 | from coal_emissions_monitoring.constants import TEST_YEAR, TRAIN_VAL_RATIO
9 |
10 |
11 | def get_facility_set_mapper(
12 | gdf: gpd.GeoDataFrame, train_val_ratio: float = TRAIN_VAL_RATIO
13 | ) -> Dict[int, str]:
14 | """
15 | Get a mapper from facility ID to a data set ("train" or "val").
16 |
17 | Args:
18 | gdf (gpd.GeoDataFrame):
19 | The gdf containing the facility IDs
20 | train_val_ratio (float):
21 | The ratio of training to validation data
22 |
23 | Returns:
24 | Dict[int, str]:
25 | A mapper from facility ID to a data set ("train" or "val")
26 | """
27 | assigned_facilities = set()
28 | for facility_id, facility_gdf in gdf.groupby("facility_id"):
29 | if facility_id in assigned_facilities:
30 | continue
31 | # assign a data set to the facility
32 | data_set = np.random.choice(
33 | ["train", "val"], p=[train_val_ratio, 1 - train_val_ratio]
34 | )
35 | gdf.loc[gdf.facility_id == facility_id, "data_set"] = data_set
36 | assigned_facilities.add(facility_id)
37 | # apply the same data set to intersecting facilities
38 | other_facilities_gdf = gdf.loc[
39 | gdf.facility_id != facility_id, ["facility_id", "geometry"]
40 | ]
41 | other_facilities_gdf.rename(
42 | columns={"facility_id": "intersecting_facility_id"}, inplace=True
43 | )
44 | intersecting_facilities_gdf = gpd.sjoin(
45 | facility_gdf,
46 | other_facilities_gdf,
47 | how="inner",
48 | predicate="intersects",
49 | )
50 | if intersecting_facilities_gdf.empty:
51 | continue
52 | else:
53 | for intersecting_facility_id in intersecting_facilities_gdf[
54 | "intersecting_facility_id"
55 | ].unique():
56 | gdf.loc[
57 | gdf.facility_id == intersecting_facility_id,
58 | "data_set",
59 | ] = data_set
60 | assigned_facilities.add(intersecting_facility_id)
61 | # create a mapper from facility ID to a data set ("train" or "val")
62 | return gdf.groupby("facility_id").data_set.first().to_dict()
63 |
64 |
65 | def split_data_in_sets(
66 | row: pd.Series, data_set_mapper: Dict[int, str], test_year: int = TEST_YEAR
67 | ) -> str:
68 | """
69 | Split the data in sets. This function is meant to be used with pandas.DataFrame.apply.
70 |
71 | Args:
72 | row (pd.Series):
73 | A row of the DataFrame, as passed by DataFrame.apply(axis=1)
74 | data_set_mapper (Dict[int, str]):
75 | A mapper from facility ID to a data set ("train" or "val")
76 | test_year (int):
77 | The year to use for testing
78 |
79 | Returns:
80 | str:
81 | The data set
82 | """
83 | if row.ts.year == test_year:
84 | data_set = "test"
85 | else:
86 | data_set = data_set_mapper[row.facility_id]
87 | return data_set
88 |
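# Worked example (editor's addition), with made-up values: a 2023 timestamp
# lands in the test set regardless of the facility's train/val assignment.
#
# >>> row = pd.Series({"facility_id": 3, "ts": pd.Timestamp("2023-06-01")})
# >>> split_data_in_sets(row, data_set_mapper={3: "train"}, test_year=2023)
# 'test'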
89 |
90 | def emissions_to_category(
91 | emissions: float, quantiles: Dict[float, float], rescale: bool = False
92 | ) -> int:
93 | """
94 | Convert emissions to a category based on quantiles. The quantiles are
95 | calculated from the training data. Here's how the categories are defined:
96 | - 0: no emissions
97 | - 1: low emissions
98 | - 2: medium emissions
99 | - 3: high emissions
100 | - 4: very high emissions
101 |
102 | Args:
103 | emissions (float): emissions value
104 | quantiles (Dict[float, float]): quantiles to use for categorization
105 | rescale (bool): whether to rescale emissions to the original range,
106 | using the 99th quantile as the maximum value
107 |
108 | Returns:
109 | int: category
110 | """
111 | if rescale:
112 | emissions = emissions * quantiles[0.99]
113 | if emissions <= 0:
114 | return 0
115 | elif emissions <= quantiles[0.3]:
116 | return 1
117 | elif emissions <= quantiles[0.6]:
118 | return 2
119 | elif emissions <= quantiles[0.99]:
120 | return 3
121 | else:
122 | return 4
123 |
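# Worked example (editor's addition) with made-up quantiles:
#
# >>> q = {0.3: 10.0, 0.6: 50.0, 0.99: 200.0}
# >>> [emissions_to_category(e, q) for e in (0.0, 5.0, 30.0, 100.0, 500.0)]
# [0, 1, 2, 3, 4]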
124 |
125 | def preds_n_targets_to_categories(
126 | preds: torch.Tensor,
127 | targets: torch.Tensor,
128 | quantiles: Dict[float, float],
129 | rescale: bool = False,
130 | ) -> Tuple[torch.Tensor, torch.Tensor]:
131 | """
132 | Convert emissions to a category based on quantiles. The quantiles are
133 | calculated from the training data. Here's how the categories are defined:
134 | - 0: no emissions
135 | - 1: low emissions
136 | - 2: medium emissions
137 | - 3: high emissions
138 | - 4: very high emissions
139 |
140 | Args:
141 | preds (torch.Tensor): emissions predictions
142 | targets (torch.Tensor): emissions targets
143 | quantiles (Dict[float, float]): quantiles to use for categorization
144 | rescale (bool): whether to rescale emissions to the original range,
145 | using the 99th quantile as the maximum value
146 |
147 | Returns:
148 | Tuple[torch.Tensor, torch.Tensor]: tuple of predictions and targets
149 | """
150 | preds_cat = torch.tensor(
151 | [
152 | emissions_to_category(y_pred_i, quantiles, rescale=rescale)
153 | for y_pred_i in preds
154 | ]
155 | ).to(preds.device)
156 | targets_cat = torch.tensor(
157 | [emissions_to_category(y_i, quantiles, rescale=rescale) for y_i in targets]
158 | ).to(targets.device)
159 | return preds_cat, targets_cat
160 |
--------------------------------------------------------------------------------
/src/coal_emissions_monitoring/model.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict
2 | from lightning import LightningModule
3 | import torch
4 | import torchmetrics
5 | from sklearn.metrics import balanced_accuracy_score
6 | import warnings
7 |
8 | from coal_emissions_monitoring.constants import POSITIVE_THRESHOLD
9 |
10 | # suppress balanced accuracy warning
11 | warnings.filterwarnings("ignore", message="y_pred contains classes not in y_true*")
12 |
13 |
14 | class SmallCNN(torch.nn.Module):
15 | def __init__(self, num_input_channels: int = 3, num_classes: int = 1):
16 | super().__init__()
17 | self.num_input_channels = num_input_channels
18 | self.num_classes = num_classes
19 | # build a simple model of stacked convolutional blocks, with global pooling
20 | # and a final linear layer, compatible with images of size 32x32
21 | self.model = torch.nn.Sequential(
22 | torch.nn.Conv2d(
23 | in_channels=self.num_input_channels,
24 | out_channels=16,
25 | kernel_size=3,
26 | padding=1,
27 | ),
28 | torch.nn.ReLU(),
29 | torch.nn.Conv2d(
30 | in_channels=16,
31 | out_channels=32,
32 | kernel_size=3,
33 | padding=1,
34 | ),
35 | torch.nn.ReLU(),
36 | torch.nn.MaxPool2d(kernel_size=2),
37 | torch.nn.Conv2d(
38 | in_channels=32,
39 | out_channels=64,
40 | kernel_size=3,
41 | padding=1,
42 | ),
43 | torch.nn.ReLU(),
44 | torch.nn.Conv2d(
45 | in_channels=64,
46 | out_channels=64,
47 | kernel_size=3,
48 | padding=1,
49 | ),
50 | torch.nn.ReLU(),
51 | torch.nn.MaxPool2d(kernel_size=2),
52 | torch.nn.Conv2d(
53 | in_channels=64,
54 | out_channels=128,
55 | kernel_size=3,
56 | padding=1,
57 | ),
58 | torch.nn.ReLU(),
59 | torch.nn.Conv2d(
60 | in_channels=128,
61 | out_channels=128,
62 | kernel_size=3,
63 | padding=1,
64 | ),
65 | torch.nn.ReLU(),
66 | torch.nn.AdaptiveAvgPool2d(output_size=1),
67 | torch.nn.Flatten(),
68 | torch.nn.Linear(128, self.num_classes),
69 | )
70 |
71 | def forward(self, x):
72 | return self.model(x)
73 |
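# Shape check (editor's addition): the two max-pools take a 32x32 input down
# to 8x8 before global average pooling, and the final linear layer emits one
# logit per image.
#
# >>> net = SmallCNN()
# >>> net(torch.zeros(4, 3, 32, 32)).shape
# torch.Size([4, 1])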
74 |
75 | class CoalEmissionsModel(LightningModule):
76 | def __init__(
77 | self,
78 | model: torch.nn.Module,
79 | learning_rate: float = 1e-3,
80 | pos_weight: float = 1.0,
81 | ):
82 | super().__init__()
83 | self.model = model
84 | self.learning_rate = learning_rate
85 | self.pos_weight = pos_weight
86 | self.loss = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor(self.pos_weight))
87 |
88 | def forward(self, x):
89 | preds = self.model(x).squeeze(-1)
90 | return preds
91 |
92 | def calculate_all_metrics(
93 | self, preds: torch.Tensor, targets: torch.Tensor
94 | ) -> Dict[str, float]:
95 | """
96 | Calculate metrics for a batch of predictions and targets.
97 |
98 | Args:
99 | preds (torch.Tensor): predictions
100 | targets (torch.Tensor): targets
101 |
102 | Returns:
103 | Dict[str, float]: metrics
104 | """
105 | metrics = dict()
106 | # calculate the binary cross-entropy loss (on raw logits)
107 | metrics["loss"] = self.loss(preds, targets)
108 | # apply sigmoid to the predictions to get a value between 0 and 1
109 | preds = torch.sigmoid(preds)
110 | # calculate emissions vs no-emissions accuracy
111 | metrics["accuracy"] = (
112 | ((preds > POSITIVE_THRESHOLD) == (targets > 0)).float().mean()
113 | )
114 | # calculate balanced accuracy, which accounts for class imbalance
115 | metrics["balanced_accuracy"] = balanced_accuracy_score(
116 | y_pred=(preds.cpu() > POSITIVE_THRESHOLD).int(),
117 | y_true=targets.cpu().int(),
118 | )
119 | # calculate recall and precision
120 | metrics["recall"] = torchmetrics.functional.recall(
121 | preds=preds,
122 | target=targets,
123 | average="macro",
124 | task="binary",
125 | )
126 | metrics["precision"] = torchmetrics.functional.precision(
127 | preds=preds,
128 | target=targets,
129 | average="macro",
130 | task="binary",
131 | )
132 | return metrics
133 |
134 | def shared_step(
135 | self,
136 | batch: Dict[str, Any],
137 | batch_idx: int,
138 | stage: str,
139 | ):
140 | if len(batch["image"].shape) == 0:
141 | # avoid iteration over a 0-d array error
142 | return dict()
143 | metrics = dict()
144 | x, y = batch["image"], batch["target"]
145 | x, y = x.float().to(self.device), y.float().to(self.device)
146 | # forward pass (calculate predictions)
147 | y_pred = self(x)
148 | # calculate metrics for the current batch
149 | metrics = self.calculate_all_metrics(preds=y_pred, targets=y)
150 | metrics = {
151 | (f"{stage}_{k}" if k != "loss" or stage != "train" else k): v
152 | for k, v in metrics.items()
153 | }
154 | # log metrics
155 | for k, v in metrics.items():
156 | if k == "loss":
157 | self.log(k, v, on_step=True, prog_bar=True)
158 | else:
159 | self.log(k, v, on_step=False, on_epoch=True, prog_bar=True)
160 | return metrics
161 |
162 | def training_step(self, batch: Dict[str, Any], batch_idx: int):
163 | return self.shared_step(batch, batch_idx, stage="train")
164 |
165 | def validation_step(self, batch: Dict[str, Any], batch_idx: int):
166 | return self.shared_step(batch, batch_idx, stage="val")
167 |
168 | def test_step(self, batch: Dict[str, Any], batch_idx: int):
169 | return self.shared_step(batch, batch_idx, stage="test")
170 |
171 | def configure_optimizers(self):
172 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
173 | return {
174 | "optimizer": optimizer,
175 | "lr_scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(
176 | optimizer, mode="min", factor=0.1, patience=3
177 | ),
178 | "monitor": "val_loss",
179 | }
180 |
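# Training sketch (editor's addition): a minimal, hedged example of wiring
# the model to the data module from dataset.py. "dm" and the trainer flags
# are assumptions, not the project's actual training configuration.
#
# >>> from lightning import Trainer
# >>> dm.setup(stage="fit")  # computes dm.pos_weight from class imbalance
# >>> lit_model = CoalEmissionsModel(model=SmallCNN(), pos_weight=dm.pos_weight)
# >>> Trainer(max_epochs=10).fit(lit_model, datamodule=dm)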
--------------------------------------------------------------------------------
/src/coal_emissions_monitoring/satellite_imagery.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import os
3 | from typing import List, Optional, Union
4 | import backoff
5 |
6 | import geopandas as gpd
7 | import numpy as np
8 | import rasterio as rio
9 | from rasterio.errors import RasterioIOError
10 | import pandas as pd
11 | import pystac_client
12 | from loguru import logger
13 | from pyproj.aoi import AreaOfInterest
14 | from pyproj.database import query_utm_crs_info
15 | from shapely.geometry.base import BaseGeometry
16 | import torch
17 | from tqdm.auto import tqdm
18 |
19 | from coal_emissions_monitoring.constants import (
20 | ALL_BANDS,
21 | AOI_SIZE_METERS,
22 | API_URL,
23 | COLLECTION,
24 | END_DATE,
25 | GLOBAL_EPSG,
26 | IMAGE_SIZE_PX,
27 | MAX_BRIGHT_MEAN,
28 | MAX_CLOUD_COVER_PRCT,
29 | MAX_DARK_FRAC,
30 | START_DATE,
31 | )
32 |
33 | STAC_CLIENT = pystac_client.Client.open(API_URL)
34 |
35 |
36 | def get_epsg_from_coords(latitude: float, longitude: float) -> int:
37 | """
38 | Get the EPSG code for a specific coordinate
39 |
40 | Args:
41 | latitude (float):
42 | The latitude of the coordinate
43 | longitude (float):
44 | The longitude of the coordinate
45 |
46 | Returns:
47 | int:
48 | The EPSG code for the coordinate
49 | """
50 | crs_info = query_utm_crs_info(
51 | datum_name="WGS 84",
52 | area_of_interest=AreaOfInterest(
53 | west_lon_degree=longitude,
54 | south_lat_degree=latitude,
55 | east_lon_degree=longitude,
56 | north_lat_degree=latitude,
57 | ),
58 | )
59 | return int(crs_info[0].code)
60 |
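# Example (editor's addition): a point near Boulder, Colorado (~40N, -105E)
# falls in UTM zone 13 north, whose WGS 84 EPSG code should be 32613.
#
# >>> get_epsg_from_coords(latitude=40.0, longitude=-105.0)
# 32613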
61 |
62 | def create_aoi_for_plants(campd_facilities_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
63 | """
64 | Create a square area of interest (AOI) for each plant in the CAMPD facilities data.
65 | This will later be used to query for satellite imagery.
66 |
67 | Args:
68 | campd_facilities_gdf (gpd.GeoDataFrame):
69 | The CAMPD facilities data frame
70 |
71 | Returns:
72 | gpd.GeoDataFrame:
73 | A data frame containing the AOIs for each plant
74 | """
75 | facility_dfs = list()
76 | for _, facility_df in tqdm(
77 | campd_facilities_gdf.groupby("facility_id"),
78 | total=campd_facilities_gdf.facility_id.nunique(),
79 | desc="Creating AOIs for plants",
80 | ):
81 | # identify what is the local CRS for the current facility,
82 | # based on its latitude and longitude
83 | epsg = get_epsg_from_coords(
84 | facility_df.latitude.mean(), facility_df.longitude.mean()
85 | )
86 | # convert to the local CRS, based on the coordinates
87 | facility_df = facility_df.to_crs(epsg=epsg)
88 | # buffer the point geometry into a square of side AOI_SIZE_METERS (~3.2km)
89 | facility_df.geometry = facility_df.geometry.buffer(
90 | AOI_SIZE_METERS / 2, cap_style=3
91 | )
92 | # convert back to the global CRS
93 | facility_df = facility_df.to_crs(epsg=GLOBAL_EPSG)
94 | facility_dfs.append(facility_df)
95 | return gpd.GeoDataFrame(pd.concat(facility_dfs, ignore_index=True))
96 |
97 |
98 | def get_aws_cog_links_from_geom(
99 | geometry: BaseGeometry,
100 | collection: str = COLLECTION,
101 | start_date: Optional[datetime] = START_DATE,
102 | end_date: Optional[datetime] = END_DATE,
103 | max_cloud_cover_prct: Optional[int] = MAX_CLOUD_COVER_PRCT,
104 | sort_by: str = "updated",
105 | max_items: Optional[int] = None,
106 | verbose: bool = True,
107 | ) -> pd.DataFrame:
108 | """
109 | Retrieve links from AWS' Sentinel 2 L2A STAC
110 |
111 | Args:
112 | geometry (BaseGeometry):
113 | The geometry to query for images that
114 | contain it in STAC
115 | collection (str):
116 | The STAC collection to query
117 | start_date (Optional[datetime]):
118 | Optional start date to filter images on
119 | end_date (Optional[datetime]):
120 | Optional end date to filter images on
121 | max_cloud_cover_prct (Optional[int]):
122 | Optional maximum cloud cover to filter
123 | images that are too cloudy. Expressed
124 | as a percentage, e.g. 1 = 1%
125 | sort_by (str):
126 | Which property to sort the results by,
127 | in descending order; needs to be a valid
128 | property in the STAC collection
129 | max_items (Optional[int]):
130 | Optional maximum number of items to
131 | return
132 | verbose (bool):
133 | Whether to print the progress of the
134 | query
135 |
136 | Returns:
137 | pd.DataFrame:
138 | A dataframe containing the ID of the tile and
139 | the links to its COGs and metadata
140 | """
141 | # get the bounding box from the geometry
142 | bbox = geometry.bounds
143 | # specify the cloud filter (left as None when no cloud limit is given)
144 | cloud_filter = None
145 | if max_cloud_cover_prct == 0:
146 | cloud_filter = "eo:cloud_cover=0"
147 | elif max_cloud_cover_prct is not None:
148 | cloud_filter = f"eo:cloud_cover<={max_cloud_cover_prct}"
148 | # query the STAC collection(s) in a specific bounding box and search criteria
149 | search = STAC_CLIENT.search(
150 | collections=[collection],
151 | bbox=bbox,
152 | datetime=f"{start_date.strftime('%Y-%m-%d')}/{end_date.strftime('%Y-%m-%d')}",
153 | query=[cloud_filter] if max_cloud_cover_prct is not None else None,
154 | )
155 | if verbose:
156 | logger.info(f"Found {search.matched()} items matching the search criteria")
157 | items = search.get_all_items()
158 | if max_cloud_cover_prct is not None and collection == "sentinel-s2-l2a-cogs":
159 | # some items had invalid cloud cover data and turned out very cloudy; only works for L2A
160 | items_valid_cloud_filter = [
161 | x for x in items if x.properties["sentinel:valid_cloud_cover"]
162 | ]
163 | if verbose:
164 | logger.info(
165 | f"Removed {len(items) - len(items_valid_cloud_filter)} items for invalid cloud filters"
166 | )
167 | items = items_valid_cloud_filter
168 | items = sorted(items, key=lambda x: x.properties[sort_by], reverse=True)
169 | if max_items is not None:
170 | items = items[:max_items]
171 | # create a dictionary that contains the tile ID and the links to the COGs and metadata
172 | output = dict(tile_id=[item.id for item in items])
173 | if len(items) == 0:
174 | return None
175 | asset_keys = items[0].assets.keys()
176 | for key in asset_keys:
177 | output[key] = [item.assets[key].href for item in items]
178 | output["cloud_cover"] = [item.properties["eo:cloud_cover"] for item in items]
179 | output[sort_by] = [item.properties[sort_by] for item in items]
180 | output["ts"] = [item.properties["datetime"] for item in items]
181 | output = pd.DataFrame(output)
182 | output["ts"] = pd.to_datetime(output["ts"])
183 | output.drop_duplicates(subset="ts", keep="first", inplace=True)
184 | output.sort_values("ts", inplace=True)
185 | return output
186 |
187 |
188 | def get_image_metadata_for_plants(
189 | plant_aoi_gdf: gpd.GeoDataFrame,
190 | collection: str = COLLECTION,
191 | start_date: datetime = START_DATE,
192 | end_date: datetime = END_DATE,
193 | max_cloud_cover_prct: int = MAX_CLOUD_COVER_PRCT,
194 | sort_by: str = "updated",
195 | ) -> pd.DataFrame:
196 | """
197 | Get the metadata for the satellite images for each plant,
198 | based on the AOI defined for each plant (see create_aoi_for_plants)
199 |
200 | Args:
201 | plant_aoi_gdf (gpd.GeoDataFrame):
202 | The data frame containing the AOIs for each plant
203 | collection (str):
204 | The STAC collection to query
205 | start_date (Optional[datetime]):
206 | Start date to filter images on
207 | end_date (Optional[datetime]):
208 | End date to filter images on
209 | max_cloud_cover_prct (Optional[int]):
210 | Maximum cloud cover to filter
211 | images that are too cloudy. Expressed
212 | as a percentage, e.g. 1 = 1%
213 | sort_by (str):
214 | Which property to sort the results by,
215 | in descending order; needs to be a valid
216 | property in the STAC collection
217 |
218 | Returns:
219 | pd.DataFrame:
220 | A dataframe containing the ID of the tile and
221 | the links to its COGs and metadata
222 | """
223 | image_metadata_dfs = list()
224 | for facility_id, geometry in tqdm(
225 | plant_aoi_gdf.groupby("facility_id").geometry.first().items(),
226 | total=plant_aoi_gdf.facility_id.nunique(),
227 | desc="Querying STAC API",
228 | ):
229 | stac_results_df = get_aws_cog_links_from_geom(
230 | geometry=geometry,
231 | collection=collection,
232 | start_date=start_date,
233 | end_date=end_date,
234 | max_cloud_cover_prct=max_cloud_cover_prct,
235 | sort_by=sort_by,
236 | verbose=False,
237 | )
238 | stac_results_df["facility_id"] = facility_id
239 | image_metadata_dfs.append(stac_results_df)
240 | return pd.concat(image_metadata_dfs, ignore_index=True)
241 |
242 |
243 | def pad_or_crop_to_size(image: np.ndarray, size: int = IMAGE_SIZE_PX) -> np.ndarray:
244 | """
245 | Pad or crop an image to a specific size
246 |
247 | Args:
248 | image (np.ndarray):
249 | The image to pad or crop, with dimensions (C, H, W),
250 | where C is the number of channels, H is the height and
251 | W is the width
252 | size (int):
253 | The size to pad or crop to
254 |
255 | Returns:
256 | np.ndarray:
257 | The padded or cropped image
258 | """
259 | if image.shape[1] > size:
260 | # crop the image
261 | image = image[:, :size, :size]
262 | elif image.shape[1] < size:
263 | # pad the image
264 | image = np.pad(
265 | image,
266 | ((0, 0), (0, size - image.shape[1]), (0, size - image.shape[2])),
267 | )
268 | return image
269 |
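# Shape checks (editor's addition): oversized inputs are cropped from the
# top-left corner, and undersized inputs are zero-padded on the bottom/right.
#
# >>> pad_or_crop_to_size(np.zeros((3, 100, 100)), size=64).shape
# (3, 64, 64)
# >>> pad_or_crop_to_size(np.zeros((3, 50, 50)), size=64).shape
# (3, 64, 64)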
270 |
271 | @backoff.on_exception(backoff.expo, RasterioIOError, max_tries=3)
272 | def get_image_from_cog(
273 | cog_url: str, geometry: BaseGeometry, size: int = IMAGE_SIZE_PX
274 | ) -> np.ndarray:
275 | """
276 | Get the image from a COG, clipped to the geometry
277 |
278 | Args:
279 | cog_url (str):
280 | The URL to the COG
281 | geometry (BaseGeometry):
282 | The geometry to clip the image to
283 | size (int):
284 | The size to pad or crop to
285 |
286 | Returns:
287 | np.ndarray:
288 | The clipped image
289 | """
290 | # load only the bbox of the image
291 | with rio.open(cog_url) as src:
292 | # get the bbox converted to the right coordinate reference system (crs);
293 | # doing all of this because geopandas has the convenient to_crs function
294 | crs_bbox = (
295 | gpd.GeoDataFrame(geometry=[geometry], crs=GLOBAL_EPSG)
296 | .to_crs(src.crs)
297 | .total_bounds
298 | )
299 | # define window in RasterIO
300 | window = rio.windows.from_bounds(*crs_bbox, transform=src.transform)
301 | # actual HTTP range request
302 | image = src.read(window=window)
303 | # make sure that the image has the shape that we want
304 | image = pad_or_crop_to_size(image, size=size)
305 | return image
306 |
307 |
308 | def get_all_bands_image(
309 | cog_urls: List[str],
310 | geometry: BaseGeometry,
311 | size: int = IMAGE_SIZE_PX,
312 | ) -> np.ndarray:
313 | """
314 | Get an image that stacks all bands for a given row,
315 | clipped to the geometry.
316 |
317 | Args:
318 | cog_urls (List[str]):
319 | The URLs to the COGs
320 | geometry (BaseGeometry):
321 | The geometry to clip the image to
322 | size (int):
323 | The size to pad or crop to
324 |
325 | Returns:
326 | np.ndarray:
327 | The stacked image
328 | """
329 | bands = [
330 | get_image_from_cog(cog_url=url, geometry=geometry, size=size).squeeze()
331 | for url in cog_urls
332 | ]
333 | return np.stack(bands, axis=0)
334 |
335 |
336 | def fetch_image_path_from_cog(
337 | cog_url: Union[str, List[str]],
338 | geometry: BaseGeometry,
339 | size: int = IMAGE_SIZE_PX,
340 | cog_type: str = "visual",
341 | images_dir: str = "images/",
342 | download_missing_images: bool = False,
343 | ) -> Union[str, None]:
344 | """
345 | Fetch the image path from a COG; if download_missing_images is True,
346 | the image will be downloaded if it does not exist.
347 |
348 | Args:
349 | cog_url (Union[str, List[str]]):
350 | The URL to the COG
351 | geometry (BaseGeometry):
352 | The geometry to clip the image to
353 | size (int):
354 | The size to pad or crop to
355 | cog_type (str):
356 | The type of COG to download. Can be either "visual" or "all".
357 | images_dir (str):
358 | The directory to save the image to
359 | download_missing_images (bool):
360 | Whether to download the image if it does not exist
361 |
362 | Returns:
363 | Union[str, None]:
364 | The path to the downloaded image. If the image
365 | doesn't exist or could not be downloaded, None is returned.
366 | """
367 | if cog_type == "all":
368 | assert isinstance(cog_url, list) and len(cog_url) == len(ALL_BANDS), (
369 | "If cog_type is 'all', cog_url must be a list "
370 | f"of length {len(ALL_BANDS)}"
371 | )
372 | image_name = "_".join(cog_url[0].split("/")[-2:]).replace(".tif", "")
373 | else:
374 | image_name = "_".join(cog_url.split("/")[-2:]).replace(".tif", "")
375 | # shapely coordinates are (x, y), i.e. (longitude, latitude)
376 | lon, lat = geometry.centroid.coords[0]
376 | patch_name = f"{image_name}_{lat}_{lon}_{size}"
377 | image_path = os.path.join(images_dir, f"{patch_name}.npy")
378 | if os.path.exists(image_path):
379 | # image already exists in the expected location
380 | return str(image_path)
381 | else:
382 | if not download_missing_images:
383 | # image does not exist and we don't want to download it
384 | return None
385 | else:
386 | # download and save the image
387 | os.makedirs(images_dir, exist_ok=True)
388 | try:
389 | if cog_type == "visual":
390 | image = get_image_from_cog(
391 | cog_url=cog_url, geometry=geometry, size=size
392 | )
393 | elif cog_type == "all":
394 | try:
395 | image = get_all_bands_image(
396 | cog_urls=cog_url, geometry=geometry, size=size
397 | )
398 | except ValueError as e:
399 | logger.warning(
400 | f"Failed to download image {cog_url}. Original error:\n{e}"
401 | )
402 | return None
403 | except RasterioIOError as e:
404 | logger.warning(
405 | f"Failed to download image {cog_url}. Original error:\n{e}"
406 | )
407 | return None
408 | np.save(image_path, image)
409 | return str(image_path)
410 |
411 |
412 | def is_image_too_dark(
413 | image: torch.Tensor, max_dark_frac: float = MAX_DARK_FRAC
414 | ) -> bool:
415 | """
416 | Check if an image is too dark, based on the fraction of pixels that are
417 | black or NaN
418 |
419 | Args:
420 | image (torch.Tensor):
421 | The image to check, with dimensions (C, H, W),
422 | where C is the number of channels, H is the height and
423 | W is the width
424 | max_dark_frac (float):
425 | The maximum fraction of pixels that can be black or NaN
426 |
427 | Returns:
428 | bool:
429 | Whether the image is too dark
430 | """
431 | dark_frac = ((image <= 1) | (image.isnan())).sum() / image.numel()
432 | return dark_frac > max_dark_frac
433 |
434 |
435 | def is_image_too_bright(image: torch.Tensor, max_mean_val: float = MAX_BRIGHT_MEAN) -> bool:
436 | """
437 | Check if the image is too bright, such as because of clouds or snow, based
438 | on the mean value of the image
439 |
440 | Args:
441 | image (torch.Tensor):
442 | The image to check, with dimensions (C, H, W),
443 | where C is the number of channels, H is the height and
444 | W is the width
445 | max_mean_val (float):
446 | The maximum mean value of the image
447 |
448 | Returns:
449 | bool:
450 | Whether the image is too bright
451 | """
452 | return image.mean() > max_mean_val
453 |
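# Quick checks (editor's addition): an all-black image trips the darkness
# filter and a saturated image trips the brightness filter; both comparisons
# return 0-d tensors that evaluate as booleans.
#
# >>> is_image_too_dark(torch.zeros(3, 32, 32), max_dark_frac=0.5)
# tensor(True)
# >>> is_image_too_bright(torch.full((3, 32, 32), 255.0), max_mean_val=250)
# tensor(True)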
--------------------------------------------------------------------------------
/src/coal_emissions_monitoring/transforms.py:
--------------------------------------------------------------------------------
1 | import kornia.augmentation as K
2 | import torch
3 |
4 | from coal_emissions_monitoring.constants import CROP_SIZE_PX, RANDOM_TRANSFORM_PROB
5 |
6 |
7 | def get_transform(
8 | data_group: str, crop_size: int = CROP_SIZE_PX
9 | ) -> K.AugmentationSequential:
10 | """
11 | Get the transform for the given data group, i.e. train, val, or test.
12 |
13 | Args:
14 | data_group (str): data group
15 | crop_size (int): crop size
16 |
17 | Returns:
18 | K.AugmentationSequential: transforms
19 | """
20 | if data_group == "train":
21 | return K.AugmentationSequential(
22 | K.RandomCrop(size=(crop_size, crop_size)),
23 | K.RandomHorizontalFlip(p=RANDOM_TRANSFORM_PROB),
24 | K.RandomRotation(p=RANDOM_TRANSFORM_PROB, degrees=90),
25 | # TODO this contrast transform is sometimes making the image too dark
26 | # consider fixing it if needing more regularization
27 | # K.RandomContrast(p=RANDOM_TRANSFORM_PROB, contrast=(0.9, 1.1)),
28 | data_keys=["image"],
29 | same_on_batch=False,
30 | keepdim=True,
31 | )
32 | elif data_group == "val":
33 | return K.AugmentationSequential(
34 | K.CenterCrop(size=(crop_size, crop_size)),
35 | data_keys=["image"],
36 | same_on_batch=False,
37 | keepdim=True,
38 | )
39 | elif data_group == "test":
40 | return K.AugmentationSequential(
41 | K.CenterCrop(size=(crop_size, crop_size)),
42 | data_keys=["image"],
43 | same_on_batch=False,
44 | keepdim=True,
45 | )
46 | else:
47 | raise ValueError(
48 | f"Invalid data group: {data_group}." "Expected one of: train, val, test."
49 | )
50 |
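# Usage sketch (editor's addition): with keepdim=True the sequential keeps
# the unbatched (C, H, W) layout, so a 64x64 image should come back cropped
# to crop_size (assuming kornia's AugmentationSequential behaves as
# documented).
#
# >>> tfm = get_transform("val", crop_size=32)
# >>> tfm(torch.zeros(3, 64, 64)).shape
# torch.Size([3, 32, 32])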
51 |
52 | efficientnet_transform = K.AugmentationSequential(
53 | K.Resize(size=(256, 256)),
54 | K.CenterCrop(size=(224, 224)),
55 | K.Normalize(
56 | mean=torch.tensor([0.485, 0.456, 0.406]),
57 | std=torch.tensor([0.229, 0.224, 0.225]),
58 | ),
59 | data_keys=["image"],
60 | same_on_batch=False,
61 | keepdim=True,
62 | )
63 |
--------------------------------------------------------------------------------