├── .gitignore ├── README.md ├── data └── graphs │ ├── cloud_cover_hist.html │ ├── facilities_map.html │ ├── operation_status_hist.html │ └── operation_status_per_power_plant_ts.html ├── notebooks ├── 01_data_matching.ipynb ├── 02_satellite_imagery.ipynb ├── 03_split_data_for_ml.ipynb ├── 04_dataset_loading.ipynb ├── 05_images_download.ipynb ├── 06_model_training.ipynb └── google_cooling_tower_on_off_data.ipynb ├── requirements.txt ├── scripts └── download_images.py ├── setup.cfg ├── setup.py └── src └── coal_emissions_monitoring ├── __init__.py ├── constants.py ├── data_cleaning.py ├── data_viz.py ├── dataset.py ├── ml_utils.py ├── model.py ├── satellite_imagery.py └── transforms.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .vscode/ 131 | notebooks/lightning_logs/* 132 | data/campd/ 133 | data/google/ 134 | data/models/ 135 | .DS_Store 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ccai-ss23-ai-monitoring-tutorial 2 | Experiments for the Climate Change AI summer school 2023 tutorial on "AI for Monitoring, Reporting, and Verification" 3 | -------------------------------------------------------------------------------- /data/graphs/facilities_map.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 27 | 37 | 38 | 39 | 53 | 54 | 55 | 56 | 68 | 115 | 116 | 117 | 118 | 119 | 120 |
121 |
data_set
122 |
123 | 127 |
128 |
129 | 130 | 131 |
132 | 133 | 134 | 243 | -------------------------------------------------------------------------------- /notebooks/01_data_matching.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Data matching\n", 9 | "---\n", 10 | "\n", 11 | "Experimenting with matching data from:\n", 12 | "- Global Energy Monitor (GEM)'s [Global Coal Plant Tracker](https://www.globalenergymonitor.org/coal.html)\n", 13 | "- USA's [CAMPD emissions data](https://campd.epa.gov/data)\n", 14 | "- OSM's [cooling_tower](https://wiki.openstreetmap.org/wiki/Tag:man_made%3Dcooling_tower) tag" 15 | ] 16 | }, 17 | { 18 | "attachments": {}, 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Setup" 23 | ] 24 | }, 25 | { 26 | "attachments": {}, 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "### Imports" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import pandas as pd\n", 40 | "import geopandas as gpd\n", 41 | "import plotly.express as px" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "from coal_emissions_monitoring.data_cleaning import (\n", 51 | " load_clean_gcpt_gdf,\n", 52 | " load_clean_campd_facilities_gdf,\n", 53 | " load_clean_campd_emissions_df,\n", 54 | " load_osm_data,\n", 55 | ")" 56 | ] 57 | }, 58 | { 59 | "attachments": {}, 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Parameters" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# show all columns in pandas\n", 73 | "pd.set_option(\"display.max_columns\", None)" 74 | ] 75 | }, 76 | { 77 | "attachments": {}, 78 | "cell_type": 
"markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Load data" 82 | ] 83 | }, 84 | { 85 | "attachments": {}, 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### GEM Global Coal Plant Tracker" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "gcpt_df = load_clean_gcpt_gdf(\"/Users/adminuser/Downloads/Global-Coal-Plant-Tracker-January-2023.xlsx\")\n", 99 | "gcpt_df" 100 | ] 101 | }, 102 | { 103 | "attachments": {}, 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "### CAMPD facilities metadata" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "campd_facilities_df = load_clean_campd_facilities_gdf(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv\")\n", 117 | "campd_facilities_df" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "campd_facilities_df.capacity_mw.describe()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# find distance to the nearest facility\n", 136 | "for facility_id in campd_facilities_df.facility_id:\n", 137 | " campd_facilities_df.loc[\n", 138 | " campd_facilities_df.facility_id == facility_id,\n", 139 | " \"dist_to_nearest_facility\"\n", 140 | " ] = gpd.sjoin_nearest(\n", 141 | " campd_facilities_df.loc[campd_facilities_df.facility_id == facility_id],\n", 142 | " campd_facilities_df.loc[campd_facilities_df.facility_id != facility_id],\n", 143 | " distance_col=\"dist\",\n", 144 | " ).dist.min()\n", 145 | "campd_facilities_df.groupby(\"facility_id\").dist_to_nearest_facility.min().sort_values()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | 
"execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "campd_facilities_df[campd_facilities_df.year == 2023].explore()" 155 | ] 156 | }, 157 | { 158 | "attachments": {}, 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "### CAMPD emissions data" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "campd_emissions_df = load_clean_campd_emissions_df(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/daily_emissions_facility_aggregation.csv\")\n", 172 | "campd_emissions_df" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "campd_emissions_df[\"year\"] = campd_emissions_df[\"date\"].dt.year\n", 182 | "yearly_emissions = campd_emissions_df.groupby(\"year\").co2_mass_short_tons.mean()\n", 183 | "yearly_emissions" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "px.line(campd_emissions_df, x=\"date\", y=\"co2_mass_short_tons\", color=\"facility_name\")" 193 | ] 194 | }, 195 | { 196 | "attachments": {}, 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### OSM cooling_tower tag" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "osm_gdf = load_osm_data()\n", 210 | "osm_gdf" 211 | ] 212 | }, 213 | { 214 | "attachments": {}, 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "## Match data" 219 | ] 220 | }, 221 | { 222 | "attachments": {}, 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "### CAMPD facilities metadata and emissions" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | 
"metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "campd_emissions_df[\"year\"] = pd.to_datetime(campd_emissions_df[\"date\"].dt.year, format=\"%Y\")\n", 236 | "campd_gdf = pd.merge(\n", 237 | " campd_facilities_df,\n", 238 | " campd_emissions_df,\n", 239 | " on=[\"facility_id\", \"year\"],\n", 240 | " how=\"inner\",\n", 241 | " suffixes=(\"_delete\", \"\"),\n", 242 | ")\n", 243 | "campd_gdf = campd_gdf.drop(columns=[col for col in campd_gdf.columns if \"_delete\" in col])\n", 244 | "campd_gdf" 245 | ] 246 | }, 247 | { 248 | "attachments": {}, 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "### CAMPD data and OSM cooling_tower tag" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "campd_ndt_gdf = gpd.sjoin_nearest(campd_gdf, osm_gdf, how=\"inner\", distance_col=\"distances\", max_distance=0.01)\n", 262 | "campd_ndt_gdf" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "campd_ndt_gdf.distances.describe()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "ndt_plants = campd_ndt_gdf.facility_id.nunique()\n", 281 | "ndt_plants" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [] 290 | } 291 | ], 292 | "metadata": { 293 | "kernelspec": { 294 | "display_name": "ccai_ss23", 295 | "language": "python", 296 | "name": "python3" 297 | }, 298 | "language_info": { 299 | "codemirror_mode": { 300 | "name": "ipython", 301 | "version": 3 302 | }, 303 | "file_extension": ".py", 304 | "mimetype": "text/x-python", 305 | "name": "python", 306 | "nbconvert_exporter": "python", 307 | "pygments_lexer": "ipython3", 308 | "version": "3.10.9" 309 | }, 310 | 
"orig_nbformat": 4 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 2 314 | } 315 | -------------------------------------------------------------------------------- /notebooks/02_satellite_imagery.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Satellite imagery collection and processing\n", 9 | "---\n", 10 | "\n", 11 | "Experimenting with filtering, downloading and displaying Sentinel 2 images from the [AWS STAC of Cloud-Optimized GeoTIFFs](https://registry.opendata.aws/sentinel-2-l2a-cogs/)" 12 | ] 13 | }, 14 | { 15 | "attachments": {}, 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Setup" 20 | ] 21 | }, 22 | { 23 | "attachments": {}, 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### Imports" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from coal_emissions_monitoring.data_cleaning import (\n", 37 | " load_clean_campd_facilities_gdf,\n", 38 | " load_clean_campd_emissions_df,\n", 39 | " load_clean_image_metadata_df,\n", 40 | " get_final_dataset\n", 41 | ")\n", 42 | "from coal_emissions_monitoring.satellite_imagery import (\n", 43 | " create_aoi_for_plants,\n", 44 | " get_image_metadata_for_plants,\n", 45 | " get_image_from_cog\n", 46 | ")\n", 47 | "from coal_emissions_monitoring.data_viz import view_satellite_image" 48 | ] 49 | }, 50 | { 51 | "attachments": {}, 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Parameters" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "cloud_filter_percent = 25" 65 | ] 66 | }, 67 | { 68 | "attachments": {}, 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## Load CAMPD 
data" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "campd_facilities_gdf = load_clean_campd_facilities_gdf(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv\")\n", 82 | "campd_facilities_gdf" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "campd_facilities_gdf = create_aoi_for_plants(campd_facilities_gdf)\n", 92 | "campd_facilities_gdf" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "campd_facilities_gdf.geometry.explore()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "campd_emissions_df = load_clean_campd_emissions_df(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/daily_emissions_facility_aggregation.csv\")\n", 111 | "campd_emissions_df" 112 | ] 113 | }, 114 | { 115 | "attachments": {}, 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "## Filter emissions data to days when a cloudless image is available" 120 | ] 121 | }, 122 | { 123 | "attachments": {}, 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "### Get image metadata for every power plant" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "image_metadata_df = get_image_metadata_for_plants(campd_facilities_gdf, max_cloud_cover_prct=cloud_filter_percent)\n", 137 | "image_metadata_df.to_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/image_metadata.csv\", index=False)\n", 138 | "image_metadata_df" 139 | ] 140 | }, 141 | { 142 | "attachments": {}, 143 | "cell_type": "markdown", 144 | "metadata": {}, 
145 | "source": [ 146 | "### Join with emissions data" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "df = get_final_dataset(\n", 156 | " image_metadata_path=\"/home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/image_metadata.csv\",\n", 157 | " campd_facilities_path=\"https://drive.google.com/file/d/1b-5BriZUiiv2r0wFLubccLQpd2xb5ysl/view?usp=share_link\",\n", 158 | " campd_emissions_path=\"https://drive.google.com/file/d/1oxZXR7GDcSXwwVoIjp66iS179cFVA5dP/view?usp=share_link\",\n", 159 | " cog_type=\"all\",\n", 160 | ")\n", 161 | "df" 162 | ] 163 | }, 164 | { 165 | "attachments": {}, 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## Download and display images" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "image = get_image_from_cog(cog_url=image_metadata_df.cog_url.iloc[0], geometry=campd_facilities_gdf.geometry.iloc[0])" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "image.shape, image.min(), image.mean(), image.max()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "view_satellite_image(image)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 209 | "display_name": "ccai_ss23", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | 
"pygments_lexer": "ipython3", 223 | "version": "3.10.9" 224 | }, 225 | "orig_nbformat": 4 226 | }, 227 | "nbformat": 4, 228 | "nbformat_minor": 2 229 | } -------------------------------------------------------------------------------- /notebooks/03_split_data_for_ml.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Split data for machine learning\n", 9 | "---\n", 10 | "\n", 11 | "Experimenting with splitting the data for machine learning model training." 12 | ] 13 | }, 14 | { 15 | "attachments": {}, 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Setup" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Imports" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import numpy as np\n", 36 | "import geopandas as gpd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from coal_emissions_monitoring.data_cleaning import get_final_dataset\n", 46 | "from coal_emissions_monitoring.ml_utils import get_facility_set_mapper, split_data_in_sets" 47 | ] 48 | }, 49 | { 50 | "attachments": {}, 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Parameters" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "train_val_ratio = 0.8\n", 64 | "test_data_year = 2023" 65 | ] 66 | }, 67 | { 68 | "attachments": {}, 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## Load data" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "df = 
get_final_dataset(\n", 82 | " image_metadata_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/image_metadata.csv\",\n", 83 | " campd_facilities_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv\",\n", 84 | " campd_emissions_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/daily_emissions_facility_aggregation.csv\",\n", 85 | ")\n", 86 | "df" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "df.to_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/final_dataset.csv\", index=False)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "df.co2_mass_short_tons.value_counts()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "df.isna().sum()" 114 | ] 115 | }, 116 | { 117 | "attachments": {}, 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "## Split data" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "facility_set_mapper = get_facility_set_mapper(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv\")\n", 131 | "df[\"data_set\"] = df.apply(lambda row: split_data_in_sets(row=row, data_set_mapper=facility_set_mapper, test_year=test_data_year), axis=1)\n", 132 | "df" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "df.data_set.value_counts()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "df.data_set.value_counts() / df.shape[0]" 
151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "for data_set in df.data_set.unique():\n", 160 | " print(data_set)\n", 161 | " print(df[df.data_set == data_set].ts.dt.year.value_counts())\n", 162 | " print()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "ccai_ss23", 176 | "language": "python", 177 | "name": "python3" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": { 181 | "name": "ipython", 182 | "version": 3 183 | }, 184 | "file_extension": ".py", 185 | "mimetype": "text/x-python", 186 | "name": "python", 187 | "nbconvert_exporter": "python", 188 | "pygments_lexer": "ipython3", 189 | "version": "3.10.9" 190 | }, 191 | "orig_nbformat": 4 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 2 195 | } -------------------------------------------------------------------------------- /notebooks/04_dataset_loading.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Dataset loading\n", 9 | "---\n", 10 | "Experimenting with loading the PyTorch Lightning dataset and visualising its outputs." 
11 | ] 12 | }, 13 | { 14 | "attachments": {}, 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Setup" 19 | ] 20 | }, 21 | { 22 | "attachments": {}, 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Imports" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "from coal_emissions_monitoring.dataset import CoalEmissionsDataModule\n", 36 | "from coal_emissions_monitoring.data_viz import view_satellite_image" 37 | ] 38 | }, 39 | { 40 | "attachments": {}, 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Create the dataset" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "data = CoalEmissionsDataModule(\n", 54 | " image_metadata_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/image_metadata.csv\",\n", 55 | " campd_facilities_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/facility_attributes.csv\",\n", 56 | " campd_emissions_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/daily_emissions_facility_aggregation.csv\",\n", 57 | " batch_size=2,\n", 58 | " predownload_images=True,\n", 59 | " images_dir=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/images\",\n", 60 | ")\n", 61 | "data.setup(stage=\"fit\")" 62 | ] 63 | }, 64 | { 65 | "attachments": {}, 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Load some batches" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "for batch in data.train_dataloader():\n", 79 | " break\n", 80 | "print(f\"Keys in batch: {batch.keys()}\")\n", 81 | "print(f\"Image shape: {batch['image'].shape}\")" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | 
"metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "idx = 0\n", 91 | "print(f\"Target: {batch['target'][idx]}\")\n", 92 | "print(f\"Facility name: {batch['metadata']['facility_name'][idx]}\")\n", 93 | "print(f\"Timestamp: {batch['metadata']['ts'][idx]}\")\n", 94 | "view_satellite_image(batch[\"image\"][idx])" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "((batch[\"image\"][idx] <= 1) | (batch[\"image\"][idx].isnan())).sum() / batch[\"image\"][idx].numel()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "ccai_ss23", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.10.9" 131 | }, 132 | "orig_nbformat": 4 133 | }, 134 | "nbformat": 4, 135 | "nbformat_minor": 2 136 | } 137 | -------------------------------------------------------------------------------- /notebooks/05_images_download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Images download\n", 9 | "---\n", 10 | "\n", 11 | "Download all images before training models." 
12 | ] 13 | }, 14 | { 15 | "attachments": {}, 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Setup" 20 | ] 21 | }, 22 | { 23 | "attachments": {}, 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### Imports" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from tqdm.auto import tqdm" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "from coal_emissions_monitoring.constants import ALL_BANDS\n", 46 | "from coal_emissions_monitoring.data_cleaning import get_final_dataset\n", 47 | "from coal_emissions_monitoring.satellite_imagery import fetch_image_path_from_cog" 48 | ] 49 | }, 50 | { 51 | "attachments": {}, 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Get final datase" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "df = get_final_dataset(\n", 65 | " image_metadata_path=\"/home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/image_metadata.csv\",\n", 66 | " campd_facilities_path=\"https://drive.google.com/file/d/1b-5BriZUiiv2r0wFLubccLQpd2xb5ysl/view?usp=share_link\",\n", 67 | " campd_emissions_path=\"https://drive.google.com/file/d/1oxZXR7GDcSXwwVoIjp66iS179cFVA5dP/view?usp=share_link\",\n", 68 | " cog_type=\"all\",\n", 69 | ")\n", 70 | "df" 71 | ] 72 | }, 73 | { 74 | "attachments": {}, 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Download images" 79 | ] 80 | }, 81 | { 82 | "attachments": {}, 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "### TCI (True Color Image)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "tqdm.pandas(desc=\"Downloading visual 
images\")\n", 96 | "df[\"local_image_path\"] = df.progress_apply(\n", 97 | " lambda row: fetch_image_path_from_cog(\n", 98 | " cog_url=row.cog_url,\n", 99 | " geometry=row.geometry,\n", 100 | " images_dir=\"/home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/visual/\",\n", 101 | " download_missing_images=True,\n", 102 | " ),\n", 103 | " axis=1,\n", 104 | ")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# compress all images into one file\n", 114 | "!tar -czvf /home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/visual_images.tar.gz /home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/visual" 115 | ] 116 | }, 117 | { 118 | "attachments": {}, 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "### All bands" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "tqdm.pandas(desc=\"Downloading all bands images\")\n", 132 | "df[\"local_image_all_bands_path\"] = df.progress_apply(\n", 133 | " lambda row: fetch_image_path_from_cog(\n", 134 | " cog_url=[row[band] for band in ALL_BANDS],\n", 135 | " geometry=row.geometry,\n", 136 | " cog_type=\"all\",\n", 137 | " images_dir=\"/home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/all_bands/\",\n", 138 | " download_missing_images=True,\n", 139 | " ),\n", 140 | " axis=1,\n", 141 | ")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# compress all images into one file\n", 151 | "!tar -czvf /home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/all_bands_images.tar.gz /home/adminuser/ccai-ss23-ai-monitoring-tutorial/data/images/all_bands" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 
| "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "ccai_ss23", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.10.10" 179 | }, 180 | "orig_nbformat": 4 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /notebooks/06_model_training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Model training\n", 9 | "---\n", 10 | "\n", 11 | "Experimenting with training some models over the dataset." 12 | ] 13 | }, 14 | { 15 | "attachments": {}, 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Setup" 20 | ] 21 | }, 22 | { 23 | "attachments": {}, 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### Imports" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import timm\n", 37 | "from lightning import Trainer\n", 38 | "from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from coal_emissions_monitoring.dataset import CoalEmissionsDataModule\n", 48 | "from coal_emissions_monitoring.model import CoalEmissionsModel, SmallCNN\n", 49 | "from coal_emissions_monitoring.transforms import efficientnet_transform" 50 | ] 51 | }, 52 | { 53 | "attachments": {}, 54 | "cell_type": "markdown", 55 | 
"metadata": {}, 56 | "source": [ 57 | "### Parameters" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "batch_size = 128\n", 67 | "crop_size = 52\n", 68 | "num_workers = 0\n", 69 | "learning_rate = 1e-3" 70 | ] 71 | }, 72 | { 73 | "attachments": {}, 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Create the dataset" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "data = CoalEmissionsDataModule(\n", 87 | " final_dataset_path=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/final_dataset.csv\",\n", 88 | " batch_size=batch_size,\n", 89 | " num_workers=num_workers,\n", 90 | " predownload_images=True,\n", 91 | " download_missing_images=False,\n", 92 | " images_dir=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/visual\",\n", 93 | " crop_size=crop_size,\n", 94 | ")" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "data.setup(\"fit\")" 104 | ] 105 | }, 106 | { 107 | "attachments": {}, 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Create the model" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "# model = timm.create_model(\"efficientnet_b0\", pretrained=True, num_classes=1)\n", 121 | "model = SmallCNN(num_input_channels=3, num_classes=1)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "model = model.float().to(\"cpu\")" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "lit_model 
= CoalEmissionsModel(\n", 140 | " model=model,\n", 141 | " learning_rate=learning_rate,\n", 142 | " pos_weight=data.pos_weight,\n", 143 | ")" 144 | ] 145 | }, 146 | { 147 | "attachments": {}, 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "## Confirm that the model can be run on a batch of data" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "data.setup(stage=\"fit\")\n", 161 | "for batch in data.train_dataloader():\n", 162 | " break\n", 163 | "print(f\"Keys in batch: {batch.keys()}\")\n", 164 | "print(f\"Image shape: {batch['image'].shape}\")" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "y_pred = lit_model(batch[\"image\"])\n", 174 | "y_pred" 175 | ] 176 | }, 177 | { 178 | "attachments": {}, 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "## Check that the model can overfit a single batch" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "trainer = Trainer(\n", 192 | " max_epochs=1,\n", 193 | " callbacks=[\n", 194 | " EarlyStopping(monitor=\"val_loss\", mode=\"min\", patience=10),\n", 195 | " ModelCheckpoint(\n", 196 | " monitor=\"val_loss\",\n", 197 | " mode=\"min\",\n", 198 | " filename=\"{val_loss:2f}-{val_balanced_accuracy:.2f}-{epoch}-64crop_full_data\",\n", 199 | " save_top_k=1,\n", 200 | " dirpath=\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/models/\",\n", 201 | " )\n", 202 | " ],\n", 203 | " limit_train_batches=round(0.1 * len(data.train_dataset.gdf) / batch_size),\n", 204 | " limit_val_batches=round(0.4 * len(data.val_dataset.gdf) / batch_size),\n", 205 | " reload_dataloaders_every_n_epochs=1,\n", 206 | " precision=\"16-mixed\",\n", 207 | " accelerator=\"cpu\",\n", 208 | " 
devices=1,\n", 209 | " log_every_n_steps=5,\n", 210 | " # overfit_batches=1,\n", 211 | ")\n", 212 | "trainer.fit(lit_model, data)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "_ = trainer.test(\n", 222 | " model=lit_model,\n", 223 | " datamodule=data,\n", 224 | " ckpt_path=\"best\",\n", 225 | " verbose=True,\n", 226 | ")" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [] 235 | } 236 | ], 237 | "metadata": { 238 | "kernelspec": { 239 | "display_name": "ccai_ss23", 240 | "language": "python", 241 | "name": "python3" 242 | }, 243 | "language_info": { 244 | "codemirror_mode": { 245 | "name": "ipython", 246 | "version": 3 247 | }, 248 | "file_extension": ".py", 249 | "mimetype": "text/x-python", 250 | "name": "python", 251 | "nbconvert_exporter": "python", 252 | "pygments_lexer": "ipython3", 253 | "version": "3.10.9" 254 | }, 255 | "orig_nbformat": 4 256 | }, 257 | "nbformat": 4, 258 | "nbformat_minor": 2 259 | } 260 | -------------------------------------------------------------------------------- /notebooks/google_cooling_tower_on_off_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import geopandas as gpd\n", 11 | "from tqdm.auto import tqdm" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from coal_emissions_monitoring.satellite_imagery import (\n", 21 | " create_aoi_for_plants,\n", 22 | " get_image_metadata_for_plants,\n", 23 | " get_image_from_cog\n", 24 | ")\n", 25 | "from coal_emissions_monitoring.data_viz import view_satellite_image" 26 | ] 27 | }, 28 | { 29 | 
"cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "df = pd.read_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/labeled_geospatial_data.csv\")\n", 35 | "df" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "df.is_powered_on.value_counts()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# get unique combinations of lat/lon\n", 54 | "unique_coords = df[[\"lat\", \"lon\"]].drop_duplicates().reset_index(drop=True)\n", 55 | "unique_coords.reset_index(inplace=True)\n", 56 | "unique_coords.set_index([\"lat\", \"lon\"], inplace=True)\n", 57 | "unique_coords = unique_coords[\"index\"].to_dict()\n", 58 | "unique_coords" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# set an epsg code for each unique lat/lon\n", 68 | "df[\"facility_id\"] = df.apply(\n", 69 | " lambda x: unique_coords[(x[\"lat\"], x[\"lon\"])], axis=1\n", 70 | ")\n", 71 | "df" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "df.facility_id.value_counts()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "df.rename(columns={\"lat\": \"latitude\", \"lon\": \"longitude\"}, inplace=True)\n", 90 | "df" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "df.rename(columns={\"timestamp\": \"ts\"}, inplace=True)\n", 100 | "df.ts = pd.to_datetime(df.ts)\n", 101 | "df.dtypes" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 
| "outputs": [], 109 | "source": [ 110 | "gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs=\"EPSG:4326\")\n", 111 | "gdf" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "gdf = create_aoi_for_plants(gdf)\n", 121 | "gdf" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "gdf.geometry.explore()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# image_metadata_df = get_image_metadata_for_plants(\n", 140 | "# gdf,\n", 141 | "# start_date=gdf.ts.min(),\n", 142 | "# end_date=gdf.ts.max(),\n", 143 | "# max_cloud_cover_prct=50,\n", 144 | "# )\n", 145 | "image_metadata_df = pd.read_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/image_metadata.csv\")\n", 146 | "image_metadata_df.ts = pd.to_datetime(image_metadata_df.ts)\n", 147 | "image_metadata_df" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "# filter the image metadata to match the day of each row of gdf\n", 157 | "image_metadata_df[\"date\"] = image_metadata_df.ts.dt.date\n", 158 | "gdf[\"date\"] = gdf.ts.dt.date\n", 159 | "image_metadata_df = image_metadata_df.merge(\n", 160 | " gdf[[\"facility_id\", \"date\"]], on=[\"facility_id\", \"date\"]\n", 161 | ")\n", 162 | "image_metadata_df" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "image_metadata_df.to_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/image_metadata.csv\", index=False)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | 
"metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "gdf.merge(\n", 181 | " image_metadata_df.drop(columns=[\"ts\"]),\n", 182 | " on=[\"facility_id\", \"date\"]\n", 183 | ").to_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/all_urls_dataset.csv\", index=False)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "gdf = gdf.merge(\n", 193 | " image_metadata_df[[\"facility_id\", \"date\", \"cloud_cover\", \"visual\"]],\n", 194 | " on=[\"facility_id\", \"date\"]\n", 195 | ")\n", 196 | "gdf.rename(columns={\"visual\": \"cog_url\"}, inplace=True)\n", 197 | "gdf.drop(columns=[\"date\"], inplace=True)\n", 198 | "gdf" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "gdf.sort_values(by=[\"facility_id\", \"ts\"], inplace=True)\n", 208 | "gdf.to_csv(\"/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/final_dataset.csv\", index=False)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "row = gdf.iloc[0]\n", 218 | "image = get_image_from_cog(\n", 219 | " cog_url=row.cog_url,\n", 220 | " geometry=row.geometry,\n", 221 | " size=64,\n", 222 | ")\n", 223 | "image.shape" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "view_satellite_image(image)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [] 241 | } 242 | ], 243 | "metadata": { 244 | "kernelspec": { 245 | "display_name": "ccai_ss23", 246 | "language": "python", 247 | "name": "python3" 248 | }, 249 | "language_info": { 250 | "codemirror_mode": { 251 | "name": "ipython", 252 | "version": 
3 253 | }, 254 | "file_extension": ".py", 255 | "mimetype": "text/x-python", 256 | "name": "python", 257 | "nbconvert_exporter": "python", 258 | "pygments_lexer": "ipython3", 259 | "version": "3.10.9" 260 | }, 261 | "orig_nbformat": 4 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 2 265 | } 266 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | overpy==0.6 2 | pandas==1.5.3 3 | geopandas==0.12.2 4 | openpyxl==3.1.2 5 | requests==2.28.2 6 | folium==0.14.0 7 | mapclassify==2.5.0 8 | matplotlib==3.7.1 9 | plotly==5.14.1 10 | nbformat==5.8.0 11 | pystac-client==0.6.1 12 | rasterio==1.3.6 13 | loguru==0.6.0 14 | tqdm==4.65.0 15 | black==23.3.0 16 | flake8==6.0.0 17 | torch==2.0.0 18 | lightning==2.0.1.post0 19 | kornia==0.6.11 20 | timm==0.6.13 21 | backoff==2.2.1 -------------------------------------------------------------------------------- /scripts/download_images.py: -------------------------------------------------------------------------------- 1 | # %% [markdown] 2 | # # Images download 3 | # --- 4 | # 5 | # Download all images before training models. 
6 | 7 | # %% [markdown] 8 | # ## Setup 9 | 10 | # %% [markdown] 11 | # ### Imports 12 | 13 | # %% 14 | import os 15 | from tqdm.auto import tqdm 16 | 17 | # %% 18 | from coal_emissions_monitoring.constants import ALL_BANDS, MAIN_COLUMNS 19 | from coal_emissions_monitoring.data_cleaning import load_final_dataset 20 | from coal_emissions_monitoring.satellite_imagery import fetch_image_path_from_cog 21 | 22 | # %% [markdown] 23 | # ## Get final dataset 24 | 25 | # %% 26 | gdf = load_final_dataset( 27 | "/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/all_urls_dataset.csv" 28 | ) 29 | 30 | # %% [markdown] 31 | # ## Download images 32 | 33 | # %% [markdown] 34 | # ### TCI (True Color Image) 35 | 36 | # %% 37 | tqdm.pandas(desc="Downloading visual images") 38 | gdf["local_image_path"] = gdf.progress_apply( 39 | lambda row: fetch_image_path_from_cog( 40 | cog_url=row.visual, 41 | geometry=row.geometry, 42 | cog_type="visual", 43 | images_dir="/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/visual/", 44 | download_missing_images=True, 45 | ), 46 | axis=1, 47 | ) 48 | 49 | # %% 50 | path = "/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/" 51 | os.makedirs(path, exist_ok=True) 52 | gdf.rename(columns={"visual": "cog_url"})[MAIN_COLUMNS + ["local_image_path"]].to_csv( 53 | f"{path}final_dataset.csv", 54 | index=False, 55 | ) 56 | 57 | # %% 58 | # compress all images into one file 59 | os.system( 60 | "tar -czvf /Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/visual_images.tar.gz /Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/visual" 61 | ) 62 | 63 | # %% [markdown] 64 | # ### All bands 65 | 66 | # %% 67 | tqdm.pandas(desc="Downloading all bands images") 68 | gdf["local_image_all_bands_path"] = gdf.progress_apply( 69 | lambda row: fetch_image_path_from_cog( 70 | cog_url=[row[band] for band in ALL_BANDS], 71 | geometry=row.geometry, 72 | size=32, # 
smaller images to make the download faster 73 | cog_type="all", 74 | images_dir="/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/all_bands/", 75 | download_missing_images=True, 76 | ), 77 | axis=1, 78 | ) 79 | 80 | # %% 81 | # compress all images into one file 82 | os.system( 83 | "tar -czvf /Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/all_bands_images.tar.gz /Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/images/all_bands" 84 | ) 85 | 86 | # %% 87 | path = "/Users/adminuser/GitHub/ccai-ss23-ai-monitoring-tutorial/data/google/" 88 | os.makedirs(path, exist_ok=True) 89 | gdf.rename(columns={"visual": "cog_url"})[ 90 | MAIN_COLUMNS + ["local_image_path", "local_image_all_bands_path"] 91 | ].to_csv( 92 | f"{path}final_dataset.csv", 93 | index=False, 94 | ) 95 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = coal-emissions-monitoring 3 | version = 0.0.1 4 | description = A data science project to monitor coal power emissions 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | url = https://github.com/AndreCNF/ccai-ss23-ai-monitoring-tutorial 8 | 9 | [options] 10 | package_dir = 11 | = src 12 | packages = find: 13 | python_requires = >=3.7 14 | install_requires = 15 | overpy 16 | pandas 17 | geopandas 18 | openpyxl 19 | requests 20 | folium 21 | mapclassify 22 | matplotlib 23 | plotly 24 | nbformat>=4.2.0 25 | pystac-client 26 | rasterio 27 | loguru 28 | tqdm 29 | torch 30 | lightning 31 | kornia 32 | timm 33 | backoff 34 | 35 | [options.packages.find] 36 | where=src -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | 
use_scm_version=True, 5 | setup_requires=["setuptools_scm"], 6 | ) 7 | -------------------------------------------------------------------------------- /src/coal_emissions_monitoring/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreCNF/ccai-ss23-ai-monitoring-tutorial/8b7ba2d2b11175c8f12f87b22a7d18acb6c8628e/src/coal_emissions_monitoring/__init__.py -------------------------------------------------------------------------------- /src/coal_emissions_monitoring/constants.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | GLOBAL_EPSG = 4326 4 | API_URL = "https://earth-search.aws.element84.com/v0" 5 | COLLECTION = "sentinel-s2-l2a-cogs" # Sentinel-2, Level 2A, COGs 6 | AOI_SIZE_METERS = 640 7 | IMAGE_SIZE_PX = 64 8 | CROP_SIZE_PX = 52 9 | START_DATE = datetime(year=2016, month=1, day=1) 10 | END_DATE = datetime(year=2019, month=12, day=31) 11 | MAX_DARK_FRAC = 0.5 12 | MAX_BRIGHT_MEAN = 250 13 | MAX_CLOUD_COVER_PRCT = 50 14 | TRAIN_VAL_RATIO = 0.8 15 | TEST_YEAR = 2020 16 | BATCH_SIZE = 32 17 | MAIN_COLUMNS = [ 18 | "facility_id", 19 | "latitude", 20 | "longitude", 21 | "ts", 22 | "is_powered_on", 23 | "cloud_cover", 24 | "cog_url", 25 | "geometry", 26 | ] 27 | ALL_BANDS = [ 28 | "b01", 29 | "b02", 30 | "b03", 31 | "b04", 32 | "b05", 33 | "b06", 34 | "b07", 35 | "b08", 36 | "b8a", 37 | "b09", 38 | "b11", 39 | "b12", 40 | ] 41 | EMISSIONS_TARGET = "is_powered_on" 42 | EMISSIONS_CATEGORIES = { 43 | 0: "no_emissions", 44 | 1: "low", 45 | 2: "medium", 46 | 3: "high", 47 | 4: "very_high", 48 | } 49 | RANDOM_TRANSFORM_PROB = 0.5 50 | POSITIVE_THRESHOLD = 0.5 51 | -------------------------------------------------------------------------------- /src/coal_emissions_monitoring/data_cleaning.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import 
warnings 3 | from typing import Callable, Optional, Union 4 | 5 | import pandas as pd 6 | import geopandas as gpd 7 | import overpy 8 | 9 | from coal_emissions_monitoring.constants import ALL_BANDS, GLOBAL_EPSG, MAIN_COLUMNS 10 | from coal_emissions_monitoring.satellite_imagery import create_aoi_for_plants 11 | 12 | OSM_API = overpy.Overpass() 13 | 14 | # suppress geopandas CRS warning as we don't need to worry too much about 15 | # the precision of distances 16 | warnings.filterwarnings("ignore", message="Geometry is in a geographic CRS*") 17 | # suppress pandas warning of setting value in copy 18 | warnings.filterwarnings("ignore", message="A value is trying to be set on a copy*") 19 | # suppress pandas warning on regex 20 | warnings.filterwarnings("ignore", message="The default value of regex will change*") 21 | 22 | 23 | def clean_column_names( 24 | df: Union[pd.DataFrame, gpd.GeoDataFrame] 25 | ) -> Union[pd.DataFrame, gpd.GeoDataFrame]: 26 | """ 27 | Clean column names in a data frame. 28 | 29 | Args: 30 | df (Union[pd.DataFrame, gpd.GeoDataFrame]): 31 | Data frame to clean 32 | 33 | Returns: 34 | df (Union[pd.DataFrame, gpd.GeoDataFrame]): 35 | Cleaned data frame 36 | """ 37 | df.columns = ( 38 | df.columns.str.lower() 39 | .str.replace(" ", "_") 40 | .str.replace("(", "") 41 | .str.replace(")", "") 42 | .str.replace("/", "_") 43 | .str.replace("-", "_") 44 | .str.replace(",", "_") 45 | ) 46 | return df 47 | 48 | 49 | def fix_google_drive_url(url: str) -> str: 50 | """ 51 | Fix a Google Drive URL. 52 | 53 | Args: 54 | url (str): 55 | URL to fix 56 | 57 | Returns: 58 | url (str): 59 | Fixed URL 60 | """ 61 | assert url.startswith( 62 | "https://drive.google.com/file/d/" 63 | ), "URL must start with https://drive.google.com/file/d/" 64 | return "https://drive.google.com/uc?id=" + url.split("/")[-2] 65 | 66 | 67 | def load_csv(path: str) -> pd.DataFrame: 68 | """ 69 | Load a CSV file. 
70 | 71 | Args: 72 | path (str): 73 | Path to CSV file 74 | 75 | Returns: 76 | df (pd.DataFrame): 77 | Data frame 78 | """ 79 | if path.startswith("https://drive.google.com/file/d/"): 80 | return pd.read_csv(fix_google_drive_url(path)) 81 | else: 82 | return pd.read_csv(path) 83 | 84 | 85 | def load_clean_data_df( 86 | data_path: Union[str, Path], 87 | load_func: Optional[Callable] = load_csv, 88 | clean_func: Optional[Callable] = clean_column_names, 89 | ) -> pd.DataFrame: 90 | """ 91 | Load and clean a data frame. 92 | 93 | Args: 94 | data_path (Union[str, Path]): 95 | Path to data 96 | load_func (Optional[Callable]): 97 | Function to load data 98 | clean_func (Optional[Callable]): 99 | Function to clean data 100 | 101 | Returns: 102 | df (pd.DataFrame): 103 | Cleaned data frame 104 | """ 105 | df = load_func(data_path) 106 | df = clean_func(df) 107 | return df 108 | 109 | 110 | def load_clean_data_gdf( 111 | data_path: Union[str, Path], 112 | load_func: Optional[Callable] = load_csv, 113 | clean_func: Optional[Callable] = clean_column_names, 114 | ) -> gpd.GeoDataFrame: 115 | """ 116 | Load and clean a data frame, outputting it as a GeoDataFrame. 117 | 118 | Args: 119 | data_path (Union[str, Path]): 120 | Path to data 121 | load_func (Optional[Callable]): 122 | Function to load data 123 | clean_func (Optional[Callable]): 124 | Function to clean data 125 | 126 | Returns: 127 | gdf (gpd.GeoDataFrame): 128 | Cleaned data frame 129 | """ 130 | df = load_clean_data_df( 131 | data_path=data_path, load_func=load_func, clean_func=clean_func 132 | ) 133 | gdf = gpd.GeoDataFrame( 134 | df, 135 | geometry=gpd.points_from_xy( 136 | df["longitude"], 137 | df["latitude"], 138 | ), 139 | crs=f"EPSG:{GLOBAL_EPSG}", 140 | ) 141 | return gdf 142 | 143 | 144 | def load_raw_gcpt_data(gcpt_path: Union[str, Path]) -> pd.DataFrame: 145 | """ 146 | Load GCPT data in its raw excel format from GCS. 
147 | 148 | Returns: 149 | df (pd.DataFrame): 150 | GCPT data frame 151 | """ 152 | df = pd.read_excel( 153 | gcpt_path, 154 | sheet_name="Units", 155 | ) 156 | return df 157 | 158 | 159 | def clean_gcpt(df: pd.DataFrame) -> pd.DataFrame: 160 | """ 161 | Clean the GCPT data frame, setting better column names. 162 | 163 | Args: 164 | df (pd.DataFrame): 165 | GCPT data frame 166 | 167 | Returns: 168 | df (pd.DataFrame): 169 | Cleaned GCPT data frame 170 | """ 171 | df = clean_column_names(df) 172 | df.rename(columns={"parentid": "parent_id"}, inplace=True) 173 | df.rename(columns={"trackerloc": "tracker_loc"}, inplace=True) 174 | return df 175 | 176 | 177 | def load_clean_gcpt_gdf(gcpt_path: Union[str, Path]) -> gpd.GeoDataFrame: 178 | """ 179 | Load and clean the GCPT data frame. 180 | 181 | Args: 182 | gcpt_path (Union[str, Path]): 183 | Path to GCPT data 184 | 185 | Returns: 186 | gdf (gpd.GeoDataFrame): 187 | Cleaned GCPT data frame 188 | """ 189 | return load_clean_data_gdf( 190 | data_path=gcpt_path, load_func=load_raw_gcpt_data, clean_func=clean_gcpt 191 | ) 192 | 193 | 194 | def clean_campd_facilities(df: pd.DataFrame) -> pd.DataFrame: 195 | """ 196 | Clean the CAMPD facilities data frame. 
197 | 198 | Args: 199 | df (pd.DataFrame): 200 | CAMPD facilities data frame 201 | 202 | Returns: 203 | df (pd.DataFrame): 204 | Cleaned CAMPD facilities data frame 205 | """ 206 | df = clean_column_names(df) 207 | # get the capacity 208 | df["capacity_mw"] = ( 209 | df["associated_generators_&_nameplate_capacity_mwe"] 210 | .str.split(" ") 211 | .str[-1] 212 | .str.replace("(", "") 213 | .str.replace(")", "") 214 | .astype(float) 215 | ) 216 | # filter to operating units 217 | df = df[(df.operating_status == "Operating") & (df.capacity_mw > 0)] 218 | # aggregate by facility 219 | df = df.groupby(["facility_id", "year"]).agg( 220 | { 221 | "capacity_mw": "sum", 222 | "facility_name": "first", 223 | "latitude": "mean", 224 | "longitude": "mean", 225 | } 226 | ) 227 | # rearrange columns 228 | df = df.reset_index()[ 229 | ["facility_id", "facility_name", "year", "capacity_mw", "latitude", "longitude"] 230 | ] 231 | # fix datetime column data type 232 | df.year = pd.to_datetime(df.year, format="%Y") 233 | return df 234 | 235 | 236 | def load_clean_campd_facilities_gdf( 237 | campd_facilities_path: Union[str, Path] 238 | ) -> gpd.GeoDataFrame: 239 | """ 240 | Load and clean the CAMPD facilities data frame. 241 | 242 | Args: 243 | campd_facilities_path (Union[str, Path]): 244 | Path to CAMPD facilities data 245 | 246 | Returns: 247 | gdf (gpd.GeoDataFrame): 248 | Cleaned CAMPD facilities data frame 249 | """ 250 | return load_clean_data_gdf( 251 | data_path=campd_facilities_path, 252 | load_func=load_csv, 253 | clean_func=clean_campd_facilities, 254 | ) 255 | 256 | 257 | def clean_campd_emissions(df: pd.DataFrame) -> pd.DataFrame: 258 | """ 259 | Clean the CAMPD emissions data frame. 
260 | 261 | Args: 262 | df (pd.DataFrame): 263 | CAMPD emissions data frame 264 | 265 | Returns: 266 | df (pd.DataFrame): 267 | Cleaned CAMPD emissions data frame 268 | """ 269 | df = clean_column_names(df) 270 | # fix datetime column data type 271 | df.date = pd.to_datetime(df.date) 272 | # fill missing values (emissions seem to be ignored if their value is 0) 273 | df = df.fillna(0) 274 | return df 275 | 276 | 277 | def load_clean_campd_emissions_df( 278 | campd_emissions_path: Union[str, Path] 279 | ) -> pd.DataFrame: 280 | """ 281 | Load and clean the CAMPD emissions data frame. 282 | 283 | Args: 284 | campd_emissions_path (Union[str, Path]): 285 | Path to CAMPD emissions data 286 | 287 | Returns: 288 | df (pd.DataFrame): 289 | Cleaned CAMPD emissions data frame 290 | """ 291 | return load_clean_data_df( 292 | data_path=campd_emissions_path, 293 | load_func=load_csv, 294 | clean_func=clean_campd_emissions, 295 | ) 296 | 297 | 298 | def load_osm_data( 299 | country: str = "United States", tag: str = "man_made", value: str = "cooling_tower" 300 | ) -> gpd.GeoDataFrame: 301 | """ 302 | Load OSM data. 
303 | 304 | Args: 305 | country (str): 306 | Country to filter to 307 | tag (str): 308 | OSM tag to filter to 309 | value (str): 310 | OSM value to filter to 311 | 312 | Returns: 313 | gdf (gpd.GeoDataFrame): 314 | OSM cooling towers data frame 315 | """ 316 | # load the data 317 | osm_results = OSM_API.query( 318 | query=f""" 319 | area[name="{country}"]->.searchArea; 320 | ( 321 | node["{tag}"="{value}"](area.searchArea); 322 | way["{tag}"="{value}"](area.searchArea); 323 | relation["{tag}"="{value}"](area.searchArea); 324 | ); 325 | out body; 326 | >; 327 | out skel qt; 328 | """ 329 | ) 330 | df = pd.DataFrame( 331 | [ 332 | { 333 | "osm_id": element.id, 334 | "latitude": element.lat, 335 | "longitude": element.lon, 336 | } 337 | for element in osm_results.nodes 338 | ] 339 | ) 340 | # convert to geodataframe 341 | gdf = gpd.GeoDataFrame( 342 | df, 343 | geometry=gpd.points_from_xy(df.longitude, df.latitude), 344 | crs="EPSG:4326", 345 | ) 346 | return gdf 347 | 348 | 349 | def filter_to_cooling_tower_plants( 350 | gdf: gpd.GeoDataFrame, 351 | campd_facilities_path: Union[str, Path], 352 | ) -> gpd.GeoDataFrame: 353 | """ 354 | Filter data to plants with cooling towers. 
355 | 356 | Args: 357 | gdf (gpd.GeoDataFrame): 358 | Data to be filtered 359 | campd_facilities_path (Union[str, Path]): 360 | Path to CAMPD facilities data 361 | 362 | Returns: 363 | gdf (gpd.GeoDataFrame): 364 | Filtered data 365 | """ 366 | # load the CAMPD facilities data 367 | campd_facilities_gdf = load_clean_campd_facilities_gdf(campd_facilities_path) 368 | # load the OSM data 369 | osm_gdf = load_osm_data() 370 | # spatial join 371 | campd_ndt_gdf = gpd.sjoin_nearest( 372 | campd_facilities_gdf, 373 | osm_gdf, 374 | how="inner", 375 | distance_col="distances", 376 | max_distance=0.01, 377 | ) 378 | # filter to plants with cooling towers 379 | gdf = gdf[gdf.facility_id.isin(campd_ndt_gdf.facility_id)] 380 | return gdf 381 | 382 | 383 | def clean_image_metadata(df: pd.DataFrame, cog_type: str = "visual") -> pd.DataFrame: 384 | """ 385 | Clean the image metadata data frame. 386 | 387 | Args: 388 | df (pd.DataFrame): 389 | Image metadata data frame 390 | cog_type (str): 391 | Type of COG to filter to. If "all", no filtering is done. 392 | 393 | Returns: 394 | df (pd.DataFrame): 395 | Cleaned image metadata data frame 396 | """ 397 | df = clean_column_names(df) 398 | # fix datetime column data type 399 | df.ts = pd.to_datetime(df.ts) 400 | # filter to most relevant columns 401 | if cog_type != "all": 402 | df.rename(columns={cog_type: "cog_url"}, inplace=True) 403 | df = df[["facility_id", "ts", "cloud_cover", "cog_url"]] 404 | else: 405 | df = df[ 406 | [ 407 | "facility_id", 408 | "ts", 409 | "cloud_cover", 410 | "visual", 411 | ] 412 | + ALL_BANDS 413 | ] 414 | return df 415 | 416 | 417 | def load_clean_image_metadata_df( 418 | image_metadata_path: Union[str, Path], cog_type: str = "visual" 419 | ) -> pd.DataFrame: 420 | """ 421 | Load and clean the image metadata data frame. 
422 | 423 | Args: 424 | image_metadata_path (Union[str, Path]): 425 | Path to image metadata data 426 | cog_type (str): 427 | Type of COG to filter to 428 | 429 | Returns: 430 | df (pd.DataFrame): 431 | Cleaned image metadata data frame 432 | """ 433 | return load_clean_data_df( 434 | data_path=image_metadata_path, 435 | load_func=load_csv, 436 | clean_func=lambda df: clean_image_metadata(df, cog_type=cog_type), 437 | ) 438 | 439 | 440 | def get_final_dataset( 441 | image_metadata_path: Union[str, Path], 442 | campd_facilities_path: Union[str, Path], 443 | campd_emissions_path: Union[str, Path], 444 | cog_type: str = "visual", 445 | ) -> gpd.GeoDataFrame: 446 | """ 447 | Get the final dataset that has the facility and image metadata, as well as 448 | the emissions data that we'll train models on. 449 | 450 | Args: 451 | image_metadata_path (Union[str, Path]): 452 | Path to image metadata data 453 | campd_facilities_path (Union[str, Path]): 454 | Path to CAMPD facilities data 455 | campd_emissions_path (Union[str, Path]): 456 | Path to CAMPD emissions data 457 | cog_type (str): 458 | Type of COG to filter to. If "all", no filtering is done. 
459 | 460 | Returns: 461 | gdf (gpd.GeoDataFrame): 462 | Final dataset that has the facility and image metadata, as well as 463 | the emissions data that we'll train models on 464 | """ 465 | # load all data 466 | image_metadata_df = load_clean_image_metadata_df( 467 | image_metadata_path=image_metadata_path, cog_type=cog_type 468 | ) 469 | campd_facilities_gdf = load_clean_campd_facilities_gdf( 470 | campd_facilities_path=campd_facilities_path 471 | ) 472 | campd_facilities_gdf = create_aoi_for_plants(campd_facilities_gdf) 473 | campd_emissions_df = load_clean_campd_emissions_df( 474 | campd_emissions_path=campd_emissions_path 475 | ) 476 | # remove the hour info from the date so as to join by day of the year 477 | image_metadata_df["date_without_time"] = image_metadata_df["ts"].dt.date 478 | campd_emissions_df["date_without_time"] = campd_emissions_df["date"].dt.date 479 | # merge the emissions with image metadata 480 | merged_df = pd.merge( 481 | left=campd_emissions_df, 482 | right=image_metadata_df, 483 | how="inner", 484 | on=["facility_id", "date_without_time"], 485 | ) 486 | # merge the facilities with the merged emissions and image metadata 487 | merged_df = pd.merge( 488 | left=merged_df, 489 | right=campd_facilities_gdf, 490 | how="inner", 491 | on="facility_id", 492 | suffixes=("", "_to_delete"), 493 | ) 494 | # filter to the columns that we care about for model training 495 | if cog_type != "all": 496 | final_columns = MAIN_COLUMNS + ["cog_url"] 497 | else: 498 | final_columns = MAIN_COLUMNS + ["visual"] + ALL_BANDS 499 | merged_df = merged_df[final_columns] 500 | merged_df.drop_duplicates(["facility_id", "ts"], inplace=True) 501 | # make sure that it's in geopandas format 502 | merged_df = gpd.GeoDataFrame( 503 | merged_df, 504 | geometry=merged_df.geometry, 505 | crs=f"EPSG:{GLOBAL_EPSG}", 506 | ) 507 | return merged_df 508 | 509 | 510 | def clean_final_dataset(df: pd.DataFrame) -> gpd.GeoDataFrame: 511 | """ 512 | Clean the final dataset that has 
def clean_final_dataset(df: pd.DataFrame) -> gpd.GeoDataFrame:
    """
    Clean the final dataset that has the facility and image metadata, as well
    as the emissions data that we'll train models on.

    Args:
        df (pd.DataFrame):
            Final dataset that has the facility and image metadata, as well as
            the emissions data that we'll train models on

    Returns:
        gdf (gpd.GeoDataFrame):
            Cleaned final dataset that has the facility and image metadata, as
            well as the emissions data that we'll train models on
    """
    # restore the column dtypes that were lost when the dataset was saved to
    # CSV: timestamps come back as strings, geometries as WKT text
    df["ts"] = pd.to_datetime(df["ts"])
    df["geometry"] = gpd.GeoSeries.from_wkt(df["geometry"])
    return gpd.GeoDataFrame(df, geometry=df["geometry"], crs=f"EPSG:{GLOBAL_EPSG}")


def load_final_dataset(final_dataset_path: Union[str, Path]) -> gpd.GeoDataFrame:
    """
    Load the final dataset that has the facility and image metadata, as well
    as the emissions data that we'll train models on.

    Args:
        final_dataset_path (Union[str, Path]):
            Path to the final dataset

    Returns:
        gdf (gpd.GeoDataFrame):
            Final dataset that has the facility and image metadata, as well as
            the emissions data that we'll train models on
    """
    # reuse the generic load-then-clean pipeline shared by the other loaders
    return load_clean_data_df(
        data_path=final_dataset_path,
        load_func=load_csv,
        clean_func=clean_final_dataset,
    )
def view_satellite_image(image: Union[np.ndarray, torch.Tensor]) -> Figure:
    """
    View a satellite image using plotly

    Args:
        image (Union[np.ndarray, torch.Tensor]):
            The satellite image, channels-first (C, H, W)

    Returns:
        Figure:
            The plotly figure
    """
    # plotly works on numpy arrays, so torch tensors are converted first
    if isinstance(image, torch.Tensor):
        image = image.numpy()
    # reorder (C, H, W) -> (H, W, C), which is what px.imshow expects
    hwc_image = image.transpose(1, 2, 0)
    fig = px.imshow(hwc_image, zmin=0, zmax=255)
    # strip the default margins so the image fills the whole figure
    fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    return fig
float = MAX_BRIGHT_MEAN, 56 | transforms: Optional[torch.nn.Module] = None, 57 | use_local_images: bool = False, 58 | ): 59 | """ 60 | Dataset that gets images of coal power plants, their emissions 61 | and metadata. 62 | 63 | Args: 64 | gdf (gpd.GeoDataFrame): 65 | A GeoDataFrame with the following columns: 66 | - facility_id 67 | - latitude 68 | - longitude 69 | - ts 70 | - is_powered_on 71 | - cloud_cover 72 | - cog_url 73 | - geometry 74 | target (str): 75 | The target column to predict 76 | image_size (int): 77 | The size of the image in pixels 78 | max_dark_frac (float): 79 | The maximum fraction of dark pixels allowed for an image; 80 | if the image has more dark pixels than this, it is skipped 81 | max_mean_val (float): 82 | The maximum mean value allowed for an image; if the image 83 | has a higher mean value than this, it is skipped 84 | transforms (Optional[torch.nn.Module]): 85 | A PyTorch module that transforms the image 86 | use_local_images (bool): 87 | Whether to use local images instead of downloading them 88 | from the cloud 89 | """ 90 | assert len(set(MAIN_COLUMNS) - set(gdf.columns)) == 0, ( 91 | "gdf must have all columns of the following list:\n" 92 | f"{MAIN_COLUMNS}\n" 93 | f"Instead, gdf has the following columns:\n" 94 | f"{gdf.columns}" 95 | ) 96 | self.gdf = gdf 97 | self.target = target 98 | self.image_size = image_size 99 | self.max_dark_frac = max_dark_frac 100 | self.max_mean_val = max_mean_val 101 | self.transforms = transforms 102 | self.use_local_images = use_local_images 103 | if self.use_local_images: 104 | assert "local_image_path" in self.gdf.columns, ( 105 | "If use_local_images is True, gdf must have a " 106 | "local_image_path column" 107 | ) 108 | 109 | def __iter__(self): 110 | if torch.utils.data.get_worker_info(): 111 | worker_total_num = torch.utils.data.get_worker_info().num_workers 112 | worker_id = torch.utils.data.get_worker_info().id 113 | else: 114 | worker_total_num = 1 115 | worker_id = 0 116 | for idx in 
range(worker_id, len(self.gdf), worker_total_num): 117 | row = self.gdf.iloc[idx] 118 | if self.use_local_images: 119 | try: 120 | image = np.load(row.local_image_path) 121 | except TypeError as e: 122 | logger.warning( 123 | f"Could not load local image at {row.local_image_path}. " 124 | f"Original error: {e}" 125 | ) 126 | continue 127 | else: 128 | image = get_image_from_cog( 129 | cog_url=row.cog_url, geometry=row.geometry, size=self.image_size 130 | ) 131 | image = torch.from_numpy(image).float() 132 | if is_image_too_dark( 133 | image, max_dark_frac=self.max_dark_frac 134 | ) or is_image_too_bright(image, max_mean_val=self.max_mean_val): 135 | continue 136 | if self.transforms is not None: 137 | try: 138 | image = self.transforms(image).squeeze(0) 139 | except AssertionError as e: 140 | logger.warning( 141 | f"Could not transform image at {row.local_image_path}. " 142 | f"Original error: {e}" 143 | ) 144 | continue 145 | 146 | target = torch.tensor(row[self.target]).float() 147 | metadata = row.drop([self.target, "geometry", "data_set"]).to_dict() 148 | metadata["ts"] = str(metadata["ts"]) 149 | yield { 150 | "image": image, 151 | "target": target, 152 | "metadata": metadata, 153 | } 154 | 155 | 156 | class CoalEmissionsDataModule(LightningDataModule): 157 | def __init__( 158 | self, 159 | final_dataset_path: Optional[Union[str, Path]] = None, 160 | image_metadata_path: Optional[Union[str, Path]] = None, 161 | campd_facilities_path: Optional[Union[str, Path]] = None, 162 | campd_emissions_path: Optional[Union[str, Path]] = None, 163 | target: str = EMISSIONS_TARGET, 164 | image_size: int = IMAGE_SIZE_PX, 165 | crop_size: int = CROP_SIZE_PX, 166 | train_val_ratio: float = TRAIN_VAL_RATIO, 167 | test_year: int = TEST_YEAR, 168 | batch_size: int = BATCH_SIZE, 169 | max_dark_frac: float = MAX_DARK_FRAC, 170 | max_mean_val: float = MAX_BRIGHT_MEAN, 171 | max_cloud_cover_prct: int = MAX_CLOUD_COVER_PRCT, 172 | predownload_images: bool = False, 173 | 
download_missing_images: bool = False, 174 | images_dir: str = "images/", 175 | num_workers: int = 0, 176 | ): 177 | """ 178 | Lightning Data Module that gets images of coal power plants, 179 | their emissions and metadata, and splits them into train, 180 | validation and test sets. 181 | 182 | Args: 183 | image_metadata_path (Union[str, Path]): 184 | Path to image metadata data 185 | campd_facilities_path (Union[str, Path]): 186 | Path to CAMPD facilities data 187 | campd_emissions_path (Union[str, Path]): 188 | Path to CAMPD emissions data 189 | target (str): 190 | The target column to predict 191 | image_size (int): 192 | The size of the image in pixels 193 | crop_size (int): 194 | The size of the crop in pixels 195 | train_val_ratio (float): 196 | The ratio of train to validation data 197 | test_year (int): 198 | The year to use for testing 199 | batch_size (int): 200 | The batch size, i.e. the number of samples to load at once 201 | max_dark_frac (float): 202 | The maximum fraction of dark pixels allowed for an image; 203 | if the image has more dark pixels than this, it is skipped 204 | max_mean_val (float): 205 | The maximum mean value allowed for an image; if the image 206 | has a higher mean value than this, it is skipped 207 | max_cloud_cover_prct (int): 208 | The maximum cloud cover percentage allowed for an image; 209 | if the image has more cloud cover than this, it is skipped 210 | predownload_images (bool): 211 | Whether to pre-download images from the cloud or load each 212 | one on the fly 213 | download_missing_images (bool): 214 | Whether to download images that are missing from the 215 | images_dir path 216 | images_dir (str): 217 | The directory to save images to if predownload_images is True 218 | num_workers (int): 219 | The number of workers to use for loading data 220 | """ 221 | super().__init__() 222 | self.final_dataset_path = final_dataset_path 223 | self.image_metadata_path = image_metadata_path 224 | self.campd_facilities_path = 
campd_facilities_path 225 | self.campd_emissions_path = campd_emissions_path 226 | self.target = target 227 | self.image_size = image_size 228 | self.crop_size = crop_size 229 | self.train_val_ratio = train_val_ratio 230 | self.test_year = test_year 231 | self.batch_size = batch_size 232 | self.max_dark_frac = max_dark_frac 233 | self.max_mean_val = max_mean_val 234 | self.max_cloud_cover_prct = max_cloud_cover_prct 235 | self.predownload_images = predownload_images 236 | self.download_missing_images = download_missing_images 237 | self.images_dir = images_dir 238 | self.num_workers = num_workers 239 | self.emissions_quantiles = None 240 | 241 | def setup(self, stage: str): 242 | """ 243 | Split the data into train, validation and test sets. 244 | 245 | Args: 246 | stage (str): 247 | The stage of the setup 248 | """ 249 | # load the final dataset 250 | if self.final_dataset_path is not None: 251 | self.gdf = load_final_dataset(self.final_dataset_path) 252 | else: 253 | self.gdf = get_final_dataset( 254 | image_metadata_path=self.image_metadata_path, 255 | campd_facilities_path=self.campd_facilities_path, 256 | campd_emissions_path=self.campd_emissions_path, 257 | ) 258 | # filter out rows with too much cloud cover 259 | self.gdf = self.gdf[self.gdf.cloud_cover <= self.max_cloud_cover_prct] 260 | if self.predownload_images: 261 | # make sure that images are already downloaded 262 | if "local_image_path" not in self.gdf.columns: 263 | tqdm.pandas(desc="Downloading images") 264 | self.gdf["local_image_path"] = self.gdf.progress_apply( 265 | lambda row: fetch_image_path_from_cog( 266 | cog_url=row.cog_url, 267 | geometry=row.geometry, 268 | size=self.image_size, 269 | images_dir=self.images_dir, 270 | download_missing_images=self.download_missing_images, 271 | ), 272 | axis=1, 273 | ) 274 | # skip rows where the image could not be downloaded 275 | self.gdf = self.gdf[~self.gdf.local_image_path.isna()] 276 | else: 277 | # make sure that the image paths are in the right 
directory 278 | current_image_path = ( 279 | self.gdf.local_image_path.str.split("/") 280 | .str[:-1] 281 | .str.join("/") 282 | .iloc[0] 283 | ) 284 | if current_image_path != self.images_dir: 285 | self.gdf.local_image_path = self.gdf.local_image_path.str.replace( 286 | current_image_path, self.images_dir 287 | ) 288 | # split the data into train, validation and test sets 289 | facility_set_mapper = get_facility_set_mapper( 290 | self.gdf, 291 | train_val_ratio=self.train_val_ratio, 292 | ) 293 | self.gdf["data_set"] = self.gdf.apply( 294 | lambda row: split_data_in_sets( 295 | row=row, data_set_mapper=facility_set_mapper, test_year=self.test_year 296 | ), 297 | axis=1, 298 | ) 299 | self.pos_weight = self.get_pos_weight(self.gdf) 300 | if stage == "fit": 301 | self.train_dataset = CoalEmissionsDataset( 302 | gdf=self.gdf[self.gdf.data_set == "train"].sample(frac=1), 303 | target=self.target, 304 | image_size=self.image_size, 305 | transforms=get_transform(data_group="train", crop_size=self.crop_size), 306 | use_local_images=self.predownload_images, 307 | max_dark_frac=self.max_dark_frac, 308 | max_mean_val=self.max_mean_val, 309 | ) 310 | self.val_dataset = CoalEmissionsDataset( 311 | gdf=self.gdf[self.gdf.data_set == "val"].sample(frac=1), 312 | target=self.target, 313 | image_size=self.image_size, 314 | transforms=get_transform(data_group="val", crop_size=self.crop_size), 315 | use_local_images=self.predownload_images, 316 | max_dark_frac=self.max_dark_frac, 317 | max_mean_val=self.max_mean_val, 318 | ) 319 | elif stage == "test": 320 | self.test_dataset = CoalEmissionsDataset( 321 | gdf=self.gdf[self.gdf.data_set == "test"].sample(frac=1), 322 | target=self.target, 323 | image_size=self.image_size, 324 | transforms=get_transform(data_group="test", crop_size=self.crop_size), 325 | use_local_images=self.predownload_images, 326 | max_dark_frac=self.max_dark_frac, 327 | max_mean_val=self.max_mean_val, 328 | ) 329 | 330 | def get_dataloader(self, data_group: str): 
331 | # reshuffle the dataset 332 | getattr(self, f"{data_group}_dataset").gdf = getattr( 333 | self, f"{data_group}_dataset" 334 | ).gdf.sample(frac=1) 335 | # reset the dataloader 336 | return DataLoader( 337 | getattr(self, f"{data_group}_dataset"), 338 | batch_size=self.batch_size, 339 | num_workers=self.num_workers, 340 | pin_memory=True if torch.cuda.is_available() else False, 341 | ) 342 | 343 | def train_dataloader(self): 344 | return self.get_dataloader("train") 345 | 346 | def val_dataloader(self): 347 | return self.get_dataloader("val") 348 | 349 | def test_dataloader(self): 350 | return self.get_dataloader("test") 351 | 352 | def get_pos_weight(self, gdf: Optional[gpd.GeoDataFrame] = None) -> float: 353 | """ 354 | Get the positive weight for the dataset, based on class imbalance. 355 | 356 | Args: 357 | gdf (Optional[gpd.GeoDataFrame]): 358 | The dataset to use for calculating the positive weight. 359 | If None, the dataset used for training will be used. 360 | 361 | Returns: 362 | float: 363 | The positive weight 364 | """ 365 | if gdf is None: 366 | gdf = self.gdf 367 | num_positives = gdf[self.target].sum() 368 | num_negatives = len(gdf) - num_positives 369 | return num_negatives / num_positives 370 | -------------------------------------------------------------------------------- /src/coal_emissions_monitoring/ml_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple 2 | import geopandas as gpd 3 | import numpy as np 4 | import pandas as pd 5 | import torch 6 | 7 | from coal_emissions_monitoring.data_cleaning import load_clean_campd_facilities_gdf 8 | from coal_emissions_monitoring.constants import TEST_YEAR, TRAIN_VAL_RATIO 9 | 10 | 11 | def get_facility_set_mapper( 12 | gdf: gpd.GeoDataFrame, train_val_ratio: float = TRAIN_VAL_RATIO 13 | ) -> Dict[int, str]: 14 | """ 15 | Get a mapper from facility ID to a set of train or validation. 
def get_facility_set_mapper(
    gdf: gpd.GeoDataFrame, train_val_ratio: float = TRAIN_VAL_RATIO
) -> Dict[int, str]:
    """
    Get a mapper from facility ID to a set of train or validation.

    Args:
        gdf (gpd.GeoDataFrame):
            The gdf containing the facility IDs
        train_val_ratio (float):
            The ratio of training to validation data

    Returns:
        Dict[int, str]:
            A mapper from facility ID to a set of train or validation
    """
    assigned_facilities = set()
    for facility_id, facility_gdf in gdf.groupby("facility_id"):
        if facility_id in assigned_facilities:
            # already labelled via an earlier facility's overlapping AOI
            continue
        # randomly pick the set for this facility
        data_set = np.random.choice(
            ["train", "val"], p=[train_val_ratio, 1 - train_val_ratio]
        )
        gdf.loc[gdf.facility_id == facility_id, "data_set"] = data_set
        assigned_facilities.add(facility_id)
        # facilities whose geometries intersect this one must land in the
        # same set, to avoid leaking imagery between train and validation
        other_facilities_gdf = gdf.loc[
            gdf.facility_id != facility_id, ["facility_id", "geometry"]
        ].rename(columns={"facility_id": "intersecting_facility_id"})
        intersecting_gdf = gpd.sjoin(
            facility_gdf,
            other_facilities_gdf,
            how="inner",
            predicate="intersects",
        )
        for other_id in intersecting_gdf["intersecting_facility_id"].unique():
            gdf.loc[gdf.facility_id == other_id, "data_set"] = data_set
            assigned_facilities.add(other_id)
    # collapse per-row labels into one label per facility
    return gdf.groupby("facility_id").data_set.first().to_dict()


def split_data_in_sets(
    row: pd.DataFrame, data_set_mapper: Dict[int, str], test_year: int = TEST_YEAR
) -> str:
    """
    Split the data in sets. This function is meant to be used with
    pandas.DataFrame.apply (axis=1), so it receives one row at a time.

    Args:
        row (pd.DataFrame):
            The row of the DataFrame
        data_set_mapper (Dict[int, str]):
            A mapper from facility ID to a set of train or validation
        test_year (int):
            The year to use for testing

    Returns:
        str:
            The data set ("train", "val" or "test")
    """
    # the entire held-out year goes to the test set, regardless of facility
    if row.ts.year == test_year:
        return "test"
    return data_set_mapper[row.facility_id]
def emissions_to_category(
    emissions: float, quantiles: Dict[float, float], rescale: bool = False
) -> int:
    """
    Convert emissions to a category based on quantiles. The quantiles are
    calculated from the training data. Here's how the categories are defined:
    - 0: no emissions
    - 1: low emissions
    - 2: medium emissions
    - 3: high emissions
    - 4: very high emissions

    Args:
        emissions (float): emissions value
        quantiles (Dict[float, float]): quantiles to use for categorization
        rescale (bool): whether to rescale emissions to the original range,
            using the 99th quantile as the maximum value

    Returns:
        int: category
    """
    if rescale:
        # model outputs are normalized by the 99th quantile; undo that
        emissions = emissions * quantiles[0.99]
    # each elif already guarantees the bucket's lower bound, so only the
    # upper bound needs checking (the original repeated both comparisons)
    if emissions <= 0:
        return 0
    elif emissions <= quantiles[0.3]:
        return 1
    elif emissions <= quantiles[0.6]:
        return 2
    elif emissions <= quantiles[0.99]:
        return 3
    else:
        return 4
def preds_n_targets_to_categories(
    preds: torch.Tensor,
    targets: torch.Tensor,
    quantiles: Dict[float, float],
    rescale: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Convert emissions to a category based on quantiles. The quantiles are
    calculated from the training data. Here's how the categories are defined:
    - 0: no emissions
    - 1: low emissions
    - 2: medium emissions
    - 3: high emissions
    - 4: very high emissions

    Args:
        preds (torch.Tensor): emissions predictions
        targets (torch.Tensor): emissions targets
        quantiles (Dict[float, float]): quantiles to use for categorization
        rescale (bool): whether to rescale emissions to the original range,
            using the 99th quantile as the maximum value

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: tuple of predictions and targets
    """

    def to_categories(values: torch.Tensor) -> torch.Tensor:
        # map every scalar through the shared categorization logic, keeping
        # the result on the same device as the input tensor
        return torch.tensor(
            [emissions_to_category(v, quantiles, rescale=rescale) for v in values]
        ).to(values.device)

    return to_categories(preds), to_categories(targets)
class SmallCNN(torch.nn.Module):
    """
    A simple model with EfficientNet-like blocks, global pooling and a final
    linear layer, compatible with images of size 32x32.
    """

    def __init__(self, num_input_channels: int = 3, num_classes: int = 1):
        super().__init__()
        self.num_input_channels = num_input_channels
        self.num_classes = num_classes

        def conv_relu(in_ch: int, out_ch: int) -> list:
            # 3x3 same-padding convolution followed by a ReLU activation
            return [
                torch.nn.Conv2d(
                    in_channels=in_ch,
                    out_channels=out_ch,
                    kernel_size=3,
                    padding=1,
                ),
                torch.nn.ReLU(),
            ]

        # three stages of two conv+ReLU blocks each, downsampling between
        # stages, then global average pooling into a linear classifier head
        layers = (
            conv_relu(self.num_input_channels, 16)
            + conv_relu(16, 32)
            + [torch.nn.MaxPool2d(kernel_size=2)]
            + conv_relu(32, 64)
            + conv_relu(64, 64)
            + [torch.nn.MaxPool2d(kernel_size=2)]
            + conv_relu(64, 128)
            + conv_relu(128, 128)
            + [
                torch.nn.AdaptiveAvgPool2d(output_size=1),
                torch.nn.Flatten(),
                torch.nn.Linear(128, self.num_classes),
            ]
        )
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Run the network; returns logits of shape (batch, num_classes)."""
        return self.model(x)
97 | 98 | Args: 99 | preds (torch.Tensor): predictions 100 | targets (torch.Tensor): targets 101 | 102 | Returns: 103 | Dict[str, float]: metrics 104 | """ 105 | metrics = dict() 106 | # calculate the cross entropy loss 107 | metrics["loss"] = self.loss(preds, targets) 108 | # apply sigmoid to the predictions to get a value between 0 and 1 109 | preds = torch.sigmoid(preds) 110 | # calculate emissions vs no-emissions accuracy 111 | metrics["accuracy"] = ( 112 | ((preds > POSITIVE_THRESHOLD) == (targets > 0)).float().mean() 113 | ) 114 | # calculate balanced accuracy, which accounts for class imbalance 115 | metrics["balanced_accuracy"] = balanced_accuracy_score( 116 | y_pred=(preds.cpu() > POSITIVE_THRESHOLD).int(), 117 | y_true=targets.cpu().int(), 118 | ) 119 | # calculate recall and precision 120 | metrics["recall"] = torchmetrics.functional.recall( 121 | preds=preds, 122 | target=targets, 123 | average="macro", 124 | task="binary", 125 | ) 126 | metrics["precision"] = torchmetrics.functional.precision( 127 | preds=preds, 128 | target=targets, 129 | average="macro", 130 | task="binary", 131 | ) 132 | return metrics 133 | 134 | def shared_step( 135 | self, 136 | batch: Dict[str, Any], 137 | batch_idx: int, 138 | stage: str, 139 | ): 140 | if len(batch["image"].shape) == 0: 141 | # avoid iteration over a 0-d array error 142 | return dict() 143 | metrics = dict() 144 | x, y = batch["image"], batch["target"] 145 | x, y = x.float().to(self.device), y.float().to(self.device) 146 | # forward pass (calculate predictions) 147 | y_pred = self(x) 148 | # calculate metrics for the current batch 149 | metrics = self.calculate_all_metrics(preds=y_pred, targets=y) 150 | metrics = { 151 | (f"{stage}_{k}" if k != "loss" or stage != "train" else k): v 152 | for k, v in metrics.items() 153 | } 154 | # log metrics 155 | for k, v in metrics.items(): 156 | if k == "loss": 157 | self.log(k, v, on_step=True, prog_bar=True) 158 | else: 159 | self.log(k, v, on_step=False, on_epoch=True, 
prog_bar=True) 160 | return metrics 161 | 162 | def training_step(self, batch: Dict[str, Any], batch_idx: int): 163 | return self.shared_step(batch, batch_idx, stage="train") 164 | 165 | def validation_step(self, batch: Dict[str, Any], batch_idx: int): 166 | return self.shared_step(batch, batch_idx, stage="val") 167 | 168 | def test_step(self, batch: Dict[str, Any], batch_idx: int): 169 | return self.shared_step(batch, batch_idx, stage="test") 170 | 171 | def configure_optimizers(self): 172 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 173 | return { 174 | "optimizer": optimizer, 175 | "lr_scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( 176 | optimizer, mode="min", factor=0.1, patience=3 177 | ), 178 | "monitor": "val_loss", 179 | } 180 | -------------------------------------------------------------------------------- /src/coal_emissions_monitoring/satellite_imagery.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | from typing import List, Optional, Union 4 | import backoff 5 | 6 | import geopandas as gpd 7 | import numpy as np 8 | import rasterio as rio 9 | from rasterio.errors import RasterioIOError 10 | import pandas as pd 11 | import pystac_client 12 | from loguru import logger 13 | from pyproj.aoi import AreaOfInterest 14 | from pyproj.database import query_utm_crs_info 15 | from shapely.geometry.base import BaseGeometry 16 | import torch 17 | from tqdm.auto import tqdm 18 | 19 | from coal_emissions_monitoring.constants import ( 20 | ALL_BANDS, 21 | AOI_SIZE_METERS, 22 | API_URL, 23 | COLLECTION, 24 | END_DATE, 25 | GLOBAL_EPSG, 26 | IMAGE_SIZE_PX, 27 | MAX_BRIGHT_MEAN, 28 | MAX_CLOUD_COVER_PRCT, 29 | MAX_DARK_FRAC, 30 | START_DATE, 31 | ) 32 | 33 | STAC_CLIENT = pystac_client.Client.open(API_URL) 34 | 35 | 36 | def get_epsg_from_coords(latitude: float, longitude: float) -> int: 37 | """ 38 | Get the EPSG code for a specific coordinate 39 | 
def get_epsg_from_coords(latitude: float, longitude: float) -> int:
    """
    Get the EPSG code for a specific coordinate

    Args:
        latitude (float):
            The latitude of the coordinate
        longitude (float):
            The longitude of the coordinate

    Returns:
        int:
            The EPSG code for the coordinate
    """
    # query the UTM CRS database with a point-sized area of interest
    point_aoi = AreaOfInterest(
        west_lon_degree=longitude,
        south_lat_degree=latitude,
        east_lon_degree=longitude,
        north_lat_degree=latitude,
    )
    crs_candidates = query_utm_crs_info(
        datum_name="WGS 84",
        area_of_interest=point_aoi,
    )
    return int(crs_candidates[0].code)


def create_aoi_for_plants(campd_facilities_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Create a square area of interest (AOI) for each plant in the CAMPD facilities data.
    This will later be used to query for satellite imagery.

    Args:
        campd_facilities_gdf (gpd.GeoDataFrame):
            The CAMPD facilities data frame

    Returns:
        gpd.GeoDataFrame:
            A data frame containing the AOIs for each plant
    """
    buffered_dfs = list()
    for _, plant_df in tqdm(
        campd_facilities_gdf.groupby("facility_id"),
        total=campd_facilities_gdf.facility_id.nunique(),
        desc="Creating AOIs for plants",
    ):
        # buffering distances are meaningful in a metric CRS, so identify
        # the local CRS for this facility from its latitude and longitude
        local_epsg = get_epsg_from_coords(
            plant_df.latitude.mean(), plant_df.longitude.mean()
        )
        plant_df = plant_df.to_crs(epsg=local_epsg)
        # buffer the geometry into a square (cap_style=3) whose sides are
        # roughly AOI_SIZE_METERS long
        plant_df.geometry = plant_df.geometry.buffer(
            AOI_SIZE_METERS / 2, cap_style=3
        )
        # convert back to the global CRS so all AOIs share one frame
        plant_df = plant_df.to_crs(epsg=GLOBAL_EPSG)
        buffered_dfs.append(plant_df)
    return gpd.GeoDataFrame(pd.concat(buffered_dfs, ignore_index=True))
def get_aws_cog_links_from_geom(
    geometry: BaseGeometry,
    collection: str = COLLECTION,
    start_date: Optional[datetime] = START_DATE,
    end_date: Optional[datetime] = END_DATE,
    max_cloud_cover_prct: Optional[int] = MAX_CLOUD_COVER_PRCT,
    sort_by: str = "updated",
    max_items: Optional[int] = None,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Retrieve links from AWS' Sentinel 2 L2A STAC

    Args:
        geometry (BaseGeometry):
            The geometry to query for images that
            contain it in STAC
        collection (str):
            The STAC collection to query
        start_date (Optional[datetime]):
            Optional start date to filter images on
        end_date (Optional[datetime]):
            Optional end date to filter images on
        max_cloud_cover_prct (Optional[int]):
            Optional maximum cloud cover to filter
            images that are too cloudy. Expressed
            as a percentage, e.g. 1 = 1%
        sort_by (str):
            Which property to sort the results by,
            in descending order; needs to be a valid
            property in the STAC collection
        max_items (Optional[int]):
            Optional maximum number of items to
            return
        verbose (bool):
            Whether to print the progress of the
            query

    Returns:
        pd.DataFrame:
            A dataframe containing the ID of the tile and
            the links to its COGs and metadata; None when
            no items matched the search
    """
    # get the bounding box from the geometry
    bbox = geometry.bounds
    # specify the cloud filter
    cloud_filter = None
    if max_cloud_cover_prct == 0:
        cloud_filter = "eo:cloud_cover=0"
    elif max_cloud_cover_prct is not None:
        cloud_filter = f"eo:cloud_cover<={max_cloud_cover_prct}"
    # bug fix: the dates are Optional, but strftime was called on them
    # unconditionally; use STAC's open-ended ".." range syntax when a date
    # is missing
    start_str = start_date.strftime("%Y-%m-%d") if start_date is not None else ".."
    end_str = end_date.strftime("%Y-%m-%d") if end_date is not None else ".."
    # query the STAC collection(s) in a specific bounding box and search criteria
    search = STAC_CLIENT.search(
        collections=[collection],
        bbox=bbox,
        datetime=f"{start_str}/{end_str}",
        query=[cloud_filter] if cloud_filter is not None else None,
    )
    if verbose:
        logger.info(f"Found {search.matched()} items matching the search criteria")
    items = search.get_all_items()
    if max_cloud_cover_prct is not None and collection == "sentinel-s2-l2a-cogs":
        # some items had invalid cloud cover data and turned out very cloudy;
        # only works for L2A
        items_valid_cloud_filter = [
            x for x in items if x.properties["sentinel:valid_cloud_cover"]
        ]
        if verbose:
            logger.info(
                f"Removed {len(items) - len(items_valid_cloud_filter)} items for invalid cloud filters"
            )
        items = items_valid_cloud_filter
    items = sorted(items, key=lambda x: x.properties[sort_by], reverse=True)
    if max_items is not None:
        items = items[:max_items]
    # nothing left to report; callers check for None
    if len(items) == 0:
        return None
    # create a dictionary that contains the tile ID and the links to the COGs
    # and metadata
    output = dict(tile_id=[item.id for item in items])
    asset_keys = items[0].assets.keys()
    for key in asset_keys:
        output[key] = [item.assets[key].href for item in items]
    output["cloud_cover"] = [item.properties["eo:cloud_cover"] for item in items]
    output[sort_by] = [item.properties[sort_by] for item in items]
    output["ts"] = [item.properties["datetime"] for item in items]
    output = pd.DataFrame(output)
    output["ts"] = pd.to_datetime(output["ts"])
    # a tile can appear multiple times; keep the most recently sorted copy
    output.drop_duplicates(subset="ts", keep="first", inplace=True)
    output.sort_values("ts", inplace=True)
    return output
frame containing the AOIs for each plant 203 | collection (str): 204 | The STAC collection to query 205 | start_date (Optional[datetime]): 206 | Start date to filter images on 207 | end_date (Optional[datetime]): 208 | End date to filter images on 209 | max_cloud_cover_prct (Optional[int]): 210 | Maximum cloud cover to filter 211 | images that are too cloudy. Expressed 212 | as a percentage, e.g. 1 = 1% 213 | sort_by (str): 214 | Which property to sort the results by, 215 | in descending order; needs to be a valid 216 | property in the STAC collection 217 | 218 | Returns: 219 | pd.DataFrame: 220 | A dataframe containing the ID of the tile and 221 | the links to its COGs and metadata 222 | """ 223 | image_metadata_dfs = list() 224 | for facility_id, geometry in tqdm( 225 | plant_aoi_gdf.groupby("facility_id").geometry.first().items(), 226 | total=plant_aoi_gdf.facility_id.nunique(), 227 | desc="Querying STAC API", 228 | ): 229 | stac_results_df = get_aws_cog_links_from_geom( 230 | geometry=geometry, 231 | collection=collection, 232 | start_date=start_date, 233 | end_date=end_date, 234 | max_cloud_cover_prct=max_cloud_cover_prct, 235 | sort_by=sort_by, 236 | verbose=False, 237 | ) 238 | stac_results_df["facility_id"] = facility_id 239 | image_metadata_dfs.append(stac_results_df) 240 | return pd.concat(image_metadata_dfs, ignore_index=True) 241 | 242 | 243 | def pad_or_crop_to_size(image: np.ndarray, size: int = IMAGE_SIZE_PX) -> np.ndarray: 244 | """ 245 | Pad or crop an image to a specific size 246 | 247 | Args: 248 | image (np.ndarray): 249 | The image to pad or crop, with dimensions (C, H, W), 250 | where C is the number of channels, H is the height and 251 | W is the width 252 | size (int): 253 | The size to pad or crop to 254 | 255 | Returns: 256 | np.ndarray: 257 | The padded or cropped image 258 | """ 259 | if image.shape[1] > size: 260 | # crop the image 261 | image = image[:, :size, :size] 262 | elif image.shape[1] < size: 263 | # pad the image 264 | image = 
def pad_or_crop_to_size(image: np.ndarray, size: int = IMAGE_SIZE_PX) -> np.ndarray:
    """
    Pad and/or crop an image to a square of side `size`.

    Height and width are handled independently: each axis that is too
    large is cropped (keeping the top-left corner) and each axis that is
    too small is zero-padded (on the bottom/right). The previous
    implementation only inspected the height (dim 1), so a non-square
    image whose height already matched `size` kept its wrong width.

    Args:
        image (np.ndarray):
            The image to pad or crop, with dimensions (C, H, W),
            where C is the number of channels, H is the height and
            W is the width
        size (int):
            The size to pad or crop to

    Returns:
        np.ndarray:
            The padded and/or cropped image, with shape (C, size, size)
    """
    # crop any axis that is too large (no-op on axes already <= size)
    image = image[:, :size, :size]
    # pad (with zeros, np.pad's default) any axis that is too small
    pad_h = size - image.shape[1]
    pad_w = size - image.shape[2]
    if pad_h > 0 or pad_w > 0:
        image = np.pad(image, ((0, 0), (0, pad_h), (0, pad_w)))
    return image


@backoff.on_exception(backoff.expo, RasterioIOError, max_tries=3)
def get_image_from_cog(
    cog_url: str, geometry: BaseGeometry, size: int = IMAGE_SIZE_PX
) -> np.ndarray:
    """
    Get the image from a COG, clipped to the geometry.

    Retries up to 3 times (exponential backoff) on RasterioIOError.

    Args:
        cog_url (str):
            The URL to the COG
        geometry (BaseGeometry):
            The geometry to clip the image to
        size (int):
            The size to pad or crop to

    Returns:
        np.ndarray:
            The clipped image, with shape (C, size, size)
    """
    # load only the bbox of the image
    with rio.open(cog_url) as src:
        # get the bbox converted to the right coordinate reference system (crs);
        # doing all of this because geopandas has the convenient to_crs function
        crs_bbox = (
            gpd.GeoDataFrame(geometry=[geometry], crs=GLOBAL_EPSG)
            .to_crs(src.crs)
            .total_bounds
        )
        # define window in RasterIO
        window = rio.windows.from_bounds(*crs_bbox, transform=src.transform)
        # actual HTTP range request
        image = src.read(window=window)
    # make sure that the image has the shape that we want
    image = pad_or_crop_to_size(image, size=size)
    return image
def get_all_bands_image(
    cog_urls: List[str],
    geometry: BaseGeometry,
    size: int = IMAGE_SIZE_PX,
) -> np.ndarray:
    """
    Get an image that stacks all bands for a given row,
    clipped to the geometry.

    Args:
        cog_urls (List[str]):
            The URLs to the COGs
        geometry (BaseGeometry):
            The geometry to clip the image to
        size (int):
            The size to pad or crop to

    Returns:
        np.ndarray:
            The stacked image, with one channel per COG URL
    """
    bands = [
        get_image_from_cog(cog_url=url, geometry=geometry, size=size).squeeze()
        for url in cog_urls
    ]
    return np.stack(bands, axis=0)


def fetch_image_path_from_cog(
    cog_url: Union[str, List[str]],
    geometry: BaseGeometry,
    size: int = IMAGE_SIZE_PX,
    cog_type: str = "visual",
    images_dir: str = "images/",
    download_missing_images: bool = False,
) -> Union[str, None]:
    """
    Fetch the image path from a COG; if download_missing_images is True,
    the image will be downloaded if it does not exist.

    Args:
        cog_url (Union[str, List[str]]):
            The URL to the COG (or list of URLs when cog_type is "all")
        geometry (BaseGeometry):
            The geometry to clip the image to
        size (int):
            The size to pad or crop to
        cog_type (str):
            The type of COG to download. Can be either "visual" or "all".
        images_dir (str):
            The directory to save the image to
        download_missing_images (bool):
            Whether to download the image if it does not exist

    Returns:
        Union[str, None]:
            The path to the downloaded image. If the image
            doesn't exist or could not be downloaded, None is returned.

    Raises:
        ValueError: if cog_type is not "visual" or "all"
    """
    # validate cog_type up front; previously an unknown value fell
    # through both download branches and crashed later with an
    # unrelated NameError on the unbound `image`
    if cog_type not in ("visual", "all"):
        raise ValueError(
            f"Invalid cog_type: {cog_type}. Expected one of: visual, all."
        )
    if cog_type == "all":
        assert isinstance(cog_url, list) and len(cog_url) == len(ALL_BANDS), (
            "If cog_type is 'all', cog_url must be a list "
            f"of length {len(ALL_BANDS)}"
        )
        image_name = "_".join(cog_url[0].split("/")[-2:]).replace(".tif", "")
    else:
        image_name = "_".join(cog_url.split("/")[-2:]).replace(".tif", "")
    # NOTE(review): shapely coords are (x, y), i.e. (lon, lat), so these
    # names look swapped; they only affect the cached file name, and are
    # kept as-is for backward compatibility with already-saved patches
    lat, lon = geometry.centroid.coords[0]
    patch_name = f"{image_name}_{lat}_{lon}_{size}"
    image_path = os.path.join(images_dir, f"{patch_name}.npy")
    if os.path.exists(image_path):
        # image already exists in the expected location
        return str(image_path)
    if not download_missing_images:
        # image does not exist and we don't want to download it
        return None
    # download and save the image
    os.makedirs(images_dir, exist_ok=True)
    try:
        if cog_type == "visual":
            image = get_image_from_cog(cog_url=cog_url, geometry=geometry, size=size)
        else:
            image = get_all_bands_image(cog_urls=cog_url, geometry=geometry, size=size)
    except (RasterioIOError, ValueError) as e:
        # RasterioIOError: HTTP/range request failed even after retries;
        # ValueError: band stacking failed (e.g. mismatched band shapes) -
        # previously this was only caught for the "all" branch
        logger.warning(f"Failed to download image {cog_url}. Original error:\n{e}")
        return None
    np.save(image_path, image)
    return str(image_path)
def is_image_too_dark(
    image: torch.Tensor, max_dark_frac: float = MAX_DARK_FRAC
) -> bool:
    """
    Check if an image is too dark, based on the fraction of pixels that are
    black or NaN

    Args:
        image (torch.Tensor):
            The image to check, with dimensions (C, H, W),
            where C is the number of channels, H is the height and
            W is the width
        max_dark_frac (float):
            The maximum fraction of pixels that can be black or NaN

    Returns:
        bool:
            Whether the image is too dark
    """
    # a pixel counts as "dark" when its value is <= 1 or NaN
    dark_frac = ((image <= 1) | (image.isnan())).sum() / image.numel()
    return dark_frac > max_dark_frac


def is_image_too_bright(
    image: torch.Tensor, max_mean_val: float = MAX_BRIGHT_MEAN
) -> bool:
    """
    Check if the image is too bright, such as because of clouds or snow, based
    on the mean value of the image

    Args:
        image (torch.Tensor):
            The image to check, with dimensions (C, H, W),
            where C is the number of channels, H is the height and
            W is the width
        max_mean_val (float):
            The maximum mean value of the image
            (defaults to MAX_BRIGHT_MEAN)

    Returns:
        bool:
            Whether the image is too bright
    """
    # the original signature annotated the parameter with the *value*
    # MAX_BRIGHT_MEAN instead of a type, and provided no default even
    # though the docstring and the sibling is_image_too_dark imply one;
    # fixed to `float = MAX_BRIGHT_MEAN` (backward compatible)
    return image.mean() > max_mean_val
def get_transform(
    data_group: str, crop_size: int = CROP_SIZE_PX
) -> K.AugmentationSequential:
    """
    Get the transform for the given data group, i.e. train, val, or test.

    Args:
        data_group (str): data group; one of "train", "val" or "test"
        crop_size (int): crop size in pixels

    Returns:
        K.AugmentationSequential: transforms

    Raises:
        ValueError: if data_group is not one of "train", "val", "test"
    """
    if data_group == "train":
        # random crop + flips/rotations for augmentation during training
        return K.AugmentationSequential(
            K.RandomCrop(size=(crop_size, crop_size)),
            K.RandomHorizontalFlip(p=RANDOM_TRANSFORM_PROB),
            K.RandomRotation(p=RANDOM_TRANSFORM_PROB, degrees=90),
            # TODO this contrast transform is sometimes making the image too dark
            # consider fixing it if needing more regularization
            # K.RandomContrast(p=RANDOM_TRANSFORM_PROB, contrast=(0.9, 1.1)),
            data_keys=["image"],
            same_on_batch=False,
            keepdim=True,
        )
    elif data_group in ("val", "test"):
        # val and test share the same deterministic center-crop pipeline
        # (previously duplicated in two identical branches)
        return K.AugmentationSequential(
            K.CenterCrop(size=(crop_size, crop_size)),
            data_keys=["image"],
            same_on_batch=False,
            keepdim=True,
        )
    else:
        # fixed: the two adjacent literals used to concatenate without a
        # separating space, rendering as e.g. "foo.Expected one of..."
        raise ValueError(
            f"Invalid data group: {data_group}. "
            "Expected one of: train, val, test."
        )


# Resize -> center-crop 224 -> normalize with the standard ImageNet
# statistics, i.e. the usual preprocessing for ImageNet-pretrained
# EfficientNet backbones
efficientnet_transform = K.AugmentationSequential(
    K.Resize(size=(256, 256)),
    K.CenterCrop(size=(224, 224)),
    K.Normalize(
        mean=torch.tensor([0.485, 0.456, 0.406]),
        std=torch.tensor([0.229, 0.224, 0.225]),
    ),
    data_keys=["image"],
    same_on_batch=False,
    keepdim=True,
)