├── .DS_Store ├── .gitignore ├── LICENSE ├── LICENSE-data ├── README.md ├── bash_scripts ├── data_100k_128.sh ├── data_1M_128.sh └── data_1M_64.sh ├── config ├── .DS_Store ├── config_data.yaml ├── config_tiles.yaml └── ee_init.py ├── create_tiles_polygon.py ├── data_exp ├── data_exp.py ├── density_maps.py └── view_h5.py ├── ee_utils ├── .DS_Store └── ee_data.py ├── main_download.py ├── post_download.py ├── redownload.py ├── requirements.txt ├── slurm_scripts ├── slurm_create_tiles.sh ├── slurm_download_parallel.sh ├── slurm_download_seq.sh ├── slurm_redownload_parallel.sh └── slurm_temp.sh ├── stats ├── biome_labels.json ├── biome_names.json ├── biome_stats.json ├── eco_labels.json ├── realm_stats.json ├── total_area_biome.json └── total_area_eco_region.json ├── tmp.py └── utils ├── biome_data_utils.py ├── chunking_h5.py ├── convert_to_h5.py ├── normalization.py ├── splits.py └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalned/MMEarth-data/32fe297d76681cc9b1791239756f93ce027007b0/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | outputs/ 3 | 1M_v001_plots/ 4 | data/ 5 | # data_*/ 6 | tests/ 7 | tile_polygons/ 8 | data_1000/ 9 | tile_info_* 10 | tiles_* 11 | tile_info/ 12 | __pycache__/ 13 | config/__pycache__/ 14 | ee_utils/__pycache__/ 15 | utils/__pycache__/ 16 | .DS_Store 17 | ee_utils/.DS_Store 18 | config/.DS_Store 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Vishal Nedungadi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE-data: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. 
Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. 
Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. 
Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 
216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 
296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. 
Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public 379 | licenses. Notwithstanding, Creative Commons may elect to apply one of 380 | its public licenses to material it publishes and in those instances 381 | will be considered the “Licensor.” The text of the Creative Commons 382 | public licenses is dedicated to the public domain under the CC0 Public 383 | Domain Dedication. Except for the limited purpose of indicating that 384 | material is shared under a Creative Commons public license or as 385 | otherwise permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the 393 | public licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. 396 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![MMEarth-logo](https://github.com/vishalned/MMEarth-data/assets/27778126/09675b82-ff9e-43be-9160-3267b948e941) 3 | 4 | 5 | 6 | 7 | 8 | # MMEarth - Data Downloading 9 | [![Project Website](https://img.shields.io/badge/Project%20Website-8A2BE2)](https://vishalned.github.io/mmearth) 10 | [![Paper](https://img.shields.io/badge/arXiv-2405.02771-blue)](https://arxiv.org/abs/2405.02771) 11 | [![Code - Models](https://img.shields.io/badge/Code%20--%20Model-darkgreen)](https://github.com/vishalned/MMEarth-train/tree/main) 12 | 13 | 14 | This repository contains scripts to download the data presented in the paper [MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial Representation Learning](https://arxiv.org/abs/2405.02771). The scripts are used to download large scale satellite data from different sensors and satellites (Sentinel-2, Sentinel-1, ERA5 - temperature & precipitation, Aster GDEM etc) which we call modalities. The data is downloaded from [Google Earth Engine](https://earthengine.google.com/). 15 | 16 | ## 📢 Latest Updates 17 | :fire::fire::fire: Last Updated on 2024.11.07 :fire::fire::fire: 18 | - MMEarth has been added to the [TorchGeo](https://torchgeo.readthedocs.io/en/latest/api/datasets.html#mmearth) datasets class. 19 | - **Paper accepted to ECCV 2024 !!** 20 | - Updated datasets to version v001. 21 | - Dataset fix: Removed duplicates and corrected ERA5 yearly statistics. 22 | - Fixed downloading scripts. 23 | 24 | 25 | ## Table of contents 26 | 1. [Data Download](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#data-download) 27 | 2. [Data Loading](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#data-loading) 28 | 3. [Getting Started](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#getting-started) 29 | 4. 
[Data Stacks](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#data-stacks) 30 | 5. [Code Structure](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#code-structure) 31 | 6. [Slurm Execution](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#slurm-execution) 32 | 7. [Citation](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#citation) 33 | 34 | ## Data Download 35 | The MMEarth data can be downloaded using the following links. To enable easier development with multi-modal data, we also provide 2 more "taster" datasets along with the original MMEarth data. The license for the data is [CC BY 4.0](https://github.com/vishalned/MMEarth-data/blob/main/LICENSE-data). 36 | 37 | :bangbang: **UPDATE: The new Version 001 data is ready to download.** 38 | 39 | | **Dataset** | **Image Size** | **Number of Tiles** | **Dataset size** | **Data Link** | **Bash Script** | 40 | | :---: | :---: | :---: | :---: | :---: | :---: | 41 | | MMEarth | 128x128 | 1.2M | 597GB | [download](https://sid.erda.dk/sharelink/ChL1BoVEyH) | [bash](https://github.com/vishalned/MMEarth-data/blob/main/bash_scripts/data_1M_128.sh)| 42 | | MMEarth64 | 64x64 | 1.2M | 152GB | [download](https://sid.erda.dk/sharelink/bX5JzPuwJF) | [bash](https://github.com/vishalned/MMEarth-data/blob/main/bash_scripts/data_1M_64.sh)| 43 | | MMEarth100k | 128x128 | 100k | 48GB | [download](https://sid.erda.dk/sharelink/CoaUojVXzu) | [bash](https://github.com/vishalned/MMEarth-data/blob/main/bash_scripts/data_100k_128.sh)| 44 | 45 | All 3 datasets have a similar structure, as shown below: 46 | 47 | . 48 | ├── data_1M_v001/ # root data directory 49 | │ ├── data_1M_v001.h5 # h5 file containing the data 50 | │ ├── data_1M_v001_band_stats.json # json file containing information about the bands present in the h5 file for each data stack 51 | │ ├── data_1M_v001_splits.json # json file containing information for train, val, test splits 52 | │ └── data_1M_v001_tile_info.json # json file containing additional meta information of each tile that was downloaded. 53 | 54 | 55 | ## Data Loading 56 | A sample Jupyter Notebook that shows an example of loading the data using PyTorch is [here](https://github.com/vishalned/MMEarth-train/blob/main/examples/data_loader_example.ipynb). Alternatively, the dataloader has also been added to [TorchGeo](https://torchgeo.readthedocs.io/en/latest/api/datasets.html#mmearth). 57 | 58 | ## Getting Started 59 | To get started with this repository, you can install the dependencies and packages with this command: 60 | 61 | ```sh 62 | pip install -r requirements.txt 63 | ``` 64 | 65 | Once this is done, you need to set up gcloud and earthengine to make the code work. Follow the steps below: 66 | - Earthengine requires the initialization of gcloud, so install gcloud by following the instructions from [here](https://cloud.google.com/sdk/docs/install). 67 | - Setting up earthengine on your local machine: run `earthengine authenticate`. 68 | - Setting up earthengine on a remote cluster: `earthengine authenticate` often doesn't work directly, since you will get multiple links to click, and these links 69 | won't work when opened in the browser on your local machine. Hence, run `earthengine authenticate --quiet`. Follow the instructions in your terminal 70 | and everything should work. An additional step is to add the project name in every file that initializes Earth Engine with `ee.Initialize(project='$PROJECT_NAME')` (a minimal sketch is shown below).
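As a reference, here is a minimal initialization sketch (it mirrors `config/ee_init.py`; the project name is a placeholder and should be replaced with the Google Cloud project you authenticated with):

```python
import ee

# Initialize Earth Engine with your own Google Cloud project.
# 'my-ee-project' is a placeholder, not a real project used by this repository.
ee.Initialize(project='my-ee-project')

# For large download jobs, the high-volume endpoint can optionally be used:
# ee.Initialize(project='my-ee-project', opt_url='https://earthengine-highvolume.googleapis.com')
```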
71 | 72 | ## Data Stacks 73 | This repository allows downloading data from various sensors. Currently, the code is written to download the following sensors/modalities: 74 | - [Sentinel-2](https://developers.google.com/earth-engine/datasets/catalog/sentinel-2) 75 | - [Sentinel-1](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S1_GRD) 76 | - [ERA5 (Temperature and precipitation)](https://developers.google.com/earth-engine/datasets/catalog/ECMWF_ERA5_LAND_MONTHLY_AGGR) 77 | - [Aster GDEM (Elevation and Slope)](https://gee-community-catalog.org/projects/aster/) 78 | - [Dynamic world (LULC)](https://developers.google.com/earth-engine/datasets/catalog/GOOGLE_DYNAMICWORLD_V1) 79 | - [Canopy Height](https://gee-community-catalog.org/projects/canopy/) 80 | - [ESA WorldCover](https://developers.google.com/earth-engine/datasets/catalog/ESA_WorldCover_v100) 81 | 82 | 83 | 84 | ## Code Structure 85 | The data downloading happens only once you have a geojson file with all the tiles you want to download. Here, tiles represent the ROIs (or polygons) for each location you want. Once you have the tiles, the data stacks (data for each modality) are downloaded for each tile in the geojson. The data can be downloaded by following this broad structure, and each of these steps is further explained below: 86 | * creating tiles (small ROIs sampled globally) 87 | * downloading data stacks for each of the tiles 88 | * post processing of the downloaded data 89 | * redownloading (if needed) 90 | 91 | #### Creating Tiles 92 | - `create_tiles_polygon.py` is the file used to create the tiles. The corresponding config is `config/config_tiles.yaml`. For a global sample, the various sampling techniques are based on the biomes and ecoregions from the [RESOLVE ECOREGIONS](https://developers.google.com/earth-engine/datasets/catalog/RESOLVE_ECOREGIONS_2017). 93 | - In the config you can set the size of the tile in meters, along with the number of tiles to download and the sampling method (how to sample the tiles in a region). 94 | 95 | #### Downloading Data Stacks 96 | - `main_download.py` is the main script to download the data. The corresponding config is `config/config_data.yaml`. The config file contains various parameters to be set regarding the different modalities and paths. Based on the geojson file created in the previous step, this script downloads the data stacks for each tile. 97 | - The `ee_utils/ee_data.py` file contains custom functions for retrieving each modality in the data stack from GEE. It merges all these modalities into one array and exports it as a GeoTIFF file. The band information and other tile information is stored in a json file (`tile_info.json`). 98 | 99 | #### Post Processing 100 | - The `post_download.py` file performs 4 operations sequentially: 101 | - Merging multiple `tile_info.json` files (these files are created when downloading in parallel using slurm - explained further below) 102 | - Converting the GeoTIFFs to a single hdf5 file (a short sketch of inspecting this file is shown after the Redownload step below). 103 | - Obtaining statistics for each band (used for normalization purposes). 104 | - Computing the splits (train, val splits - only if needed). 105 | 106 | 107 | #### Redownload 108 | - `redownload.py` is the file that can be used to redownload any tiles that failed to download. Sometimes when downloading the data stacks, the script can skip tiles for various reasons (lack of a Sentinel-2 reference image, network issues, GEE issues). Hence, if needed, we have an option to redownload these tiles (an alternative is to just download more tiles than needed).
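After post-processing, the dataset is a single HDF5 file plus the `*_band_stats.json`, `*_splits.json`, and `*_tile_info.json` files described in the Data Download section. Below is a minimal sketch of inspecting that file with `h5py`; the directory path is a placeholder, and the dataset keys shown (`metadata`, `aster`) follow the modality names used in `data_exp/data_exp.py`.

```python
import json
import h5py

data_dir = "./data_1M_v001"  # placeholder: wherever the bash script saved the data

with h5py.File(f"{data_dir}/data_1M_v001.h5", "r") as f:
    print(list(f.keys()))                 # one dataset per modality, plus 'metadata'
    meta = f["metadata"]
    tile_id = meta[0][0].decode("utf-8")  # tile ids are stored as byte strings
    elevation_slope = f["aster"][0]       # Aster GDEM stack for the first tile
    print(tile_id, elevation_slope.shape)

# per-band statistics, used for normalization
with open(f"{data_dir}/data_1M_v001_band_stats.json") as fp:
    band_stats = json.load(fp)
```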
109 | 110 | 111 | (**NOTE**: The files are executed by making use of SLURM. More information on this is provided in the [Slurm Execution](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#slurm-execution) section.) 112 | 113 | ## Slurm Execution 114 | 115 | MMEarth-data 116 | 117 | 118 | **Downloading Data Stacks:** GEE provides a function called `getDownloadUrl()` that allows you to export images as GeoTIFF files. We extend this by merging all modalities for a single location into one image and exporting this as a single GeoTIFF file. To further speed up the data downloading, we make use of parallel processing using SLURM. The figure above gives an idea of how this is done. The tile information (tile GeoJSON) contains the location and other metadata for the N tiles we need to download. Each of 40 slurm jobs downloads N/40 tiles (we set the maximum number of jobs to 40, since this is the maximum number of concurrent requests allowed by the GEE API). 119 | 120 | To run the slurm parallel download, execute the following command: 121 | ```sh 122 | sbatch slurm_scripts/slurm_download_parallel.sh 123 | ``` 124 | 125 | 126 | ## Citation 127 | Please cite our paper if you use this code or any of the provided data. 128 | 129 | Vishal Nedungadi, Ankit Kariryaa, Stefan Oehmcke, Serge Belongie, Christian Igel, & Nico Lang (2024). MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial Representation Learning. 130 | ``` 131 | @inproceedings{nedungadi2024mmearth, 132 | title={MMEarth: Exploring multi-modal pretext tasks for geospatial representation learning}, 133 | author={Nedungadi, Vishal and Kariryaa, Ankit and Oehmcke, Stefan and Belongie, Serge and Igel, Christian and Lang, Nico}, 134 | booktitle={European Conference on Computer Vision}, 135 | pages={164--182}, 136 | year={2024}, 137 | organization={Springer} 138 | } 139 | ``` 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /bash_scripts/data_100k_128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # URLs of the files to download 4 | file_urls=( 5 | "https://sid.erda.dk/share_redirect/CoaUojVXzu/data_100k_v001_band_stats.json" 6 | "https://sid.erda.dk/share_redirect/CoaUojVXzu/data_100k_v001_tile_info.json" 7 | "https://sid.erda.dk/share_redirect/CoaUojVXzu/data_100k_v001_splits.json" 8 | "https://sid.erda.dk/share_redirect/CoaUojVXzu/data_100k_v001.h5" 9 | "https://sid.erda.dk/share_redirect/CoaUojVXzu/LICENSE-data" 10 | ) 11 | 12 | # Destination folder to save the downloaded files 13 | destination_folder="./data_100k_v001/" 14 | 15 | # Create the destination folder if it doesn't exist 16 | mkdir -p "$destination_folder" 17 | 18 | # Loop through each URL and download the corresponding file 19 | for url in "${file_urls[@]}"; do 20 | # Extract filename from URL 21 | filename=$(basename "$url") 22 | # Download the file using curl 23 | curl -o "${destination_folder}${filename}" "$url" 24 | # Check if the download was successful 25 | if [ $? -eq 0 ]; then 26 | echo "File '${filename}' downloaded successfully." 27 | else 28 | echo "Failed to download the file '${filename}'."
29 | fi 30 | done 31 | 32 | -------------------------------------------------------------------------------- /bash_scripts/data_1M_128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # URLs of the files to download 4 | file_urls=( 5 | "https://sid.erda.dk/share_redirect/ChL1BoVEyH/data_1M_v001_band_stats.json" 6 | "https://sid.erda.dk/share_redirect/ChL1BoVEyH/data_1M_v001_tile_info.json" 7 | "https://sid.erda.dk/share_redirect/ChL1BoVEyH/data_1M_v001_splits.json" 8 | "https://sid.erda.dk/share_redirect/ChL1BoVEyH/data_1M_v001.h5" 9 | "https://sid.erda.dk/share_redirect/ChL1BoVEyH/LICENSE-data" 10 | ) 11 | 12 | # Destination folder to save the downloaded files 13 | destination_folder="./data_1M_v001/" 14 | 15 | # Create the destination folder if it doesn't exist 16 | mkdir -p "$destination_folder" 17 | 18 | # Loop through each URL and download the corresponding file 19 | for url in "${file_urls[@]}"; do 20 | # Extract filename from URL 21 | filename=$(basename "$url") 22 | # Download the file using curl 23 | curl -o "${destination_folder}${filename}" "$url" 24 | # Check if the download was successful 25 | if [ $? -eq 0 ]; then 26 | echo "File '${filename}' downloaded successfully." 27 | else 28 | echo "Failed to download the file '${filename}'." 29 | fi 30 | done 31 | 32 | -------------------------------------------------------------------------------- /bash_scripts/data_1M_64.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # URLs of the files to download 4 | file_urls=( 5 | "https://sid.erda.dk/share_redirect/bX5JzPuwJF/data_1M_v001_64_band_stats.json" 6 | "https://sid.erda.dk/share_redirect/bX5JzPuwJF/data_1M_v001_64_tile_info.json" 7 | "https://sid.erda.dk/share_redirect/bX5JzPuwJF/data_1M_v001_64_splits.json" 8 | "https://sid.erda.dk/share_redirect/bX5JzPuwJF/data_1M_v001_64.h5" 9 | "https://sid.erda.dk/share_redirect/bX5JzPuwJF/LICENSE-data" 10 | ) 11 | 12 | # Destination folder to save the downloaded files 13 | destination_folder="./data_1M_v001_64/" 14 | 15 | # Create the destination folder if it doesn't exist 16 | mkdir -p "$destination_folder" 17 | 18 | # Loop through each URL and download the corresponding file 19 | for url in "${file_urls[@]}"; do 20 | # Extract filename from URL 21 | filename=$(basename "$url") 22 | # Download the file using curl 23 | curl -o "${destination_folder}${filename}" "$url" 24 | # Check if the download was successful 25 | if [ $? -eq 0 ]; then 26 | echo "File '${filename}' downloaded successfully." 27 | else 28 | echo "Failed to download the file '${filename}'." 
29 | fi 30 | done 31 | 32 | -------------------------------------------------------------------------------- /config/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalned/MMEarth-data/32fe297d76681cc9b1791239756f93ce027007b0/config/.DS_Store -------------------------------------------------------------------------------- /config/config_data.yaml: -------------------------------------------------------------------------------- 1 | 2 | # general config 3 | export_folder: /projects/dereeco/data/global-lr/data_1M_v001_era5/ # name of the main folder inside the bucket or the local folder 4 | # bucket: global-dataset # name of the bucket 5 | start_from: 0 # start from the 0th tile or start from a custom tile (useful if the script fails and you want to start from where it left off) 6 | end_at: 1000 # end at the 1000th tile or end at a custom tile (useful if the script fails and you want to start from where it left off) 7 | log: INFO # log level #DEBUG, INFO, ERROR 8 | tiles_path: '/projects/dereeco/data/global-lr/geojson_files/tiles_1M_v001.geojson' #1000 tiles 9 | tile_info_path: '/projects/dereeco/data/global-lr/data_1M_v001_era5/data' # this is the path that contains all the tile info - useful if you want to start with a new data apart from s2 10 | seed: 42 # seed for random image selection in S2 11 | 12 | 13 | # dataset config 14 | 15 | # dataset to download 16 | # THINGS TO NOTE: for this version of the code, you need to put sentinel2 first if you want to download it along with other datasets. This is because s2 is the base dataset and we use it to get the tile information. 17 | # Incase you have already downloaded s2, then you can put any dataset first, and it downloads the other datasets. The name of the dataset should be the same name as the function call. 
Here are the names of the functions: 18 | # sentinel2, sentinel1, srtm, era5 19 | # datasets: ["sentinel2", "sentinel1", "aster", "era5", "dynamic_world", "canopy_height_eth", "esa_worldcover"] 20 | datasets: ["era5"] 21 | 22 | 23 | # config for the data 24 | # make sure there is no space in the 'name' field 25 | sentinel2: 26 | name: "sentinel2" 27 | BANDS: [["B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8A", "B8", "B9", "B11", "B12", "SCL", "MSK_CLDPRB", "QA60"], 28 | ["B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8A", "B8", "B9", "B10", "B11", "B12", "QA60"]] 29 | collection: ["COPERNICUS/S2_SR_HARMONIZED", 30 | "COPERNICUS/S2_HARMONIZED"] 31 | 32 | sentinel1: 33 | name: "sentinel1" 34 | BANDS: ["VV", "VH", "HH", "HV"] # we download all the bands and both orbits 35 | collection: "COPERNICUS/S1_GRD" 36 | 37 | aster: 38 | name: "aster" 39 | BANDS: ["b1"] 40 | collection: "projects/sat-io/open-datasets/ASTER/GDEM" 41 | 42 | era5: 43 | name: "era5" 44 | BANDS: ["temperature_2m", "temperature_2m_min", "temperature_2m_max", "total_precipitation_sum"] 45 | collection: "ECMWF/ERA5_LAND/MONTHLY_AGGR" 46 | 47 | 48 | dynamic_world: 49 | name: "dynamic_world" 50 | BANDS: ["label"] 51 | collection: "GOOGLE/DYNAMICWORLD/V1" 52 | 53 | canopy_height_eth: 54 | name: "canopy_height_eth" 55 | COLLECTIONS: ["users/nlang/ETH_GlobalCanopyHeight_2020_10m_v1", "users/nlang/ETH_GlobalCanopyHeightSD_2020_10m_v1"] 56 | 57 | esa_worldcover: 58 | name: "esa_worldcover" 59 | BANDS: ["Map"] 60 | collection: "ESA/WorldCover/v100" 61 | 62 | 63 | 64 | 65 | 66 | # do not change anything below this line, the main script will automatically update these values 67 | update_geojson: False 68 | read_tile_info: False 69 | defaults: 70 | - hydra/job_logging: disabled # by default hydra has a logging config file, comment this line if you want to use that instead. That will print the logs to the console, and also save it to a file -------------------------------------------------------------------------------- /config/config_tiles.yaml: -------------------------------------------------------------------------------- 1 | 2 | biome_names_path: '/home/qbk152/vishal/global-lr/stats/biome_names.json' # path to the file containing the names of the biomes 3 | tiles_geojson_path: '/home/qbk152/vishal/global-lr/tiles_1M_v001.geojson' # path to the file containing the tiles 4 | failed_eco_regions_path: '/home/qbk152/vishal/global-lr/failed_eco_regions.txt' # path to the file containing the eco regions that failed to download 5 | 6 | # files that contain the total area of each biome and eco-region (these are precomputed) 7 | area_biome_path: '/home/qbk152/vishal/global-lr/stats/total_area_biome.json' 8 | area_eco_path: '/home/qbk152/vishal/global-lr/stats/total_area_eco_region.json' 9 | 10 | 11 | tile_size: 1300 # 1.3km 12 | num_of_images: 1400000 # always set this number more than the number of tiles required. 
This is because some tiles might fail to download 13 | num_of_biomes: 14 # number of biomes in the world (do not change) 14 | 15 | # uniform sampling type: 16 | # 0: uniform across biomes without equal sampling within each eco-region inside a biome 17 | # 1: uniform across biomes and equal sampling within each eco-region inside a biome 18 | # 2: uniform across eco-regions 19 | uniform_type: 0 20 | 21 | 22 | -------------------------------------------------------------------------------- /config/ee_init.py: -------------------------------------------------------------------------------- 1 | import ee 2 | 3 | # ee.Initialize(project='global-rl-2', opt_url='https://earthengine-highvolume.googleapis.com') 4 | ee.Initialize(project='global-rl-2') -------------------------------------------------------------------------------- /create_tiles_polygon.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A script to create a geojson with the tiles from various ecoregions or biomes of the world based on the CONFIG file 3 | ''' 4 | 5 | import hashlib 6 | import ee 7 | from datetime import datetime 8 | import json 9 | # Initialize Earth Engine 10 | ee.Initialize(project = 'global-rl-2') 11 | import hydra 12 | from omegaconf import DictConfig, OmegaConf 13 | import random 14 | 15 | 16 | 17 | @hydra.main(config_path='config', config_name='config_tiles') 18 | def main(cfg: DictConfig) -> None: 19 | print(OmegaConf.to_yaml(cfg)) 20 | 21 | eco_region = ee.FeatureCollection("RESOLVE/ECOREGIONS/2017") 22 | 23 | NUM_IMAGES = cfg.num_of_images 24 | if cfg.uniform_type == 1: 25 | print('------- Uniform across biomes --------') 26 | NUM_IMAGES_PER_BIOME = NUM_IMAGES // cfg.num_of_biomes 27 | elif cfg.uniform_type == 2: 28 | print('------- Uniform across eco-regions --------') 29 | elif cfg.uniform_type == 0: 30 | print('------- Uniform across biomes only --------') 31 | NUM_IMAGES_PER_BIOME = NUM_IMAGES // cfg.num_of_biomes 32 | area_biome = json.load(open(cfg.area_biome_path)) # key matches area_biome_path in config_tiles.yaml 33 | area_eco = json.load(open(cfg.area_eco_path)) 34 | 35 | # getting the list of biomes 36 | biome_names = json.load(open(cfg.biome_names_path)) 37 | 38 | tile_id_count = 0 # we use a simple number to keep track of the tile_id. This is just a number that is incremented by 1 for each tile 39 | 40 | tiles_list = [] 41 | failed_eco_regions = [] 42 | biomes = list(biome_names.keys())[0:-1] # skipping the last one because it is rock and ice 43 | 44 | # biome loop 45 | for j, biome in enumerate(biomes): 46 | print(f'Biome {j+1}/14: {biome}') 47 | if cfg.uniform_type == 1 or cfg.uniform_type == 0: 48 | print('Number of images per biome: ', NUM_IMAGES_PER_BIOME) 49 | print('Number of eco-regions: ', len(biome_names[biome])) 50 | 51 | # eco region loop 52 | for i, eco in enumerate(biome_names[biome]): 53 | try: 54 | eco_region_name, realm = eco[0], eco[1] 55 | print(f'Eco-region {i}/{len(biome_names[biome])}: {eco_region_name} ') 56 | if cfg.uniform_type == 1: 57 | num_of_tiles = NUM_IMAGES_PER_BIOME // len(biome_names[biome]) 58 | elif cfg.uniform_type == 2: 59 | num_of_tiles = NUM_IMAGES // 846 # 846 is the total number of eco-regions in the RESOLVE ecoregions dataset 60 | elif cfg.uniform_type == 0: 61 | num_of_tiles = int(NUM_IMAGES_PER_BIOME * (area_eco[eco_region_name] / area_biome[biome])) 62 | 63 | print('Number of tiles in the eco-region: ', num_of_tiles) 64 | 65 | # gee only allows max of 5000 features to be exported at a time.
So we need to split the eco-regions into smaller batches 66 | num_while_loops = 0 # a variable to keep track of the number of while loops we have done 67 | while num_of_tiles > 0: 68 | 69 | tiles_to_export = min(num_of_tiles, 5000) 70 | 71 | print('Tiles to export inside the while loop: ', tiles_to_export) 72 | 73 | 74 | single_region = eco_region.filter(ee.Filter.eq('ECO_NAME', eco_region_name)) 75 | 76 | # the following 2 lines is just to generate a number based on a string. This ensure that the number is the same for the same string. 77 | # we mod it by 10^5 to keep it small. 78 | coord_string = f"{i}{eco_region_name}{tiles_to_export}{num_while_loops}" 79 | seed = int(hashlib.sha256(coord_string.encode('utf-8')).hexdigest(), 16) % 10**5 80 | 81 | # adding a line to make the seed new as compared to the previous one 82 | # seed += 42 83 | random_points = ee.FeatureCollection.randomPoints(single_region, tiles_to_export, seed) 84 | 85 | tiles = random_points.map(lambda point: point.buffer(cfg.tile_size / 2).bounds()) 86 | tile_features = tiles.getInfo()['features'] 87 | 88 | for idx in range(len(tile_features)): 89 | tile_features[idx]['properties'] = { 90 | 'tile_id': f"{tile_id_count}", 91 | 'biome': biome, 92 | 'eco_region': eco_region_name, 93 | 94 | } 95 | tiles_list.append(tile_features[idx]) 96 | tile_id_count += 1 # incrementing the tile_id 97 | 98 | # shuffling the tiles_list 99 | random.shuffle(tiles_list) 100 | geojson_collection = { 101 | 'type': 'FeatureCollection', 102 | 'features': tiles_list 103 | } 104 | 105 | with open(cfg.tiles_geojson_path, 'w') as f: 106 | json.dump(geojson_collection, f) 107 | 108 | num_of_tiles -= tiles_to_export 109 | num_while_loops += 1 110 | 111 | except ee.ee_exception.EEException as e: 112 | print('Could not get this eco-region. 
Skipping...') 113 | print(e) 114 | failed_eco_regions.append(eco_region_name) 115 | continue 116 | 117 | 118 | with open(cfg.failed_eco_regions_path, 'w') as f: 119 | f.write('\n'.join(failed_eco_regions)) 120 | 121 | if __name__ == '__main__': 122 | main() 123 | 124 | -------------------------------------------------------------------------------- /data_exp/data_exp.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import json 3 | import numpy as np 4 | import h5py 5 | 6 | 7 | # tile_path = "/home/qbk152/vishal/global-lr/data/data_1M_130_tile_info.json" 8 | # tile_path = "/home/qbk152/vishal/global-lr/data/data_tmp_tile_info.json" 9 | # data_path = '/home/qbk152/vishal/global-lr/data/tmp.h5' 10 | # tile_info = json.load(open(tile_path)) 11 | 12 | def month_only(args): 13 | ''' 14 | Plot the number of tiles per month 15 | ''' 16 | 17 | tile_info = json.load(open(args.tile_info_path)) 18 | month = np.arange(1, 13) 19 | month_counts = np.zeros(12) 20 | for tile in tile_info: 21 | month_counts[int(tile_info[tile]['S2_DATE'].split('-')[1]) - 1] += 1 22 | 23 | plt.bar(month, month_counts) 24 | plt.xlabel('Month') 25 | plt.ylabel('Number of tiles') 26 | plt.title('Number of tiles per month') 27 | 28 | plt.savefig(os.path.join(args.store_path, 'month_counts.png')) 29 | 30 | plt.clf() 31 | # stats about which months in a year are present in the dataset 32 | 33 | def s2_type(args): 34 | ''' 35 | Plot the number of tiles per month per year 36 | ''' 37 | tile_info = json.load(open(args.tile_info_path)) 38 | month = np.arange(1, 12*4 + 1) 39 | month_counts_l1c = np.zeros(12*4) 40 | month_counts_l2a = np.zeros(12*4) 41 | 42 | for tile in tile_info: 43 | m = int(tile_info[tile]['S2_DATE'].split('-')[1]) 44 | y = int(tile_info[tile]['S2_DATE'].split('-')[0]) 45 | if tile_info[tile]['S2_type'] == 'l1c': 46 | month_counts_l1c[(y - 2017) * 12 + m - 1] += 1 47 | else: 48 | month_counts_l2a[(y - 2017) * 12 + m - 1] += 1 49 | 50 | years = np.arange(2017, 2021) 51 | yearly_counts_l1c = [month_counts_l1c[i:i+12] for i in range(0, len(month_counts_l1c), 12)] 52 | yearly_counts_l2a = [month_counts_l2a[i:i+12] for i in range(0, len(month_counts_l2a), 12)] 53 | 54 | 55 | 56 | month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] * 4 57 | month_labels = [f"{years[i // 12]} {month_names[i]}" for i in range(12*4)] 58 | year_colors = ['b', 'g', 'r', 'c'] 59 | 60 | 61 | # print(np.sum(yearly_counts_l1c)) 62 | 63 | # Create a bar plot for each year 64 | for i, year_count in enumerate(yearly_counts_l1c): 65 | plt.bar(np.arange(12*i + 1, 12*i + 13), year_count, label=str(years[i]), color=year_colors[i], alpha=0.7) 66 | 67 | plt.xticks(np.arange(1, 12 * 4 + 1), month_labels, rotation=90, fontsize=6) 68 | plt.xlabel('Month') 69 | plt.ylabel('Count') 70 | plt.title('Monthly Counts by Year') 71 | plt.legend(title='Year') 72 | # increase the spacing between each bar plot 73 | plt.tight_layout() 74 | plt.savefig(os.path.join(args.store_path, 'yearly_counts_l1c.png')) 75 | plt.clf() 76 | 77 | for i, year_count in enumerate(yearly_counts_l2a): 78 | plt.bar(np.arange(12*i + 1, 12*i + 13), year_count, label=str(years[i]), color=year_colors[i], alpha=0.7) 79 | 80 | plt.xticks(np.arange(1, 12 * 4 + 1), month_labels, rotation=90, fontsize=6) 81 | plt.xlabel('Month') 82 | plt.ylabel('Count') 83 | plt.title('Monthly Counts by Year') 84 | plt.legend(title='Year') 85 | # increase the spacing between each bar plot 86 | 
plt.tight_layout() 87 | plt.savefig(os.path.join(args.store_path, 'yearly_counts_l2a.png')) 88 | 89 | 90 | 91 | 92 | def month_year(args): 93 | ''' 94 | Plot the number of tiles per month per year 95 | ''' 96 | import matplotlib.cm as cm 97 | tile_info = json.load(open('/projects/dereeco/data/global-lr/data_1M_v001/data_1M_v001_tile_info.json')) 98 | month = np.arange(1, 12*4 + 1) 99 | month_counts = np.zeros(12*4) 100 | 101 | for tile in tile_info: 102 | # only choosing either l1c or l2a 103 | # if tile_info[tile]['S2_type'] == 'l1c': 104 | m = int(tile_info[tile]['S2_DATE'].split('-')[1]) 105 | y = int(tile_info[tile]['S2_DATE'].split('-')[0]) 106 | month_counts[(y - 2017) * 12 + m - 1] += 1 107 | years = np.arange(2017, 2021) 108 | yearly_counts = [month_counts[i:i+12] for i in range(0, len(month_counts), 12)] 109 | 110 | month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] * 4 111 | month_labels = [f"{years[i // 12]} {month_names[i]}" for i in range(12*4)] 112 | # month_labels = [i if 'Jan' in i else '' for i in month_labels] 113 | # we name the months 1, 5, 9 for each year 114 | for id, i in enumerate(month_labels): 115 | if 'Jan' in i: 116 | month_labels[id] = '1' 117 | elif 'May' in i: 118 | month_labels[id] = '5' 119 | elif 'Sep' in i: 120 | month_labels[id] = '9' 121 | else: 122 | month_labels[id] = '' 123 | # year_colors = ['b', 'g', 'r', 'c'] 124 | year_colors = [cm.inferno(i/4) for i in range(4)] 125 | plt.rcParams.update({'figure.figsize': (14, 10)}) 126 | # Create a bar plot for each year 127 | for i, year_count in enumerate(yearly_counts): 128 | plt.bar(np.arange(12*i + 1, 12*i + 13), year_count, label=str(years[i]), color=year_colors[i], alpha=0.7) 129 | 130 | plt.xticks(np.arange(1, 12 * 4 + 1), month_labels, fontsize=30) 131 | plt.xlabel('Date in months', labelpad=8, fontsize=30) 132 | plt.ylabel('Number of samples', labelpad=8, fontsize=30) 133 | plt.legend(loc='upper center', ncols = 4, fontsize=24) 134 | plt.yticks(fontsize=30) 135 | # limit the ylimit to 30000 136 | plt.ylim(0, 30000) 137 | # increase the spacing between each bar plot 138 | # plt.rcParams.update({'font.size': 18}) 139 | 140 | # plt.tight_layout() 141 | plt.savefig(os.path.join(args.store_path, 'yearly.png'), dpi=300, format='png', bbox_inches='tight') 142 | plt.savefig(os.path.join(args.store_path, 'yearly.pdf'), dpi=300, format='pdf', bbox_inches='tight') 143 | plt.clf() 144 | 145 | def dynamic_world(args): 146 | ''' 147 | Plot the number of pixels per class in the dynamic world dataset 148 | ''' 149 | class_names = [ 150 | "No data", 151 | "Water", 152 | "Trees", 153 | "Grass", 154 | "Flooded vegetation", 155 | "Crops", 156 | "Shrub and scrub", 157 | "Built", 158 | "Bare", 159 | "Snow and ice" 160 | ] 161 | 162 | hdf5_file = h5py.File(args.data_path, 'r') 163 | meta = hdf5_file['metadata'] 164 | dw_count = {i:0 for i in range(0, 10)} 165 | 166 | num_tiles = len(meta) 167 | for i in range(num_tiles): 168 | tile_id = meta[i][0].decode('utf-8') 169 | img = hdf5_file['dynamic_world'][i] 170 | 171 | # obtain the number of pixels in each class 172 | for j in range(10): 173 | dw_count[j] += np.sum(img == j) 174 | 175 | if i % 1000 == 0: 176 | print(f"Processed {i} tiles") 177 | 178 | 179 | plt.bar(dw_count.keys(), dw_count.values()) 180 | plt.xticks(np.arange(0, 10), class_names, rotation=90, fontsize=8) 181 | plt.subplots_adjust(bottom=0.4) 182 | 183 | plt.xlabel('Class') 184 | plt.ylabel('Number of pixels') 185 | plt.title('Number of pixels per class') 186 | 
187 | plt.savefig(os.path.join(args.store_path, 'dw_counts.png')) 188 | 189 | 190 | def esa_worldcover(args): 191 | class_names = [ 192 | 'Tree cover', 193 | 'Shrubland', 194 | 'Grassland', 195 | 'Cropland', 196 | 'Built-up', 197 | 'Bare / sparse vegetation', 198 | 'Snow and ice', 199 | 'Permanent water bodies', 200 | 'Herbaceous wetland', 201 | 'Mangroves', 202 | 'Moss and lichen' 203 | ] 204 | class_values = [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100] 205 | hdf5_file = h5py.File(args.data_path, 'r') 206 | meta = hdf5_file['metadata'] 207 | esa_count = {i:0 for i in class_values} 208 | 209 | num_tiles = len(meta) 210 | for i in range(num_tiles): 211 | tile_id = meta[i][0].decode('utf-8') 212 | img = hdf5_file['esa_worldcover'][i] 213 | 214 | # obtain the number of pixels in each class 215 | for j in class_values: 216 | esa_count[j] += np.sum(img == j) 217 | 218 | 219 | 220 | if i % 1000 == 0: 221 | print(f"Processed {i} tiles") 222 | 223 | plt.bar(esa_count.keys(), esa_count.values(), width=7) 224 | plt.xticks(class_values, class_names, rotation=90, fontsize=8) 225 | plt.subplots_adjust(bottom=0.4) 226 | 227 | plt.xlabel('Class') 228 | plt.ylabel('Number of pixels') 229 | plt.title('Number of pixels per class') 230 | 231 | plt.savefig(os.path.join(args.store_path, 'esa_counts.png')) 232 | 233 | 234 | 235 | def custom(args): 236 | ''' 237 | Trying to plot the number of tiles per grid cell for the tiles that have the date as dec 2018 238 | ''' 239 | minx, miny, maxx, maxy = -179.9863841350967, -86.78204367236995, 180.05963072575278, 83.48337010728358 240 | grid_size = 1 241 | tile_info = json.load(open(args.tile_info_path)) 242 | 243 | x = np.arange(minx, maxx + grid_size, grid_size) 244 | y = np.arange(miny, maxy + grid_size, grid_size) 245 | 246 | grid_counts = np.zeros((len(x) - 1, len(y) - 1)) 247 | count = 0 248 | 249 | for tile in tile_info: 250 | m = int(tile_info[tile]['S2_DATE'].split('-')[1]) 251 | y = int(tile_info[tile]['S2_DATE'].split('-')[0]) 252 | 253 | if m == 12 and y == 2018: 254 | lon, lat = tile_info[tile]['lon'], tile_info[tile]['lat'] 255 | 256 | 257 | x_idx = int((lon - minx) / grid_size) 258 | y_idx = int((lat - miny) / grid_size) 259 | 260 | grid_counts[x_idx][y_idx] += 1 261 | count += 1 262 | 263 | 264 | print(count) 265 | 266 | fig, ax = plt.subplots() 267 | grid_counts = np.transpose(grid_counts) 268 | 269 | im = ax.imshow(grid_counts, extent=(minx, maxx, miny, maxy), origin='lower', cmap='inferno') 270 | im.set_clim(0, 50) # this is hardcoded for grid_size = 1 271 | # im.set_clim(-2, 15) # this is hardcoded for grid_size = 0.1 272 | 273 | plt.colorbar(im, fraction=0.020, pad=0.04) 274 | 275 | 276 | # plt.show() 277 | plt.savefig(os.path.join(args.store_path, f"grid_{grid_size}.png"), dpi=300) 278 | 279 | 280 | def dw_custom(args): 281 | ''' 282 | Plot the tiles per grid for a specific dw class 283 | ''' 284 | minx, miny, maxx, maxy = -179.9863841350967, -86.78204367236995, 180.05963072575278, 83.48337010728358 285 | grid_size = 1 286 | tile_info = json.load(open(args.tile_info_path)) 287 | 288 | 289 | x = np.arange(minx, maxx + grid_size, grid_size) 290 | y = np.arange(miny, maxy + grid_size, grid_size) 291 | 292 | grid_counts = np.zeros((len(x) - 1, len(y) - 1)) 293 | count = 0 294 | 295 | hdf5_file = h5py.File(args.data_path, 'r') 296 | meta = hdf5_file['metadata'] 297 | 298 | 299 | num_tiles = len(meta) 300 | for i in range(num_tiles): 301 | tile = meta[i][0].decode('utf-8') 302 | img = hdf5_file['dynamic_world'][i] 303 | if np.sum(img == 0) > 1000: 304 | 
lon, lat = tile_info[tile]['lon'], tile_info[tile]['lat'] 305 | 306 | x_idx = int((lon - minx) / grid_size) 307 | y_idx = int((lat - miny) / grid_size) 308 | 309 | grid_counts[x_idx][y_idx] += 1 310 | count += 1 311 | 312 | 313 | print(count) 314 | 315 | fig, ax = plt.subplots() 316 | grid_counts = np.transpose(grid_counts) 317 | 318 | im = ax.imshow(grid_counts, extent=(minx, maxx, miny, maxy), origin='lower', cmap='inferno') 319 | im.set_clim(0, 50) # this is hardcoded for grid_size = 1 320 | # im.set_clim(-2, 15) # this is hardcoded for grid_size = 0.1 321 | 322 | # plt.show() 323 | plt.colorbar(im, fraction=0.020, pad=0.04) 324 | plt.savefig(os.path.join(args.store_path, f"grid_{grid_size}.png"), dpi=300) 325 | 326 | 327 | def era_stats(args): 328 | ''' 329 | Plot the distribution of temperature for each tile. 330 | ''' 331 | 332 | hdf5_file = h5py.File(args.data_path, 'r') 333 | meta = hdf5_file['metadata'] 334 | num_tiles = len(meta) 335 | data_month = [] 336 | 337 | for i in range(num_tiles): 338 | tile = meta[i][0].decode('utf-8') 339 | data = hdf5_file['era5'][i][4:8] 340 | data_month.append(data[0]) 341 | 342 | 343 | data_month = np.array(data_month) - 273.15 344 | 345 | # plot the distribution of temperature for each tile 346 | 347 | fig, ax = plt.subplots() 348 | ax.hist(data_month, bins=100) 349 | 350 | plt.xlabel('Temperature (C)') 351 | plt.ylabel('Number of tiles') 352 | plt.title('Distribution of Monthly Temperature for Each Tile') 353 | 354 | plt.savefig(os.path.join(args.store_path, 'era5_temperature.png')) 355 | 356 | 357 | def aster_stats(args): 358 | ''' 359 | Plot the distribution of elevation for each tile. 360 | ''' 361 | hdf5_file = h5py.File(args.data_path, 'r') 362 | meta = hdf5_file['metadata'] 363 | num_tiles = len(meta) 364 | 365 | bins = np.arange(-170, 6500, 10) 366 | 367 | hist_counts = np.zeros(len(bins)) 368 | min_, max_ = 100000, -100000 369 | for i in range(num_tiles): 370 | img = hdf5_file['aster'][i] 371 | # Extract the elevation band 372 | data = img[0, :, :] 373 | 374 | data = data.flatten() 375 | # if np.min(data) < min_: 376 | # min_ = np.min(data) 377 | 378 | # if np.max(data) > max_: 379 | # max_ = np.max(data) 380 | ind = np.digitize(data, bins=bins) 381 | ind = ind - 1 382 | hist_counts[ind] += 1 383 | 384 | if i % 1000 == 0: 385 | print(f"Processed {i} tiles") 386 | 387 | 388 | hdf5_file.close() 389 | 390 | # print("Min and max elevation") 391 | # print(min_, max_) 392 | # Plot the histogram with bin labels 393 | fig, ax = plt.subplots() 394 | ax.bar(bins, hist_counts, width=100) 395 | plt.xlabel('Elevation (m)') 396 | plt.ylabel('Number of pixels') 397 | plt.title('Distribution of Elevation for All Tiles') 398 | 399 | plt.savefig(os.path.join(args.store_path, 'aster_elevation.png')) 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | if __name__ == '__main__': 408 | 409 | import argparse 410 | import os 411 | parser = argparse.ArgumentParser() 412 | parser.add_argument('--data_dir', type=str, required=True) 413 | parser.add_argument('--store_path', type=str, default='/home/qbk152/vishal/MMEarth-data/1M_v001_plots/') 414 | args = parser.parse_args() 415 | 416 | 417 | name = args.data_dir.split('/')[-1] if args.data_dir[-1] != '/' else args.data_dir.split('/')[-2] 418 | args.tile_info_path = os.path.join(args.data_dir, name + '_tile_info.json') 419 | args.data_path = os.path.join(args.data_dir, name + '.h5') 420 | args.store_path = os.path.join(args.store_path, name) 421 | if not os.path.exists(args.store_path): 422 | os.makedirs(args.store_path) 
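    # A hedged sketch (not called anywhere in this script) of a per-pixel elevation
    # histogram in pure NumPy, matching the binning used in aster_stats above.
    # np.add.at increments a bin once for every pixel that lands in it, whereas a
    # fancy-indexed `+=` on an index array updates each bin at most once per call.
    import numpy as np

    def elevation_histogram(elevation_2d, bins=np.arange(-170, 6500, 10)):
        counts = np.zeros(len(bins))
        idx = np.digitize(elevation_2d.flatten(), bins=bins) - 1
        idx = np.clip(idx, 0, len(counts) - 1)      # guard against out-of-range values
        np.add.at(counts, idx, 1)                   # accumulates duplicate bin hits correctly
        return counts
    # hypothetical usage: counts = elevation_histogram(hdf5_file['aster'][0][0, :, :])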
423 | 424 | print('storing plots in', args.store_path) 425 | 426 | 427 | 428 | # month_only(args) 429 | month_year(args) 430 | # s2_type(args) 431 | # dynamic_world(args) 432 | # # era_stats() 433 | # # aster_stats() 434 | # esa_worldcover(args) 435 | 436 | 437 | 438 | -------------------------------------------------------------------------------- /data_exp/density_maps.py: -------------------------------------------------------------------------------- 1 | import geopandas as gpd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from matplotlib.colors import LinearSegmentedColormap 5 | import json 6 | # world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")) 7 | 8 | 9 | def create_density(grid_size): 10 | 11 | gdf = gpd.read_file("/projects/dereeco/data/global-lr/geojson_files/tiles_1M_v001.geojson") 12 | json_ = json.load(open("/projects/dereeco/data/global-lr/data_1M_v001/data_1M_v001_tile_info.json")) 13 | # gdf = gpd.read_file("/home/qbk152/vishal/global-lr/tile_polygons/uni_biomes_only/tiles_100k_130.geojson") 14 | grid_size = grid_size 15 | min_x, min_y, max_x, max_y = gdf.total_bounds 16 | # print(min_x, min_y, max_x, max_y) 17 | # exit() 18 | 19 | # Create grid cells 20 | x = np.arange(min_x, max_x + grid_size, grid_size) 21 | y = np.arange(min_y, max_y + grid_size, grid_size) 22 | grid_counts = np.zeros((len(x) - 1, len(y) - 1)) 23 | 24 | 25 | gdf_sindex = gdf.sindex 26 | 27 | for index, row in gdf.iterrows(): 28 | # if index % 10000 == 0: 29 | tile_center = row['geometry'].centroid 30 | possible_matches_index = list(gdf_sindex.intersection(tile_center.bounds)) 31 | name = row['tile_id'] 32 | 33 | 34 | # we only want to count the number of samples for l2a tiles 35 | try: 36 | if json_[name]['S2_type'] == "l2a": 37 | continue 38 | except: 39 | # the name is not in the json file 40 | continue 41 | 42 | for i in possible_matches_index: 43 | # ensure the possible matches are l2a tiles 44 | try: 45 | if json_[gdf.loc[i, 'tile_id']]['S2_type'] == "l2a": 46 | continue 47 | except: 48 | continue 49 | if gdf.loc[i, 'geometry'].contains(tile_center): 50 | x_idx = int((tile_center.x - min_x) / grid_size) 51 | y_idx = int((tile_center.y - min_y) / grid_size) 52 | grid_counts[x_idx][y_idx] += 1 53 | 54 | # fig, ax = plt.subplots() 55 | # grid_counts = np.transpose(grid_counts) 56 | # world.boundary.plot( 57 | # ax=ax, 58 | # color="gray", 59 | # # edgecolor="black", 60 | # linewidth=0.4 61 | # ) 62 | 63 | 64 | # cmap = plt.cm.get_cmap('inferno') 65 | 66 | # ax.set_xticks([]) 67 | # ax.set_yticks([]) 68 | 69 | # im = ax.imshow(grid_counts, extent=(min_x, max_x, min_y, max_y), origin='lower', cmap='inferno') 70 | 71 | 72 | # im.set_clim(1, 1000) 73 | # cbar = plt.colorbar(im, fraction=0.020, pad=0.04) 74 | # cbar.ax.set_ylabel('Number of samples', rotation=90, labelpad=2) 75 | # y_tick_labels = [str(i) for i in range(0, 801, 200)] 76 | # y_tick_labels.append(">1k") 77 | # cbar.ax.set_yticklabels(y_tick_labels) 78 | # plt.rcParams.update({'font.size': 6}) 79 | 80 | # # plt.show() 81 | # plt.savefig(f"/home/qbk152/vishal/global-lr/data_exp/t-grid_{grid_size}_uni_biomes.png", dpi=300, format='png') 82 | # plt.savefig(f"/home/qbk152/vishal/global-lr/data_exp/t-grid_{grid_size}_uni_biomes.pdf", dpi=300, format='pdf') 83 | from matplotlib.colors import ListedColormap 84 | fig, ax = plt.subplots() 85 | plt.rcParams.update({'font.size': 9}) 86 | plt.rcParams.update({'figure.figsize': (8, 6)}) 87 | grid_counts = np.transpose(grid_counts) 88 | # world.boundary.plot( 89 | # 
ax=ax, 90 | # color="white", 91 | # linewidth=0.5 92 | # ) 93 | 94 | # Create custom colormap 95 | colors = plt.cm.inferno(np.linspace(0, 1, 256)) 96 | colors[0] = (1, 1, 1, 0) # Set color for 0 to white (or (1,1,1,0) for transparent) 97 | custom_cmap = ListedColormap(colors) 98 | 99 | ax.set_xticks([]) 100 | ax.set_yticks([]) 101 | 102 | im = ax.imshow(grid_counts, extent=(min_x, max_x, min_y, max_y), origin='lower', cmap=custom_cmap) 103 | 104 | 105 | # Adjust color limits 106 | 107 | # im.set_clim(0, 1000) # this is hardcoded for grid_size = 1 108 | im.set_clim(0, 600) 109 | 110 | cbar = plt.colorbar(im, fraction=0.020, pad=0.04) 111 | cbar.ax.set_ylabel('Number of samples', rotation=90, labelpad=2) 112 | y_tick_labels = ["1", "100", "200", "300", "400", "500", ">600"] 113 | y_tick_loc = [1, 100, 200, 300, 400, 500, 600] 114 | cbar.set_ticks(y_tick_loc) 115 | cbar.ax.set_yticklabels(y_tick_labels) 116 | 117 | plt.savefig(f"/home/qbk152/vishal/MMEarth-data/1M_v001_plots/data_1M_v001/grid_{grid_size}_uni_biomes_whitebg_L1C.png", dpi=300, format='png', bbox_inches='tight') 118 | plt.savefig(f"/home/qbk152/vishal/MMEarth-data/1M_v001_plots/data_1M_v001/grid_{grid_size}_uni_biomes_whitebg_L1C.pdf", dpi=300, format='pdf', bbox_inches='tight') 119 | # plt.show() 120 | 121 | 122 | 123 | def create_density_custom(grid_size): 124 | file = open("data/missing_tiles_1M.csv") 125 | lines = file.readlines() 126 | 127 | minx, miny, maxx, maxy = -179.9863841350967, -86.78204367236995, 180.05963072575278, 83.48337010728358 128 | grid_size = 1 129 | 130 | x = np.arange(minx, maxx + grid_size, grid_size) 131 | y = np.arange(miny, maxy + grid_size, grid_size) 132 | 133 | grid_counts = np.zeros((len(x) - 1, len(y) - 1)) 134 | count = 0 135 | 136 | for line in lines: 137 | tile_id, lat, lon = line.split(",") 138 | lat = float(lat) 139 | lon = float(lon) 140 | 141 | x_idx = int((lon - minx) / grid_size) 142 | y_idx = int((lat - miny) / grid_size) 143 | 144 | grid_counts[x_idx][y_idx] += 1 145 | count += 1 146 | 147 | 148 | print(count) 149 | 150 | fig, ax = plt.subplots() 151 | grid_counts = np.transpose(grid_counts) 152 | 153 | im = ax.imshow(grid_counts, extent=(minx, maxx, miny, maxy), origin='lower', cmap='inferno') 154 | im.set_clim(0, 50) # this is hardcoded for grid_size = 1 155 | # im.set_clim(-2, 15) # this is hardcoded for grid_size = 0.1 156 | 157 | plt.colorbar(im, fraction=0.020, pad=0.04) 158 | 159 | 160 | # plt.show() 161 | plt.savefig(f"data_exp/missing_tiles_1M_{grid_size}.png", dpi=300) 162 | 163 | 164 | if __name__ == "__main__": 165 | create_density(1.1) 166 | # create_density(0.1) 167 | # create_density(0.01) 168 | # create_density_custom(1) 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /data_exp/view_h5.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A script to view the contents of the h5 file. 
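A minimal, standalone sketch of the kind of inspection this script automates (the file
path is a placeholder; the dataset names mirror the keys used further below):

    import h5py

    with h5py.File('/path/to/data.h5', 'r') as f:           # placeholder path
        for key in f.keys():                                 # e.g. 'sentinel2', 'sentinel1', 'metadata'
            print(key, f[key].shape, f[key].dtype)
        tile_id = f['metadata'][0][0].decode('utf-8')        # tile ids are stored as bytes
        print('first tile id:', tile_id)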
3 | ''' 4 | 5 | 6 | import h5py 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import random 10 | import os 11 | import json 12 | from matplotlib.colors import ListedColormap, BoundaryNorm 13 | 14 | 15 | 16 | h5_path = '/projects/dereeco/data/global-lr/data_1M_130_new/data_1M_130_new.h5' 17 | tile_info_path = '/home/qbk152/vishal/global-lr/data/data_1M_130_new/data_1M_130_new_tile_info.json' 18 | save_dir = '/home/qbk152/vishal/global-lr/data/visualizations/130' 19 | splits_path = '/home/qbk152/vishal/global-lr/data/data_1M_130_new/data_1M_130_new_splits.json' 20 | 21 | # display_num = 20 # number of tiles to display 22 | # display_id = 'tsmbfsngfsf_29' 23 | display_id = 'mfwsnsasfmf_485' 24 | 25 | 26 | save_tif = False 27 | 28 | 29 | 30 | def save_img(path, img, cmap = None, norm = None): 31 | if cmap is None: 32 | plt.imshow(img) 33 | else: 34 | if norm is None: 35 | plt.imshow(img, cmap=cmap) 36 | else: 37 | plt.imshow(img, cmap=cmap, norm=norm) 38 | plt.axis('off') 39 | plt.savefig(path, bbox_inches='tight', pad_inches=0) 40 | plt.close() 41 | 42 | 43 | 44 | def view_h5(h5_path): 45 | 46 | hdf5_file = h5py.File(h5_path, 'r') 47 | print('h5 KEYS: ', hdf5_file.keys()) 48 | splits = json.load(open(splits_path, 'r')) 49 | meta = hdf5_file['metadata'] 50 | 51 | num_tiles = len(meta) 52 | 53 | if display_id is not None: 54 | for i in range(num_tiles): 55 | tile_id = meta[i][0].decode('utf-8') 56 | if tile_id == display_id: 57 | dir = os.path.join(save_dir, tile_id) 58 | os.makedirs(dir, exist_ok=True) 59 | for key in hdf5_file.keys(): 60 | # if len(hdf5_file[key].shape) != 4: 61 | # continue 62 | # print('Key: ', key) 63 | img = hdf5_file[key][i] 64 | write_img(img, key, dir) 65 | else: 66 | # choose a random tile 67 | for j in range(display_num): 68 | i = random.randint(0, num_tiles - 1) 69 | tile_id = meta[i][0].decode('utf-8') 70 | print('Tile ID: ', tile_id) 71 | 72 | train = splits['train'] 73 | val = splits['val'] 74 | test = splits['test'] 75 | 76 | if i in train: 77 | print('Train, idx: ', i) 78 | elif i in val: 79 | print('Val, idx: ', i) 80 | elif i in test: 81 | print('Test, idx: ', i) 82 | 83 | 84 | if not save_tif: 85 | dir = os.path.join(save_dir, tile_id) 86 | os.makedirs(dir, exist_ok=True) 87 | for key in hdf5_file.keys(): 88 | # we only want to visualize sentinel2 for now TESTING PURPOSES 89 | # if key != 'sentinel2': 90 | # continue 91 | if len(hdf5_file[key].shape) != 4: 92 | continue 93 | # print('Key: ', key) 94 | img = hdf5_file[key][i] 95 | write_img(img, key, dir) 96 | 97 | def write_img(img, key, dir): 98 | if key == 'sentinel2': 99 | img = img[[3, 2, 1], :, :]/10000 100 | clip_val = 0.2 101 | img = np.clip(img, 0, clip_val) 102 | img = img/clip_val 103 | 104 | img = img.transpose(1, 2, 0) 105 | 106 | # plt.imsave(os.path.join(dir, 'sentinel2.png'), img) 107 | save_img(os.path.join(dir, 's2.png'), img) 108 | 109 | elif key == 'sentinel1': 110 | bands_map = {'VV': 0, 'VH': 1, 'HH': 2, 'HV': 3} 111 | orbit_map = {'asc': 0, 'desc': 4} 112 | # write each band separately 113 | for band, band_idx in bands_map.items(): 114 | for orbit, orbit_idx in orbit_map.items(): 115 | img_ = img[orbit_idx + band_idx, :, :] 116 | # print(np.min(img_), np.max(img_)) 117 | img_ = (np.clip(img_, -30, 0) + 30) / 30 118 | # print(np.min(img_), np.max(img_)) 119 | # plt.imsave(os.path.join(dir, 's1_' + band + '_' + orbit + '.png'), img_) 120 | save_img(os.path.join(dir, 's1_' + band + '_' + orbit + '.png'), img_) 121 | 122 | elif key == 'aster': 123 | # write elevation and 
slope 124 | img_ = img[0, :, :] 125 | # plt.imsave(os.path.join(dir, 'aster_elevation.png'), img_) 126 | save_img(os.path.join(dir, 'aster_elevation.png'), img_) 127 | img_ = img[1, :, :] 128 | # plt.imsave(os.path.join(dir, 'aster_slope.png'), img_) 129 | save_img(os.path.join(dir, 'aster_slope.png'), img_) 130 | 131 | elif key == 'dynamic_world': 132 | # write the label band 133 | img_ = img[0, :, :] 134 | colors = ['#000000', '#419bdf', '#397d49', '#88b053', '#7a87c6', '#e49635', '#dfc35a', '#c4281b', '#a59b8f', '#b39fe1'] 135 | norm = BoundaryNorm([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], len(colors)) 136 | 137 | cmap = ListedColormap(colors) 138 | # plt.imsave(os.path.join(dir, 'dynamic_world.png'), img_, cmap=cmap) 139 | save_img(os.path.join(dir, 'dynamic_world.png'), img_, cmap=cmap, norm=norm) 140 | 141 | 142 | elif key == 'canopy_height_eth': 143 | # write the 2 bands 144 | img_ = img[0, :, :] 145 | # plt.imsave(os.path.join(dir, 'canopy_height_height.png'), img_) 146 | save_img(os.path.join(dir, 'canopy_height_height.png'), img_) 147 | img_ = img[1, :, :] 148 | # plt.imsave(os.path.join(dir, 'canopy_height_std.png'), img_) 149 | save_img(os.path.join(dir, 'canopy_height_std.png'), img_) 150 | 151 | elif key == 'esa_worldcover': 152 | img_ = img[0, :, :] 153 | colormap = [ 154 | '#006400', # Tree cover - 10 155 | '#ffbb22', # Shrubland - 20 156 | '#ffff4c', # Grassland - 30 157 | '#f096ff', # Cropland - 40 158 | '#fa0000', # Built-up - 50 159 | '#b4b4b4', # Bare / sparse vegetation - 60 160 | '#f0f0f0', # Snow and ice - 70 161 | '#0064c8', # Permanent water bodies - 80 162 | '#0096a0', # Herbaceous wetland - 90 163 | '#00cf75', # Mangroves - 95 164 | '#fae6a0' # Moss and lichen - 100 165 | ] 166 | 167 | bounds = [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100] 168 | norm = BoundaryNorm(bounds, len(colormap)) 169 | 170 | cmap = ListedColormap(colormap) 171 | # plt.imsave(os.path.join(dir, 'esa_worldcover.png'), img_, cmap=cmap, norm=norm) 172 | 173 | save_img(os.path.join(dir, 'esa_worldcover.png'), img_, cmap=cmap, norm=norm) 174 | 175 | else: 176 | print(key, img) 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | if __name__ == '__main__': 185 | os.makedirs(save_dir, exist_ok=True) 186 | view_h5(h5_path) -------------------------------------------------------------------------------- /ee_utils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalned/MMEarth-data/32fe297d76681cc9b1791239756f93ce027007b0/ee_utils/.DS_Store -------------------------------------------------------------------------------- /ee_utils/ee_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A general class to collect the data from GEE. Each function in the class will be a different dataset, and they share common variables like the start and end date, the projection etc. 
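A hedged sketch of how this class is typically driven (mirroring main_download.py further
below; the tile feature comes from the tiles GeoJSON and cfg from config/config_data.yaml):

    for tile in gj['features']:                   # gj = the loaded tiles GeoJSON
        s = ee_set(tile, cfg)                     # all downloading happens inside __init__
        if s.no_data:
            continue                              # no usable Sentinel-2 image for this tile
        tile_info[tile['properties']['tile_id']] = update_tile_info(tile, s, None)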
3 | ''' 4 | 5 | import io 6 | import config.ee_init 7 | import ee 8 | from matplotlib import pyplot as plt 9 | import numpy as np 10 | import requests 11 | import shutil 12 | import zipfile 13 | import os 14 | import logging 15 | from utils.utils import get_points_filter, get_ee_task_list, read_json 16 | import math 17 | import random 18 | import time 19 | from numpy.lib import recfunctions as rfn 20 | import h5py 21 | from retry import retry 22 | from multiprocessing import Pool, cpu_count 23 | from datetime import datetime, timedelta 24 | import hashlib 25 | 26 | BIOME_LABELS = read_json('./stats/biome_labels.json') 27 | ECOREGION_LABELS = read_json('./stats/eco_labels.json') 28 | 29 | 30 | class ee_set: 31 | def __init__(self, tile, cfg, tile_info = None): 32 | self.tile = tile 33 | self.crs = '' 34 | self.start_date = '2017-01-01' # the start date specifies the general time period for the data. The specific date is specified in the function. We consider a 2 year period 35 | self.end_date = '2020-12-31' # the end date specifies the general time period for the data. The specific date is specified in the function. We consider a 2 year period 36 | self.s2_date = '' 37 | self.s2_imageid = '' 38 | self.id = tile['properties']['tile_id'] 39 | self.polygon = ee.Geometry.Polygon(tile['geometry']['coordinates']) 40 | self.lon = self.polygon.centroid().coordinates().get(0).getInfo() 41 | self.lat = self.polygon.centroid().coordinates().get(1).getInfo() 42 | self.biome = BIOME_LABELS[tile['properties']['biome']] 43 | self.eco_region = ECOREGION_LABELS[tile['properties']['eco_region']] 44 | self.cfg = cfg # loading the config file 45 | self.export_folder = self.cfg.export_folder 46 | self.image_set = {} 47 | self.no_data = False 48 | self.return_dict = {} # dict to store the arr returned by export_pixels NOT USED 49 | self.img_bands = {} # a dictionary that stores the bands of each dataset acquired 50 | self.era5_data = {} 51 | self.proj = None 52 | self.s2_type = None 53 | coord_string = f"{self.lat}_{self.lon}" 54 | self.seed = int(hashlib.sha256(coord_string.encode('utf-8')).hexdigest(), 16) % 10**5 55 | random.seed(self.seed) 56 | 57 | 58 | 59 | 60 | if tile_info is not None: 61 | self.s2_date = tile_info['S2_DATE'] 62 | self.crs = tile_info['CRS'] 63 | # self.s2_imageid = tile_info['S2_IMAGEID'] 64 | if self.proj is None: 65 | self.proj = ee.Projection(self.crs).atScale(10) 66 | 67 | 68 | # start series of function calls to get the data 69 | for function_name in cfg.datasets: 70 | if hasattr(self, function_name) and callable(getattr(self, function_name)): 71 | if getattr(self, function_name)() is False: 72 | logging.error(f"Function {function_name} returned None") 73 | 74 | if function_name == 'sentinel2': 75 | logging.error(f"Skipping tile {self.id}") 76 | self.no_data = True 77 | break 78 | else: 79 | logging.error(f"Function {function_name} does not exist") 80 | 81 | 82 | 83 | 84 | # merging all the images into one image - comment these lines if you want to export the images seperately 85 | if not self.no_data: 86 | merged_image = self.image_set[self.cfg.datasets[0]] 87 | for data_name, image in self.image_set.items(): 88 | if data_name == self.cfg.datasets[0]: 89 | continue 90 | 91 | if isinstance(image, dict): 92 | for extra_info, img in image.items(): 93 | if img is None: 94 | continue 95 | merged_image = ee.Image.cat([merged_image, img]) 96 | elif image is None: 97 | continue 98 | else: 99 | merged_image = ee.Image.cat([merged_image, image]) 100 | 101 | self.image_set = {} 102 | if 
tile_info is not None: 103 | self.image_set['extra'] = merged_image 104 | else: 105 | self.image_set['merged'] = merged_image 106 | 107 | 108 | if not self.no_data: 109 | start = time.time() 110 | try: 111 | self.export_local_single() 112 | except Exception as e: 113 | logging.error(f"Error exporting to local directory: {e}") 114 | self.no_data = True 115 | # self.export_local_parallel() 116 | logging.debug(f"Time taken for exporting all: {time.time() - start}") 117 | 118 | 119 | 120 | 121 | ################################################################################################################################################################################################ 122 | # THE FOLLOWING SET OF FUNCTIONS ARE FOR GETTING THE DATA FROM GEE. WRITE A NEW FUNCTION FOR EACH DATASET 123 | # MAKE SURE THE NAME OF THE FUNCTION IS THE SAME AS THE NAME OF THE DATASET IN THE CONFIG FILE 124 | # FOR EACH FUNCTION YOU RETURN A DICTIONARY WITH THE NAME OF THE DATASET AS THE KEY AND THE IMAGE AS THE VALUE 125 | ################################################################################################################################################################################################ 126 | def sentinel2(self, cld_threshold = 10): 127 | ''' 128 | This function gets the sentinel2 data for the tile. The function searches for the least cloudy image in the time period and returns that image. To ensure 129 | that the image covers the entire tile, we use a points filter that only selects the images that have the bottom right and top left points of the tile. 130 | 131 | S2 is used as the base image, and hence we get the date and projection from this image. The bands are selected from the config file. 132 | 133 | ''' 134 | start = time.time() 135 | 136 | cfg = self.cfg.sentinel2 137 | data_name = cfg.name 138 | bands_l2a = list(cfg.BANDS[0]) 139 | bands_l1c = list(cfg.BANDS[1]) 140 | collection_l2a = cfg.collection[0] 141 | collection_l1c = cfg.collection[1] 142 | 143 | 144 | rnd_year = random.randint(2017, 2020) 145 | if rnd_year == 2018: 146 | # we only go up to november 2018 since l2a is global from dec 2018 147 | s_date = f"{rnd_year}-01-01" 148 | e_date = f"{rnd_year}-11-30" 149 | elif rnd_year == 2017 or rnd_year == 2020: 150 | s_date = f"{rnd_year}-01-01" 151 | e_date = f"{rnd_year}-12-31" 152 | elif rnd_year == 2019: 153 | # we also include dec 2018 154 | s_date = f"{rnd_year - 1}-12-01" 155 | e_date = f"{rnd_year}-12-31" 156 | 157 | # random.seed(self.seed) 158 | if random.randint(0, 1) == 0: 159 | S2 = ee.ImageCollection(collection_l2a)\ 160 | .filterBounds(self.polygon)\ 161 | .filterDate(f"{s_date}", f"{e_date}")\ 162 | .filterMetadata('CLOUDY_PIXEL_PERCENTAGE', 'less_than', cld_threshold) 163 | self.s2_type = 'l2a' 164 | 165 | if S2.size().getInfo() == 0: 166 | S2 = ee.ImageCollection(collection_l1c)\ 167 | .filterBounds(self.polygon)\ 168 | .filterDate(f"{s_date}", f"{e_date}")\ 169 | .filterMetadata('CLOUDY_PIXEL_PERCENTAGE', 'less_than', cld_threshold) 170 | self.s2_type = 'l1c' 171 | else: 172 | S2 = ee.ImageCollection(collection_l1c)\ 173 | .filterBounds(self.polygon)\ 174 | .filterDate(f"{s_date}", f"{e_date}")\ 175 | .filterMetadata('CLOUDY_PIXEL_PERCENTAGE', 'less_than', cld_threshold) 176 | self.s2_type = 'l1c' 177 | 178 | # points_filter = get_points_filter(self.polygon, buffer_size = -200) 179 | # filtered_images = S2.filter(points_filter) 180 | 181 | filtered_images = S2.filter(ee.Filter.contains('.geo', self.polygon.buffer(200))) 182 | 183 | 
num_filtered_images = filtered_images.size().getInfo() 184 | if num_filtered_images == 0: 185 | logging.error('\t No sentinel2 image found for both l1c and l2a') 186 | return False 187 | img_list = filtered_images.toList(filtered_images.size()) 188 | random_number = random.randint(0, num_filtered_images - 1) 189 | sampled_image_full = ee.Image(img_list.get(random_number)) 190 | # Select the desired bands and clip the image 191 | if self.s2_type == 'l2a': 192 | if "MSK_CLDPRB" in sampled_image_full.bandNames().getInfo(): 193 | sampled_image = sampled_image_full.select(bands_l2a).clip(self.polygon).float() 194 | else: 195 | new_bands = [band for band in bands_l2a if band != 'MSK_CLDPRB'] 196 | bands_l2a = new_bands 197 | sampled_image = sampled_image_full.select(new_bands).clip(self.polygon).float() 198 | else: 199 | sampled_image = sampled_image_full.select(bands_l1c).clip(self.polygon).float() 200 | 201 | 202 | try: 203 | self.s2_date = sampled_image.date().format('YYYY-MM-dd').getInfo() 204 | except ee.ee_exception.EEException: 205 | tmp = ee.Image(img_list.get(random_number)) 206 | logging.error(f"type: {self.s2_type}, num images in collection: {num_filtered_images}") 207 | logging.error(f"bands: {tmp.bandNames().getInfo()}") 208 | 209 | tmp = sampled_image.select('B4') 210 | self.proj = tmp.projection() 211 | self.crs = self.proj.getInfo()['crs'] 212 | 213 | logging.debug(f"\t ID: {self.id}\ 214 | \nBiome name: {self.tile['properties']['biome']}\ 215 | \nEco-region name: {self.tile['properties']['eco_region']}\ 216 | \nDate: {self.s2_date}\ 217 | \nProjection: {self.crs}\ 218 | \nLat: {self.lat} Lon: {self.lon}\ 219 | \nPolygon: {self.polygon.getInfo()['coordinates']}\ 220 | \nS2 type:{self.s2_type}"\ 221 | ) 222 | if self.s2_type == 'l2a': 223 | scl = sampled_image.select(['SCL', 'QA60']).reproject(self.proj) 224 | sampled_image = sampled_image.select([band for band in bands_l2a if band not in ['SCL', 'QA60']]).resample('bilinear').reproject(self.proj) 225 | sampled_image = sampled_image.addBands(scl) 226 | else: 227 | qa60 = sampled_image.select('QA60').reproject(self.proj) 228 | sampled_image = sampled_image.select([band for band in bands_l1c if band != 'QA60']).resample('bilinear').reproject(self.proj) 229 | sampled_image = sampled_image.addBands(qa60) 230 | self.image_set[data_name] = sampled_image 231 | self.img_bands[data_name] = sampled_image.bandNames().getInfo() 232 | logging.debug('\t Sentinel2 image loaded') 233 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 234 | 235 | 236 | 237 | 238 | def sentinel1(self): 239 | ''' 240 | This function gets the sentinel1 data for the tile. We already know the date and projection from the sentinel2 image, so we use that to get the sentinel1 image. 241 | If the image on that date is not available, we use the closest available image. We use the VV and VH bands from the image for both ascending and descending orbits. 
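        For reference, the ascending/descending images collected here are later packed into a
        fixed 8-channel array (see export_pixels below and view_h5.py); a small sketch of that
        assumed layout:

            import numpy as np

            S1_CHANNELS = {('asc', 'VV'): 0, ('asc', 'VH'): 1, ('asc', 'HH'): 2, ('asc', 'HV'): 3,
                           ('desc', 'VV'): 4, ('desc', 'VH'): 5, ('desc', 'HH'): 6, ('desc', 'HV'): 7}

            s1 = np.full((8, 128, 128), np.nan)   # polarisations/orbits that are missing stay NaN
            # s1[S1_CHANNELS[('asc', 'VV')]] = vv_ascending   # hypothetical 128x128 array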
242 | 
243 |         '''
244 |         cfg = self.cfg.sentinel1  # config entry for sentinel1 (assumed to be defined in config_data.yaml like the other datasets)
245 |         start = time.time()
246 |         data_name = cfg.name  # the name used as the image_set key and the export name
247 |         img = (ee.ImageCollection('COPERNICUS/S1_GRD')
248 |                 .filterDate(self.start_date, self.end_date) # gets images in the specified date range
249 |                 .filterBounds(self.polygon) # gets images that have some overlap with the tile
250 |                 .filter(ee.Filter.contains('.geo', self.polygon.buffer(200))) # gets images containing the tile plus some buffer
251 |                 .map(lambda image: image.clip(self.polygon)) # crops to tile
252 |                 .filterMetadata('instrumentMode', 'equals', 'IW') # selects for the interferometric wide swath mode
253 |                 .map(lambda image: image.set('date_difference', image.date().difference(self.s2_date, 'day').abs())) # calculate days off from S2 image
254 |                 .sort('date_difference')) # sort in ascending order by days off
255 | 
256 | 
257 |         # getting the ascending and descending images
258 |         img_asc = img.filterMetadata('orbitProperties_pass', 'equals', 'ASCENDING').first()
259 |         img_desc = img.filterMetadata('orbitProperties_pass', 'equals', 'DESCENDING').first()
260 | 
261 | 
262 | 
263 |         # selecting the bands, dropping the incidence angle band if present
264 |         try:
265 |             bands_asc = img_asc.bandNames().getInfo()
266 |             if 'angle' in bands_asc:
267 |                 bands_asc.remove('angle')
268 |         except ee.ee_exception.EEException:
269 |             logging.debug('\t No ascending image found')
270 |             img_asc = None
271 |         try:
272 |             bands_desc = img_desc.bandNames().getInfo()
273 |             if 'angle' in bands_desc:
274 |                 bands_desc.remove('angle')
275 |         except ee.ee_exception.EEException:
276 |             logging.debug('\t No descending image found')
277 |             img_desc = None
278 | 
279 |         # keep only the remaining bands (the angle band has already been removed above)
280 |         img_asc = img_asc.select(bands_asc).float() if img_asc is not None else None
281 |         img_desc = img_desc.select(bands_desc).float() if img_desc is not None else None
282 | 
283 |         # resampling the image
284 |         if img_asc is not None:
285 |             img_asc = img_asc.resample('bilinear').reproject(self.proj)
286 |         if img_desc is not None:
287 |             img_desc = img_desc.resample('bilinear').reproject(self.proj)
288 | 
289 |         self.image_set[data_name] = {}
290 |         self.image_set[data_name]['asc'] = img_asc
291 |         self.image_set[data_name]['desc'] = img_desc
292 | 
293 | 
294 |         self.img_bands[data_name + '_asc'] = bands_asc if img_asc is not None else None
295 |         self.img_bands[data_name + '_desc'] = bands_desc if img_desc is not None else None
296 | 
297 |         logging.debug('\t Sentinel1 image loaded')
298 |         logging.debug(f"Time taken for {data_name}: {time.time() - start}")
299 | 
300 | 
301 | 
302 |     def aster(self):
303 |         '''
304 |         This function gets the elevation data for the tile. The data consists of the elevation band; we also compute the slope from it and return both.
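        As a plain-NumPy illustration of the same idea (a sketch only, not what ee.Terrain.slope
        does internally), slope can be derived from an elevation grid with a 10 m pixel spacing,
        matching the export scale used in this repo:

            import numpy as np

            def slope_degrees(elevation, pixel_size=10.0):
                dz_dy, dz_dx = np.gradient(elevation, pixel_size)   # metres of rise per metre
                return np.degrees(np.arctan(np.hypot(dz_dx, dz_dy)))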
305 | ''' 306 | start = time.time() 307 | cfg = self.cfg.aster # getting the config for aster elevation data 308 | data_name = cfg.name # the name used to save the image in the image_set dictionary and the export name 309 | bands = list(cfg.BANDS) # the bands to be selected from the image 310 | 311 | elevation = ee.Image(cfg.collection).clip(self.polygon).select(bands).float() 312 | slope = ee.Terrain.slope(elevation) 313 | merge = ee.Image.cat([elevation, slope]) 314 | 315 | 316 | 317 | # self.image_set[data_name]['elevation'] = elevation 318 | # self.image_set[data_name]['slope'] = slope 319 | merge = merge.resample('bilinear').reproject(self.proj) 320 | self.image_set[data_name] = merge 321 | self.img_bands[data_name] = merge.bandNames().getInfo() 322 | 323 | logging.debug('\t elevation and slope image loaded') 324 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 325 | 326 | 327 | def era5(self): 328 | ''' 329 | This function gets the ERA5 data for the tile. The ERA5 is only computed until the mid of 2020, hence we compute the same stats but for the period of 2018 - 2019. As per the world 330 | temperature stats, the average temperature from 2018 - 2021 was roughly the same. We compute 3 sets of stats. 1 for the current month, 1 for the previous month, and 1 for the full year. 331 | ''' 332 | 333 | start = time.time() 334 | 335 | cfg = self.cfg.era5 # getting the config for era5 336 | data_name = cfg.name 337 | bands = list(cfg.BANDS) 338 | 339 | parts = self.s2_date.split('-') 340 | year = int(parts[0]) 341 | month = int(parts[1]) 342 | 343 | end_date = (datetime(year, month, 1) + timedelta(days=32)).replace(day=1) - timedelta(days=1) 344 | end_date = end_date.strftime('%Y-%m-%d') 345 | if month > 1: 346 | month = month - 1 347 | start_date = f"{year}-{month}-01" 348 | else: 349 | month = 12 350 | start_date = f"{year - 1}-{month}-01" 351 | 352 | # getting 2 months in one image collection 353 | ERA5_monthly = ee.ImageCollection(cfg.collection)\ 354 | .filterDate(start_date, end_date)\ 355 | .map(lambda image: image.clip(self.polygon))\ 356 | .select(bands)\ 357 | .toBands() 358 | 359 | 360 | # getting the year in one image collection - we get exactly one year of stats including the current month. 
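        # A hedged, purely illustrative sketch of the same window arithmetic using datetime only,
        # which keeps months zero-padded in the resulting ISO strings (the f-strings above can
        # yield e.g. "2019-3-01"); it is not wired into this function:
        #
        #   yr, mo = map(int, self.s2_date.split('-')[:2])
        #   anchor = datetime(yr, mo, 1)
        #   month_end   = (anchor + timedelta(days=32)).replace(day=1) - timedelta(days=1)
        #   month_start = (anchor - timedelta(days=1)).replace(day=1)      # first day of previous month
        #   year_start  = anchor.replace(year=anchor.year - 1)             # same month, one year back
        #   monthly_window = (month_start.strftime('%Y-%m-%d'), month_end.strftime('%Y-%m-%d'))
        #   yearly_window  = (year_start.strftime('%Y-%m-%d'), month_end.strftime('%Y-%m-%d'))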
361 | year, month, _ = map(int, self.s2_date.split('-')) 362 | 363 | # Calculate start_date and end_date for the year 364 | # we subtract 1 from the year to get the previous year 365 | start_date = f"{year - 1}-{month}-01" 366 | end_date = (datetime(year, month, 1) + timedelta(days=32)).replace(day=1) - timedelta(days=1) 367 | end_date = end_date.strftime('%Y-%m-%d') 368 | 369 | 370 | ERA5_yearly = ee.ImageCollection(cfg.collection)\ 371 | .filterDate(start_date, end_date)\ 372 | .map(lambda image: image.clip(self.polygon)) 373 | 374 | 375 | def compute_yearly(bandName, imageCollection): 376 | if 'min' in bandName: 377 | reducer = ee.Reducer.min() 378 | elif 'max' in bandName: 379 | reducer = ee.Reducer.max() 380 | elif 'total' in bandName: 381 | reducer = ee.Reducer.sum() 382 | else: 383 | reducer = ee.Reducer.mean() 384 | yearly = imageCollection.select(bandName).reduce(reducer) 385 | return yearly 386 | 387 | 388 | ERA5_yearly_image = ee.ImageCollection([compute_yearly(band, ERA5_yearly) for band in bands]).toBands().float() 389 | ERA5_combined = ee.Image.cat([ERA5_monthly, ERA5_yearly_image]) 390 | 391 | # if for some reason you wish to export them seperately - uncomment the following lines 392 | # self.image_set[data_name] = {} 393 | # self.image_set[data_name]['month1'] = ERA5_month1 394 | # self.image_set[data_name]['month2'] = ERA5_month2 395 | # self.image_set[data_name]['year'] = ERA5_yearly_image 396 | 397 | center_pixels = ERA5_combined.reduceRegion( 398 | reducer=ee.Reducer.mean(), 399 | geometry=self.polygon, 400 | scale=10 401 | ) 402 | center_pixels = center_pixels.getInfo() 403 | 404 | band_names = ERA5_combined.bandNames().getInfo() 405 | 406 | self.era5_data['month1'] = [center_pixels[band] for band in band_names[:4]] 407 | self.era5_data['month2'] = [center_pixels[band] for band in band_names[4:8]] 408 | self.era5_data['year'] = [center_pixels[band] for band in band_names[8:]] 409 | 410 | 411 | # ERA5_combined = ERA5_combined.reproject(self.proj) 412 | # self.image_set[data_name] = ERA5_combined 413 | 414 | self.img_bands[data_name] = band_names 415 | 416 | 417 | logging.debug('\t ERA5 image loaded') 418 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 419 | 420 | 421 | def dynamic_world(self): 422 | ''' 423 | This function gets the dynamic world data for the tile. The dynamic world data is a collection of images with the same name as the sentinel 2 image for that tile. It consist of 9 classes, we add one more to indicate missing 424 | information. The classes are as follows: 425 | 0: No data 426 | 1: Water 427 | 2: Trees 428 | 3: Grass 429 | 4: Flooded vegetation 430 | 5: Crops 431 | 6: Shrub and scrub 432 | 7: Built 433 | 8: Bare 434 | 9: Snow and ice 435 | 436 | We choose the label band since that contains which of these labels were chosen. 
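        The remapping below simply shifts the native Dynamic World ids (0-8) up by one so that 0
        can be reserved for missing pixels. As a rough NumPy sketch of the same idea (the NaN
        handling is an assumption for illustration, not taken from this code):

            import numpy as np

            def shift_dw_labels(label):
                label = np.asarray(label, dtype=np.float32)          # allow NaN for masked pixels
                shifted = np.where(np.isnan(label), 0, label + 1)    # 0..8 -> 1..9, missing -> 0
                return shifted.astype(np.uint8)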
437 | ''' 438 | start = time.time() 439 | cfg = self.cfg.dynamic_world 440 | data_name = cfg.name 441 | bands = list(cfg.BANDS) 442 | 443 | year = self.s2_date.split('-')[0] 444 | start_date = f"{year}-01-01" 445 | end_date = f"{year}-12-31" 446 | dw_ic = ee.ImageCollection(cfg.collection)\ 447 | .filterBounds(self.polygon)\ 448 | .filterDate(start_date, end_date)\ 449 | .select(bands) 450 | 451 | 452 | 453 | def reclasify(image): 454 | label = image.select('label') 455 | label2 = label\ 456 | .where(image.eq(0), 1)\ 457 | .where(image.eq(1), 2)\ 458 | .where(image.eq(2), 3)\ 459 | .where(image.eq(3), 4)\ 460 | .where(image.eq(4), 5)\ 461 | .where(image.eq(5), 6)\ 462 | .where(image.eq(6), 7)\ 463 | .where(image.eq(7), 8)\ 464 | .where(image.eq(8), 9)\ 465 | .where(image.eq(9), 10) 466 | 467 | # replacing the label band with the new label band 468 | image = image.addBands(label2.rename('label2')) 469 | image = image.select('label2') 470 | image = image.rename('label') 471 | return image 472 | 473 | dw_ic = dw_ic.map(reclasify) 474 | dw_image = dw_ic.mode().clip(self.polygon) 475 | 476 | bands = dw_image.bandNames().getInfo() 477 | 478 | 479 | if len(bands) == 0: 480 | logging.debug('\t No dynamic world image found') 481 | self.image_set[data_name] = None 482 | else: 483 | dw_image = dw_image.reproject(self.proj) 484 | self.image_set[data_name] = dw_image 485 | self.img_bands[data_name] = dw_image.bandNames().getInfo() 486 | logging.debug('\t Dynamic world image loaded') 487 | 488 | 489 | 490 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 491 | 492 | def canopy_height_eth(self): 493 | ''' 494 | This function gets the ETH canopy height and standard deviation from the year 2020. 495 | ''' 496 | start = time.time() 497 | cfg = self.cfg.canopy_height_eth # getting the config for canopy_height_eth 498 | data_name = cfg.name # the name used to save the image in the image_set dictionary and the export name 499 | collections = list(cfg.COLLECTIONS) # the collections with single bands that will be used 500 | 501 | height = ee.Image(collections[0]).clip(self.polygon).float() 502 | std = ee.Image(collections[1]).clip(self.polygon).float() 503 | merge = ee.Image.cat([height, std]) 504 | 505 | merge = merge.resample('bilinear').reproject(self.proj) 506 | merge = merge.rename(['height', 'std']) 507 | self.image_set[data_name] = merge 508 | self.img_bands[data_name] = merge.bandNames().getInfo() 509 | 510 | logging.debug('\t ETH canopy height and std loaded') 511 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 512 | 513 | def esa_worldcover(self): 514 | ''' 515 | This function gets the esa worldcover data for the tile. 
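        The class codes follow the ESA WorldCover legend (the same values and names used in
        data_exp.py); for quick reference:

            ESA_WORLDCOVER_CLASSES = {
                10: 'Tree cover', 20: 'Shrubland', 30: 'Grassland', 40: 'Cropland',
                50: 'Built-up', 60: 'Bare / sparse vegetation', 70: 'Snow and ice',
                80: 'Permanent water bodies', 90: 'Herbaceous wetland',
                95: 'Mangroves', 100: 'Moss and lichen',
            }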
516 | ''' 517 | 518 | start = time.time() 519 | cfg = self.cfg.esa_worldcover # getting the config for esa_worldcover 520 | data_name = cfg.name # the name used to save the image in the image_set dictionary and the export name 521 | bands = list(cfg.BANDS) # the bands to be selected from the image 522 | 523 | dataset = ee.ImageCollection(cfg.collection).first().clip(self.polygon).select(bands) 524 | 525 | dataset = dataset.reproject(self.proj) 526 | 527 | self.image_set[data_name] = dataset 528 | self.img_bands[data_name] = dataset.bandNames().getInfo() 529 | 530 | logging.debug('\t esa worldcover loaded') 531 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | ################################################################################################################################################################################################ 541 | # THE FOLLOWING SET OF FUNCTIONS ARE FOR EXPORTING THE ABOVE DATA TO GCS OR LOCAL DIRECTORY 542 | ################################################################################################################################################################################################ 543 | 544 | def export_local_parallel(self): 545 | ''' 546 | A function that exports the data to the local directory. To make it parallel, we create the number of processes equal to the 547 | number of datasets. Each process will export one dataset. 548 | ''' 549 | start = time.time() 550 | # Create a process pool with a limited number of processes 551 | num_processes = min(len(self.image_set), cpu_count()) 552 | with Pool(num_processes) as pool: 553 | args_list = [(data_name, image) for data_name, image in self.image_set.items()] 554 | pool.starmap(self.export_local, args_list) 555 | logging.debug(f"Time taken for exporting all: {time.time() - start}") 556 | logging.info(f"Exported all images for {self.id}") 557 | 558 | 559 | @retry(tries=10, delay=1, backoff=2) 560 | def export_local(self, data_name, image): 561 | # data_name, image = args_list 562 | os.makedirs(f"{self.export_folder}/{data_name}", exist_ok=True) 563 | if isinstance(image, dict): 564 | for extra_info, img in image.items(): 565 | if img is None: 566 | continue 567 | start = time.time() 568 | url = img.getDownloadUrl({ 569 | 'name': f"{data_name}_{extra_info}_{self.id}", 570 | 'scale': 10, 571 | 'crs': self.crs, 572 | 'region': self.polygon.getInfo()['coordinates'], 573 | 'format': 'GeoTIFF', 574 | 'bands': img.bandNames().getInfo() 575 | }) 576 | logging.debug(f"time taken for getting url: {time.time() - start}") 577 | 578 | r = requests.get(url, stream=True, verify=False) 579 | if r.status_code == 200: 580 | with open(f"{self.export_folder}/{data_name}/{self.id}_{extra_info}.tif", 'wb') as f: 581 | f.write(r.content) 582 | logging.debug(f"Downloaded {data_name} to local directory") 583 | else: 584 | logging.debug(f"Error downloading {data_name} to local directory") 585 | return 586 | if image is None: 587 | return 588 | start = time.time() 589 | url = image.getDownloadUrl({ 590 | 'name': f"{data_name}_{self.id}", 591 | 'scale': 10, 592 | 'crs': self.crs, 593 | 'region': self.polygon.getInfo()['coordinates'], 594 | 'format': 'GeoTIFF', 595 | 'bands': image.bandNames().getInfo() 596 | }) 597 | logging.debug(f"time taken for getting url: {time.time() - start}") 598 | 599 | 600 | r = requests.get(url, stream=True, verify=False) 601 | 602 | 603 | if r.status_code == 200: 604 | with 
open(f"{self.export_folder}/{data_name}/{self.id}.tif", 'wb') as f: 605 | f.write(r.content) 606 | logging.debug(f"Downloaded {data_name} to local directory") 607 | else: 608 | logging.debug(f"Error downloading {data_name} to local directory") 609 | return 610 | 611 | @retry(tries=10, delay=1, backoff=2) 612 | def export_local_single(self): 613 | for data_name, image in self.image_set.items(): 614 | os.makedirs(f"{self.export_folder}/{data_name}", exist_ok=True) 615 | if isinstance(image, dict): 616 | for extra_info, img in image.items(): 617 | if img is None: 618 | continue 619 | start = time.time() 620 | url = img.getDownloadUrl({ 621 | 'name': f"{data_name}_{extra_info}_{self.id}", 622 | 'scale': 10, 623 | 'crs': self.crs, 624 | 'region': self.polygon.getInfo()['coordinates'], 625 | 'format': 'GeoTIFF', 626 | 'bands': img.bandNames().getInfo() 627 | }) 628 | logging.debug(f"time taken for getting url: {time.time() - start}") 629 | start = time.time() 630 | r = requests.get(url, stream=True, verify=False) 631 | if r.status_code == 200: 632 | with open(f"{self.export_folder}/{data_name}/{self.id}_{extra_info}.tif", 'wb') as f: 633 | f.write(r.content) 634 | logging.debug(f"Downloaded {data_name} to local directory") 635 | logging.debug(f"time taken for downloading: {time.time() - start}") 636 | 637 | else: 638 | logging.debug(f"Error downloading {data_name} to local directory") 639 | return 640 | if image is None: 641 | return 642 | start = time.time() 643 | url = image.getDownloadUrl({ 644 | 'name': f"{data_name}_{self.id}", 645 | 'scale': 10, 646 | 'crs': self.crs, 647 | 'region': self.polygon.getInfo()['coordinates'], 648 | 'format': 'GeoTIFF', 649 | 'bands': image.bandNames().getInfo() 650 | }) 651 | logging.debug(f"time taken for getting url: {time.time() - start}") 652 | start = time.time() 653 | r = requests.get(url, stream=True, verify=False) 654 | 655 | 656 | if r.status_code == 200: 657 | with open(f"{self.export_folder}/{data_name}/{self.id}.tif", 'wb') as f: 658 | f.write(r.content) 659 | logging.debug(f"Downloaded {data_name} to local directory") 660 | logging.debug(f"time taken for downloading: {time.time() - start}") 661 | else: 662 | logging.debug(f"Error downloading {data_name} to local directory") 663 | return 664 | 665 | 666 | def export(self): 667 | ''' 668 | Export the images to GCS. 
Sometimes the dictionary has a sub dictionary, for example sentinel1 has ascending and descending orbits, hence we create a sub dictionary for each orbit 669 | ''' 670 | logging.debug('\t ---- Exporting images to GCS ----') 671 | for data_name, image in self.image_set.items(): 672 | if isinstance(image, dict): 673 | for extra_info, img in image.items(): 674 | if img is None: 675 | continue 676 | task = ee.batch.Export.image.toCloudStorage( 677 | image = img, 678 | description = f"{data_name}_{extra_info}_{self.id}", 679 | bucket = self.cfg.bucket, 680 | fileNamePrefix = self.export_folder + '/' + data_name + '/' + self.id + '_' + extra_info , 681 | region = self.polygon.getInfo()['coordinates'], 682 | scale = 10, 683 | crs = self.crs, 684 | maxPixels = 1e13 685 | ) 686 | task.start() 687 | logging.debug(f"\t Exporting {data_name} to GCS") 688 | # print(task.status()) 689 | continue 690 | task = ee.batch.Export.image.toCloudStorage( 691 | image = image, 692 | description = f"{data_name}_{self.id}", 693 | bucket = self.cfg.bucket, 694 | fileNamePrefix = self.export_folder + '/' + data_name + '/' + self.id, 695 | region = self.polygon.getInfo()['coordinates'], 696 | scale = 10, 697 | crs = self.crs, 698 | maxPixels = 1e13 699 | ) 700 | task.start() 701 | logging.debug(f"\t Exporting {data_name} to GCS") 702 | 703 | # print(task.status()) 704 | 705 | 706 | @retry(tries=10, delay=1, backoff=2) 707 | def download_and_process_image(self, image, crs): 708 | url = image.getDownloadUrl({ 709 | 'bands': image.bandNames().getInfo(), 710 | 'region': self.polygon.getInfo()['coordinates'], 711 | 'scale': 10, 712 | 'format': 'NPY'}) 713 | r = requests.get(url) 714 | np_geotiff = np.load(io.BytesIO(r.content)) 715 | 716 | # np_geotiff = np.load(io.BytesIO(geotiff)) 717 | # cropping the image to 128x128 718 | new_shape = (128, 128) 719 | old_shape = np_geotiff.shape 720 | start_x = (old_shape[0] - new_shape[0]) // 2 721 | start_y = (old_shape[1] - new_shape[1]) // 2 722 | np_geotiff = np_geotiff[start_x:start_x + new_shape[0], start_y:start_y + new_shape[1]] 723 | 724 | arr = rfn.structured_to_unstructured(np_geotiff[list(np_geotiff.dtype.names)]) 725 | arr = arr.transpose(2, 0, 1) 726 | return arr, np_geotiff.dtype.names 727 | 728 | def export_pixels(self): 729 | ''' 730 | This function exports to the local directory using the computePixels functions. With this code, I am trying to store the np arrays directly into an HDF5 file. 
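        A minimal, standalone h5py sketch of that last step (dataset name, file name and array
        shape below are placeholders, not the layout produced by utils/convert_to_h5.py):

            import h5py
            import numpy as np

            arr = np.zeros((12, 128, 128), dtype=np.float32)         # one tile worth of data
            with h5py.File('tiles.h5', 'a') as f:
                if 'sentinel2' not in f:                             # growable dataset, one row per tile
                    f.create_dataset('sentinel2', shape=(0,) + arr.shape,
                                     maxshape=(None,) + arr.shape,
                                     dtype=arr.dtype, chunks=(1,) + arr.shape)
                ds = f['sentinel2']
                ds.resize(ds.shape[0] + 1, axis=0)
                ds[-1] = arr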
731 | ''' 732 | 733 | # Initialize dictionaries and arrays 734 | self.return_dict = {} 735 | 736 | for data_name, image in self.image_set.items(): 737 | if isinstance(image, dict): 738 | if data_name == 'sentinel1': 739 | arr = np.full((8, 128, 128), np.nan) 740 | elif data_name == 'era5': 741 | arr = np.full((12, 128, 128), np.nan) 742 | c = 0 743 | 744 | for extra_info, img in image.items(): 745 | if img is None: 746 | continue 747 | arr_t, bands_downloaded = self.download_and_process_image(img, self.crs) 748 | if data_name == 'sentinel1': 749 | if extra_info == 'asc': 750 | if 'VV' in bands_downloaded: 751 | arr[0] = arr_t[bands_downloaded.index('VV')] 752 | if 'VH' in bands_downloaded: 753 | arr[1] = arr_t[bands_downloaded.index('VH')] 754 | if 'HH' in bands_downloaded: 755 | arr[2] = arr_t[bands_downloaded.index('HH')] 756 | if 'HV' in bands_downloaded: 757 | arr[3] = arr_t[bands_downloaded.index('HV')] 758 | elif extra_info == 'desc': 759 | if 'VV' in bands_downloaded: 760 | arr[4] = arr_t[bands_downloaded.index('VV')] 761 | if 'VH' in bands_downloaded: 762 | arr[5] = arr_t[bands_downloaded.index('VH')] 763 | if 'HH' in bands_downloaded: 764 | arr[6] = arr_t[bands_downloaded.index('HH')] 765 | if 'HV' in bands_downloaded: 766 | arr[7] = arr_t[bands_downloaded.index('HV')] 767 | 768 | if data_name == 'era5': 769 | arr[c:c+4] = arr_t 770 | c += 4 771 | self.return_dict[data_name] = arr 772 | else: 773 | arr, bands_downloaded = self.download_and_process_image(image, self.crs) 774 | self.return_dict[data_name] = arr 775 | 776 | 777 | 778 | 779 | 780 | 781 | # def export_pixels(self): 782 | # ''' 783 | # This function also exports to the local directory. It is good for small files, and holds band information. With this code, i am trying to store the np arrays directly into a hdf5 file. 
784 | 785 | # ''' 786 | 787 | 788 | # for data_name, image in self.image_set.items(): 789 | # if isinstance(image, dict): 790 | # if data_name == 'sentinel1': 791 | # arr = np.full((8, 128, 128), np.nan) 792 | # elif data_name == 'era5': 793 | # arr = np.full((12, 128, 128), np.nan) 794 | # c = 0 795 | 796 | # for extra_info, img in image.items(): 797 | # if img is None: 798 | # continue 799 | # img = img.resample('bicubic').reproject(crs=self.crs, scale=10) 800 | # proj = ee.Projection(self.crs).atScale(10).getInfo() 801 | 802 | # request = { 803 | # 'expression': image, 804 | # 'fileFormat': 'NPY', 805 | # 'grid': { 806 | # 'affineTransform': { 807 | # 'scaleX': 10, 808 | # 'shearX': 0, 809 | # 'shearY': 0, 810 | # 'scaleY': -10, 811 | # }, 812 | # 'crsCode': proj['crs'], 813 | # }, 814 | 815 | # } 816 | # REQUEST = dict(request) 817 | # geotiff = ee.data.computePixels(REQUEST) 818 | # np_geotiff = np.load(io.BytesIO(geotiff)) 819 | 820 | # # cropping the image to 128x128 821 | # new_shape = (128, 128) 822 | # old_shape = np_geotiff.shape 823 | # start_x = (old_shape[0] - new_shape[0]) // 2 824 | # start_y = (old_shape[1] - new_shape[1]) // 2 825 | # np_geotiff = np_geotiff[start_x:start_x + new_shape[0], start_y:start_y + new_shape[1]] 826 | 827 | # arr_t = np_geotiff.view(np.float32).reshape((-1,) + np_geotiff.shape) 828 | # if data_name == 'sentinel1': 829 | # bands = np_geotiff.dtype.names 830 | 831 | # if extra_info == 'asc': 832 | # # put the bands in the correct order [VV, VH, HH, HV], if they are present 833 | # if 'VV' in bands: 834 | # arr[0] = arr_t['VV'] 835 | # if 'VH' in bands: 836 | # arr[1] = arr_t['VH'] 837 | # if 'HH' in bands: 838 | # arr[2] = arr_t['HH'] 839 | # if 'HV' in bands: 840 | # arr[3] = arr_t['HV'] 841 | # elif extra_info == 'desc': 842 | # if 'VV' in bands: 843 | # arr[4] = arr_t['VV'] 844 | # if 'VH' in bands: 845 | # arr[5] = arr_t['VH'] 846 | # if 'HH' in bands: 847 | # arr[6] = arr_t['HH'] 848 | # if 'HV' in bands: 849 | # arr[7] = arr_t['HV'] 850 | # if data_name == 'era5': 851 | # arr[c:c+4] = arr_t 852 | # c += 4 853 | 854 | # self.return_dict[data_name] = arr 855 | 856 | 857 | # image = image.resample('bicubic').reproject(crs=self.crs, scale=10) 858 | # proj = ee.Projection(self.crs).atScale(10).getInfo() 859 | 860 | # request = { 861 | # 'expression': image, 862 | # 'fileFormat': 'NPY', 863 | # 'grid': { 864 | # 'affineTransform': { 865 | # 'scaleX': 10, 866 | # 'shearX': 0, 867 | # 'shearY': 0, 868 | # 'scaleY': -10, 869 | # }, 870 | # 'crsCode': proj['crs'], 871 | # }, 872 | 873 | # } 874 | 875 | # REQUEST = dict(request) 876 | # geotiff = ee.data.computePixels(REQUEST) 877 | # np_geotiff = np.load(io.BytesIO(geotiff)) 878 | 879 | # # cropping the image to 128x128 880 | # new_shape = (128, 128) 881 | # old_shape = np_geotiff.shape 882 | # start_x = (old_shape[0] - new_shape[0]) // 2 883 | # start_y = (old_shape[1] - new_shape[1]) // 2 884 | # np_geotiff = np_geotiff[start_x:start_x + new_shape[0], start_y:start_y + new_shape[1]] 885 | 886 | # arr = np_geotiff.view(np.float32).reshape((-1,) + np_geotiff.shape) 887 | # # display_array = rfn.structured_to_unstructured(np_geotiff[['B4', 'B3', 'B2']])/10000 888 | # # plt.imshow(display_array) 889 | # # plt.show() 890 | # self.return_dict[data_name] = arr 891 | 892 | 893 | # exit() 894 | # # writing the np to geoTIFF 895 | # from osgeo import gdal 896 | # import pyproj 897 | # from osgeo import osr 898 | 899 | # upp_left_coords = self.polygon.bounds().coordinates().get(0).getInfo()[3] 900 | 901 | # 
source_proj = pyproj.Proj(proj='latlong', datum='WGS84') 902 | # target_proj = pyproj.Proj(init=proj['crs']) 903 | 904 | # print(upp_left_coords) 905 | # print(proj['crs']) 906 | 907 | # upp_left_x, upp_left_y = pyproj.transform(source_proj, target_proj, upp_left_coords[0], upp_left_coords[1]) 908 | # tranform_var = (upp_left_x, 10, 0, upp_left_y, 0, -10) 909 | # driver = gdal.GetDriverByName('GTiff') 910 | # GDT_dtype = gdal.GDT_Float32 911 | # rows, cols = np_geotiff.shape[0], np_geotiff.shape[1] 912 | # band_num = len(np_geotiff.dtype.names) 913 | # outRaster = driver.Create('/Users/qbk152/Desktop/codes/global-LR/gdal-test.tif', cols, rows, band_num, GDT_dtype) 914 | 915 | # outRaster.SetGeoTransform(tranform_var) 916 | # for b in range(band_num): 917 | # outband = outRaster.GetRasterBand(b + 1) 918 | # outband.WriteArray(np_geotiff[np_geotiff.dtype.names[b]]) 919 | 920 | 921 | # outRasterSRS = osr.SpatialReference() 922 | # outRasterSRS.ImportFromEPSG(int(proj['crs'].split(':')[1])) 923 | # outRaster.SetProjection(outRasterSRS.ExportToWkt()) 924 | 925 | # outband.FlushCache() 926 | 927 | 928 | 929 | 930 | 931 | 932 | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 | 944 | 945 | 946 | 947 | 948 | 949 | -------------------------------------------------------------------------------- /main_download.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The main function to download all the data from GEE 3 | 4 | ''' 5 | 6 | 7 | import os 8 | import config.ee_init 9 | import ee 10 | import numpy as np 11 | import geojson 12 | import hydra 13 | from omegaconf import DictConfig, OmegaConf 14 | 15 | from ee_utils.ee_data import ee_set 16 | from utils.utils import read_geojson, update_tile_info 17 | import logging 18 | import h5py 19 | import time 20 | import warnings 21 | import json 22 | warnings.filterwarnings("ignore") 23 | 24 | 25 | 26 | 27 | @hydra.main(config_path='config', config_name='config_data') 28 | def main(cfg: DictConfig) -> None: 29 | print(OmegaConf.to_yaml(cfg)) 30 | 31 | 32 | # setting up the logger. Logs both to the console and to a file inside outputs/ date/ time. This is created by hydra 33 | numeric_level = getattr(logging, cfg.log.upper(), None) 34 | if not isinstance(numeric_level, int): 35 | raise ValueError('Invalid log level: %s' % cfg.log) 36 | logging.basicConfig( 37 | # filename='log.txt', # comment this line if you want to log to console only 38 | level=numeric_level, 39 | format='%(levelname)s : %(message)s', 40 | filemode='w' 41 | ) 42 | 43 | # reading the geojson file 44 | gj = read_geojson(cfg.tiles_path) 45 | datasets = cfg.datasets 46 | cfg.update_geojson = True 47 | 48 | # if sentinel2 is in the datasets, then we need to update the geojson file to include the date of the image, crs, and other details. 49 | # if it is not present, this means that we are downloading other datasets, and hence need to use the geojson data instead of updating 50 | # cfg.update_geojson = 'sentinel2' in datasets 51 | cfg.read_tile_info = not 'sentinel2' in datasets # if we are downloading things other than s2, we need to read the tile information from the geojson. 52 | 53 | tile_info_dict = {} 54 | if cfg.read_tile_info: 55 | logging.info('Reading tile information from geojson. Please note that any errors with the tiles.geojson, implies that you did not download Sentinel 2 yet. Please download Sentinel 2 first. 
') 56 | # tile_info = json.load(open(cfg.tile_info_path, 'r')) 57 | tile_info = json.load(open('/projects/dereeco/data/global-lr/data_1M_v001/data_1M_v001_tile_info.json', 'r')) 58 | else: 59 | tile_info = None 60 | 61 | 62 | i = cfg.start_from 63 | end = min(cfg.end_at, len(gj['features'])) 64 | 65 | start = time.time() 66 | 67 | while i < end: 68 | start_ = time.time() 69 | logging.info(f'####################### Processing tile [{i}/{len(gj["features"])}] #######################') 70 | tile = gj['features'][i] 71 | id = tile['properties']['tile_id'] 72 | if cfg.read_tile_info and id not in tile_info.keys(): 73 | # this is not in tile info, hence the s2 has not been downloaded yet. so we skip this tile 74 | logging.info(f"Tile {id} not in tile_info. Skipping") 75 | i += 1 76 | continue 77 | # creating the ee_set object, the function calls are inside the constructor, hence it will automatically download the data 78 | if cfg.read_tile_info: 79 | ee_set_ = ee_set(tile, cfg, tile_info=tile_info[id]) 80 | else: 81 | ee_set_ = ee_set(tile, cfg) 82 | logging.debug(f"Time taken for 1 tile: {time.time() - start_}") 83 | if cfg.update_geojson and not ee_set_.no_data: 84 | tile_info_dict[id] = update_tile_info(tile, ee_set_, tile_info[id] if tile_info is not None else None) 85 | os.makedirs(f"{cfg.tile_info_path}", exist_ok=True) 86 | with open(f"{cfg.tile_info_path}/tile_info_{cfg.start_from}_{cfg.end_at}.json", 'w') as f: 87 | geojson.dump(tile_info_dict, f) 88 | elif ee_set_.no_data: 89 | logging.info(f"no sentinel2 data for this tile. Skipping") 90 | gj['features'].pop(i) 91 | i -= 1 92 | 93 | 94 | 95 | i += 1 96 | # break # we only want to download one tile for now 97 | logging.info(f"TOTAL TIME TAKEN: {time.time() - start}") 98 | logging.info(f"AVG TIME TAKEN: {(time.time() - start)/(end - cfg.start_from)}") 99 | 100 | if __name__ == "__main__": 101 | main() 102 | 103 | 104 | -------------------------------------------------------------------------------- /post_download.py: -------------------------------------------------------------------------------- 1 | # a file to call other functions after the download is complete (post download) 2 | 3 | 4 | from utils.utils import merge_dicts 5 | from utils.normalization import compute_band_stats 6 | from utils.splits import create_splits 7 | import os 8 | import argparse 9 | 10 | def main(args): 11 | ''' 12 | A function to call other functions after the download is complete. 
13 | merges all the tile_info files into a single file (these are temporary files created by the slurm jobs) 14 | converts the downloaded data to h5 format 15 | computes the band stats 16 | creates the splits (train and valid only) 17 | ''' 18 | 19 | # print('Merging the tile_info files for all slurm jobs into a single file') 20 | # out_path = os.path.join(args.data_dir, args.data_dir.split('/')[-1] + '_tile_info.json') if args.data_dir[-1] != '/' else os.path.join(args.data_dir, args.data_dir.split('/')[-2] + '_tile_info.json') 21 | # in_path = os.path.join(args.data_dir, 'tile_info') 22 | # merge_dicts(in_path, out_path) 23 | 24 | # print('converting to h5') 25 | # os.system(f'python -u utils/convert_to_h5.py --mode create --data_dir {args.data_dir}') 26 | 27 | print('computing band stats') 28 | compute_band_stats(data_folder = args.data_dir) 29 | 30 | # print('computing splits') 31 | # create_splits(data_folder = args.data_dir) 32 | 33 | 34 | 35 | 36 | 37 | 38 | if __name__ == '__main__': 39 | parser = argparse.ArgumentParser() 40 | # provide the name of the output folder, by default the path of the output json is the name followed by _tile_info.json 41 | parser.add_argument('--data_dir', type=str, help='path to the output folder', required=True) 42 | args = parser.parse_args() 43 | main(args) 44 | -------------------------------------------------------------------------------- /redownload.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import glob 4 | import argparse 5 | 6 | 7 | # set the following values based on the slurm script. 8 | # num_of_tiles = 1300000 9 | # num_of_jobs = 40 10 | # num_tiles_per_job = 33000 11 | # tile_info_path = 'data/tile_info/*.json' 12 | 13 | def main(args): 14 | # num_tiles_per_job = num_of_tiles // num_of_jobs 15 | num_of_tiles = args.num_of_tiles 16 | num_of_jobs = args.num_of_jobs 17 | num_tiles_per_job = args.num_tiles_per_job 18 | tile_info_path = args.tile_info_path 19 | 20 | 21 | tile_info_files = glob.glob(tile_info_path) 22 | 23 | start = [] 24 | stop = [] 25 | total_files = 0 26 | 27 | for i in range(num_tiles_per_job, num_of_tiles+1, num_tiles_per_job): 28 | files = [] 29 | for f in tile_info_files: 30 | # get all the files with the last number equal to i. 
This gets all the files 31 | # processed in that job or in subsequent redownloads 32 | if (f.split('.')[0].split('_')[-1] == str(i)): 33 | files.append(f) 34 | 35 | # read each file and append the count 36 | count = 0 37 | for f in files: 38 | print(f) 39 | count += len(json.load(open(f, 'r')).keys()) 40 | print(f"Number of tiles processed: {count}") 41 | start.append(i-num_tiles_per_job + count) 42 | stop.append(i) 43 | total_files += num_tiles_per_job - count 44 | 45 | # print(i + count - 1, i - 1)cl 46 | 47 | 48 | 49 | # break 50 | for i in range(len(start)): 51 | print(f"Job {i} : {start[i]} to {stop[i]}") 52 | 53 | print(f"Total number of files to download: {total_files}") 54 | 55 | # write the start and stop to a file 56 | with open('start_stop_redownload.txt', 'w') as f: 57 | for i in range(len(start)): 58 | f.write(f"{start[i]} {stop[i]}\n") 59 | 60 | 61 | 62 | if __name__ == '__main__': 63 | parser = argparse.ArgumentParser() 64 | 65 | parser.add_argument('--num_of_tiles', type=int, help='total number of tiles already downloaded', default=1300000) 66 | parser.add_argument('--num_of_jobs', type=int, help='total number of parallel slurm jobs when downloading the full tiles', default=40) 67 | parser.add_argument('--num_tiles_per_job', type=int, help='total number of tiles processed per job', default=33000) 68 | parser.add_argument('--tile_info_path', type=str, help='path to the tile_info files', default='data/tile_info/*.json', required=True) 69 | 70 | args = parser.parse_args() 71 | 72 | main(args) 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anyio==4.0.0 2 | appnope==0.1.3 3 | argon2-cffi==23.1.0 4 | argon2-cffi-bindings==21.2.0 5 | arrow==1.2.3 6 | asttokens==2.4.0 7 | async-lru==2.0.4 8 | attrs==23.1.0 9 | Babel==2.12.1 10 | backcall==0.2.0 11 | beautifulsoup4==4.12.2 12 | bleach==6.0.0 13 | braceexpand==0.1.7 14 | cachetools==5.3.1 15 | certifi==2023.7.22 16 | cffi==1.15.1 17 | charset-normalizer==3.2.0 18 | click==8.1.7 19 | click-plugins==1.1.1 20 | cligj==0.7.2 21 | comm==0.1.4 22 | contourpy==1.1.0 23 | cycler==0.11.0 24 | debugpy==1.7.0 25 | decorator==5.1.1 26 | defusedxml==0.7.1 27 | earthengine-api==0.1.367 28 | exceptiongroup==1.1.3 29 | executing==1.2.0 30 | fastjsonschema==2.18.0 31 | Fiona==1.9.4.post1 32 | fonttools==4.42.1 33 | fqdn==1.5.1 34 | geojson==3.0.1 35 | geopandas==0.13.2 36 | google-api-core==2.11.1 37 | google-api-python-client==2.97.0 38 | google-auth==2.22.0 39 | google-auth-httplib2==0.1.0 40 | google-cloud-core==2.3.3 41 | google-cloud-storage==2.10.0 42 | google-crc32c==1.5.0 43 | google-resumable-media==2.5.0 44 | googleapis-common-protos==1.60.0 45 | httplib2==0.22.0 46 | hydra-core==1.3.2 47 | idna==3.4 48 | imagecodecs==2023.9.4 49 | importlib-metadata==6.8.0 50 | importlib-resources==6.0.1 51 | ipykernel==6.25.2 52 | ipython==8.15.0 53 | ipython-genutils==0.2.0 54 | ipywidgets==8.1.0 55 | isoduration==20.11.0 56 | jedi==0.19.0 57 | Jinja2==3.1.2 58 | json5==0.9.14 59 | jsonpointer==2.4 60 | jsonschema==4.19.0 61 | jsonschema-specifications==2023.7.1 62 | kiwisolver==1.4.5 63 | MarkupSafe==2.1.3 64 | matplotlib==3.7.2 65 | matplotlib-inline==0.1.6 66 | mistune==3.0.1 67 | nbclient==0.8.0 68 | nbconvert==7.8.0 69 | nbformat==5.9.2 70 | nest-asyncio==1.5.7 71 | notebook==7.0.3 72 | notebook_shim==0.2.3 73 | numpy==1.25.2 74 | omegaconf==2.3.0 75 | overrides==7.4.0 76 | 
packaging==23.1 77 | pandas==2.1.0 78 | pandocfilters==1.5.0 79 | parso==0.8.3 80 | pexpect==4.8.0 81 | pickleshare==0.7.5 82 | Pillow==10.0.0 83 | platformdirs==3.10.0 84 | prometheus-client==0.17.1 85 | prompt-toolkit==3.0.39 86 | protobuf==4.24.2 87 | psutil==5.9.5 88 | ptyprocess==0.7.0 89 | pure-eval==0.2.2 90 | pyasn1==0.5.0 91 | pyasn1-modules==0.3.0 92 | pycparser==2.21 93 | Pygments==2.16.1 94 | pyparsing==3.0.9 95 | pyproj==3.6.0 96 | python-dateutil==2.8.2 97 | python-json-logger==2.0.7 98 | pytz==2023.3 99 | PyYAML==6.0.1 100 | pyzmq==25.1.1 101 | qtconsole==5.4.4 102 | QtPy==2.4.0 103 | referencing==0.30.2 104 | requests==2.31.0 105 | rfc3339-validator==0.1.4 106 | rfc3986-validator==0.1.1 107 | rpds-py==0.10.2 108 | rsa==4.9 109 | Send2Trash==1.8.2 110 | shapely==2.0.1 111 | six==1.16.0 112 | sniffio==1.3.0 113 | soupsieve==2.5 114 | stack-data==0.6.2 115 | terminado==0.17.1 116 | tifffile==2023.8.30 117 | tinycss2==1.2.1 118 | tomli==2.0.1 119 | tornado==6.3.3 120 | tqdm==4.66.1 121 | traitlets==5.9.0 122 | typing_extensions==4.7.1 123 | tzdata==2023.3 124 | uri-template==1.3.0 125 | uritemplate==4.1.1 126 | urllib3==1.26.16 127 | wcwidth==0.2.6 128 | webcolors==1.13 129 | webdataset==0.2.48 130 | webencodings==0.5.1 131 | websocket-client==1.6.3 132 | widgetsnbextension==4.0.8 133 | zipp==3.16.2 134 | retry 135 | h5py 136 | 137 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_create_tiles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=geojson-download 3 | 4 | #SBATCH --tasks=1 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --time=2-00:00:00 7 | # PATH TO SAVE SLURM LOGS 8 | #SBATCH --output=/home/qbk152/vishal/slurm_logs/tiles-download-%A_%a_%x.out 9 | # TOTAL MEMORY PER NODE 10 | #SBATCH --mem=4G 11 | #SBATCH --exclude=hendrixgpu01fl,hendrixgpu02fl,hendrixgpu07fl,hendrixgpu08fl,hendrixgpu03fl,hendrixgpu04fl 12 | echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 13 | 14 | 15 | 16 | python -u /home/qbk152/vishal/MMEarth-data/create_tiles_polygon.py\ 17 | tiles_geojson_path='/projects/dereeco/data/global-lr/geojson_files/tiles_1M_v001.geojson' \ 18 | num_of_images=1500000 \ 19 | tile_size=1300 \ 20 | uniform_type=0 \ 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_download_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=download 3 | 4 | #SBATCH --array=0-39 5 | #SBATCH --tasks=1 6 | #SBATCH --cpus-per-task=4 7 | #SBATCH --time=20-00:00:00 8 | # PATH TO SAVE SLURM LOGS 9 | #SBATCH --output=/home/qbk152/vishal/slurm_logs/slurm-%A_%a_%x.out 10 | # TOTAL MEMORY PER NODE 11 | #SBATCH --mem=16G 12 | #SBATCH --exclude=hendrixgpu01fl,hendrixgpu02fl,hendrixgpu07fl,hendrixgpu08fl,hendrixgpu03fl,hendrixgpu04fl 13 | 14 | echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 15 | 16 | 17 | task_per_job=37500 # this number is the total number of tiles divided by the number of jobs 18 | start_from=$((SLURM_ARRAY_TASK_ID * task_per_job)) 19 | end_at=$((start_from + task_per_job)) 20 | 21 | 22 | python /home/qbk152/vishal/MMEarth-data/main_download.py start_from=$start_from end_at=$end_at 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_download_seq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 
| #SBATCH --job-name=data-download 3 | 4 | #SBATCH --tasks=1 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --time=8-00:00:00 7 | # PATH TO SAVE SLURM LOGS 8 | #SBATCH --output=/home/qbk152/vishal/slurm_logs/slurm-%A_%a_%x.out 9 | # TOTAL MEMORY PER NODE 10 | #SBATCH --mem=4G 11 | #SBATCH --exclude=hendrixgpu01fl,hendrixgpu02fl,hendrixgpu07fl,hendrixgpu08fl,hendrixgpu03fl,hendrixgpu04fl 12 | 13 | echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 14 | 15 | 16 | 17 | python /home/qbk152/vishal/global-lr/main_download.py start_from=4518 end_at=7500 18 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_redownload_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=1-mil-download 3 | 4 | #SBATCH --array=0-39 5 | #SBATCH --tasks=1 6 | #SBATCH --cpus-per-task=4 7 | #SBATCH --time=8-00:00:00 8 | # PATH TO SAVE SLURM LOGS 9 | #SBATCH --output=/home/qbk152/vishal/slurm_logs/slurm-%A_%a_%x.out 10 | # TOTAL MEMORY PER NODE 11 | #SBATCH --mem=4G 12 | #SBATCH --exclude=hendrixgpu01fl,hendrixgpu02fl,hendrixgpu07fl,hendrixgpu08fl,hendrixgpu03fl,hendrixgpu04fl 13 | 14 | echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 15 | 16 | # Read start and stop values from the file for each task ID 17 | task_file="start_stop_redownload.txt" 18 | line_number=$((SLURM_ARRAY_TASK_ID + 1)) 19 | start_from=$(sed -n "${line_number}p" "$task_file" | awk '{print $1}') 20 | end_at=$(sed -n "${line_number}p" "$task_file" | awk '{print $2}') 21 | 22 | echo "Task ID: $SLURM_ARRAY_TASK_ID, Start from: $start_from, End at: $end_at" 23 | 24 | python /home/qbk152/vishal/global-lr/main_download.py start_from=$start_from end_at=$end_at 25 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_temp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=chunk 3 | 4 | #SBATCH --tasks=1 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --time=2-00:00:00 7 | # PATH TO SAVE SLURM LOGS 8 | #SBATCH --output=/home/qbk152/vishal/slurm_logs/temp-%A_%a_%x.out 9 | # TOTAL MEMORY PER NODE 10 | #SBATCH --mem=32G 11 | #SBATCH --exclude=hendrixgpu01fl,hendrixgpu02fl,hendrixgpu07fl,hendrixgpu08fl,hendrixgpu03fl,hendrixgpu04fl 12 | 13 | echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 14 | 15 | 16 | # use 16G of mem when running the below code 17 | # python -u convert_to_h5.py \ 18 | # --mode create \ 19 | # --data_dir /projects/dereeco/data/global-lr/data_300k_130/ \ 20 | # --tile_info /home/qbk152/vishal/global-lr/data/data_300k_130_tile_info.json \ 21 | # --output_file /projects/dereeco/data/global-lr/data_300k_130/data_300k_130.h5\ 22 | # --missing_tiles /home/qbk152/vishal/global-lr/data/missing_tiles_300k.csv 23 | 24 | # python -u utils/utils.py 25 | # python -u /home/qbk152/vishal/global-lr/normalization.py 26 | 27 | # python -u utils/convert_to_h5.py \ 28 | # --mode merge \ 29 | # --data_dir1 /projects/dereeco/data/global-lr/data_1M_130_new/ \ 30 | # --data_dir2 /projects/dereeco/data/global-lr/data_missing_130/ \ 31 | # --output_path /projects/dereeco/data/global-lr/data_1M_130_new/data_1M_130_new2.h5 \ 32 | 33 | python -u post_download.py \ 34 | --data_dir /projects/dereeco/data/global-lr/data_1M_v001/ 35 | 36 | # python -u data_exp/data_exp.py \ 37 | # --data_dir /projects/dereeco/data/global-lr/data_1M_130_new 38 | 39 | #python -u utils/chunking_h5.py \ 40 | # --h5_file_path 
/projects/dereeco/data/global-lr/data_1M_130_new/data_1M_130_new.h5 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /stats/biome_labels.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tundra": 0, 3 | "Tropical & Subtropical Moist Broadleaf Forests": 1, 4 | "Mediterranean Forests, Woodlands & Scrub": 2, 5 | "Deserts & Xeric Shrublands": 3, 6 | "Temperate Grasslands, Savannas & Shrublands": 4, 7 | "Boreal Forests/Taiga": 5, 8 | "Temperate Conifer Forests": 6, 9 | "Temperate Broadleaf & Mixed Forests": 7, 10 | "Montane Grasslands & Shrublands": 8, 11 | "Mangroves": 9, 12 | "Flooded Grasslands & Savannas": 10, 13 | "Tropical & Subtropical Grasslands, Savannas & Shrublands": 11, 14 | "Tropical & Subtropical Dry Broadleaf Forests": 12, 15 | "Tropical & Subtropical Coniferous Forests": 13 16 | } -------------------------------------------------------------------------------- /stats/biome_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tropical & Subtropical Moist Broadleaf Forests": 230, 3 | "Deserts & Xeric Shrublands": 102, 4 | "Temperate Broadleaf & Mixed Forests": 83, 5 | "Tropical & Subtropical Grasslands, Savannas & Shrublands": 58, 6 | "Tropical & Subtropical Dry Broadleaf Forests": 56, 7 | "Tundra": 51, 8 | "Temperate Grasslands, Savannas & Shrublands": 48, 9 | "Temperate Conifer Forests": 47, 10 | "Montane Grasslands & Shrublands": 46, 11 | "Mediterranean Forests, Woodlands & Scrub": 40, 12 | "Boreal Forests/Taiga": 26, 13 | "Flooded Grasslands & Savannas": 25, 14 | "Mangroves": 19, 15 | "Tropical & Subtropical Coniferous Forests": 15, 16 | "N/A": 1 17 | } -------------------------------------------------------------------------------- /stats/eco_labels.json: -------------------------------------------------------------------------------- 1 | { 2 | "Adelie Land tundra": 0, 3 | "Ahklun and Kilbuck Upland Tundra": 1, 4 | "Alaska-St. 
Elias Range tundra": 2, 5 | "Aleutian Islands tundra": 3, 6 | "Antipodes Subantarctic Islands tundra": 4, 7 | "Arctic coastal tundra": 5, 8 | "Russian Arctic desert": 6, 9 | "Arctic foothills tundra": 7, 10 | "Russian Bering tundra": 8, 11 | "Beringia lowland tundra": 9, 12 | "Beringia upland tundra": 10, 13 | "Brooks-British Range tundra": 11, 14 | "Canadian Low Arctic tundra": 12, 15 | "Central South Antarctic Peninsula tundra": 13, 16 | "Cherskii-Kolyma mountain tundra": 14, 17 | "Chukchi Peninsula tundra": 15, 18 | "Davis Highlands tundra": 16, 19 | "East Antarctic tundra": 17, 20 | "Ellsworth Land tundra": 18, 21 | "Ellsworth Mountains tundra": 19, 22 | "Enderby Land tundra": 20, 23 | "Canadian High Arctic tundra": 21, 24 | "Interior Yukon-Alaska alpine tundra": 22, 25 | "Kalaallit Nunaat High Arctic tundra": 23, 26 | "Kola Peninsula tundra": 24, 27 | "Canadian Middle Arctic Tundra": 25, 28 | "Northeast Antarctic Peninsula tundra": 26, 29 | "Northwest Antarctic Peninsula tundra": 27, 30 | "North Victoria Land tundra": 28, 31 | "Northeast Siberian coastal tundra": 29, 32 | "Northwest Russian-Novaya Zemlya tundra": 30, 33 | "Novosibirsk Islands Arctic desert": 31, 34 | "Ogilvie-MacKenzie alpine tundra": 32, 35 | "Pacific Coastal Mountain icefields and tundra": 33, 36 | "Prince Charles Mountains tundra": 34, 37 | "Scandinavian Montane Birch forest and grasslands": 35, 38 | "Scotia Sea Islands tundra": 36, 39 | "South Antarctic Peninsula tundra": 37, 40 | "South Orkney Islands tundra": 38, 41 | "South Victoria Land tundra": 39, 42 | "Southern Indian Ocean Islands tundra": 40, 43 | "Taimyr-Central Siberian tundra": 41, 44 | "Torngat Mountain tundra": 42, 45 | "Trans-Baikal Bald Mountain tundra": 43, 46 | "Transantarctic Mountains tundra": 44, 47 | "Wrangel Island Arctic desert": 45, 48 | "Yamal-Gydan tundra": 46, 49 | "Kamchatka tundra": 47, 50 | "Kalaallit Nunaat Arctic steppe": 48, 51 | "Dronning Maud Land tundra": 49, 52 | "Marie Byrd Land tundra": 50, 53 | "Admiralty Islands lowland rain forests": 51, 54 | "Albertine Rift montane forests": 52, 55 | "Alto Paran\u00e1 Atlantic forests": 53, 56 | "Andaman Islands rain forests": 54, 57 | "Araucaria moist forests": 55, 58 | "Atlantic Coast restingas": 56, 59 | "Congolian coastal forests": 57, 60 | "Bahia coastal forests": 58, 61 | "Bahia interior forests": 59, 62 | "Banda Sea Islands moist deciduous forests": 60, 63 | "Biak-Numfoor rain forests": 61, 64 | "Bolivian Yungas": 62, 65 | "Borneo lowland rain forests": 63, 66 | "Borneo peat swamp forests": 64, 67 | "Brahmaputra Valley semi-evergreen forests": 65, 68 | "Buru rain forests": 66, 69 | "Caatinga Enclaves moist forests": 67, 70 | "Cameroon Highlands forests": 68, 71 | "Caqueta moist forests": 69, 72 | "Cardamom Mountains rain forests": 70, 73 | "Carolines tropical moist forests": 71, 74 | "Catatumbo moist forests": 72, 75 | "Cauca Valley montane forests": 73, 76 | "Cayos Miskitos-San Andr\u00e9s and Providencia moist forests": 74, 77 | "Central American Atlantic moist forests": 75, 78 | "Central American montane forests": 76, 79 | "Central Congolian lowland forests": 77, 80 | "Central Polynesian tropical moist forests": 78, 81 | "Central Range Papuan montane rain forests": 79, 82 | "Chao Phraya freshwater swamp forests": 80, 83 | "Chao Phraya lowland moist deciduous forests": 81, 84 | "Chiapas montane forests": 82, 85 | "Chimalapas montane forests": 83, 86 | "Chin Hills-Arakan Yoma montane forests": 84, 87 | "Choc\u00f3-Dari\u00e9n moist forests": 85, 88 | "Christmas and Cocos Islands 
tropical forests": 86, 89 | "Cocos Island moist forests": 87, 90 | "Comoros forests": 88, 91 | "Cook Islands tropical moist forests": 89, 92 | "Cordillera La Costa montane forests": 90, 93 | "Cordillera Oriental montane forests": 91, 94 | "Costa Rican seasonal moist forests": 92, 95 | "Cross-Niger transition forests": 93, 96 | "Cross-Sanaga-Bioko coastal forests": 94, 97 | "Cuban moist forests": 95, 98 | "Eastern Congolian swamp forests": 96, 99 | "Eastern Cordillera Real montane forests": 97, 100 | "Eastern Guinean forests": 98, 101 | "East Deccan moist deciduous forests": 99, 102 | "Eastern Java-Bali montane rain forests": 100, 103 | "Eastern Java-Bali rain forests": 101, 104 | "Eastern Micronesia tropical moist forests": 102, 105 | "Eastern Panamanian montane forests": 103, 106 | "Ethiopian montane forests": 104, 107 | "Fernando de Noronha-Atol das Rocas moist forests": 105, 108 | "Fiji tropical moist forests": 106, 109 | "Granitic Seychelles forests": 107, 110 | "Greater Negros-Panay rain forests": 108, 111 | "Guianan freshwater swamp forests": 109, 112 | "Guianan Highlands moist forests": 110, 113 | "Guianan lowland moist forests": 111, 114 | "Guianan piedmont moist forests": 112, 115 | "Guinean montane forests": 113, 116 | "Guizhou Plateau broadleaf and mixed forests": 114, 117 | "Gurupa v\u00e1rzea": 115, 118 | "Hainan Island monsoon rain forests": 116, 119 | "Halmahera rain forests": 117, 120 | "Hawai'i tropical moist forests": 118, 121 | "Himalayan subtropical broadleaf forests": 119, 122 | "Hispaniolan moist forests": 120, 123 | "Huon Peninsula montane rain forests": 121, 124 | "Iquitos v\u00e1rzea": 122, 125 | "Irrawaddy freshwater swamp forests": 123, 126 | "Irrawaddy moist deciduous forests": 124, 127 | "Isthmian-Atlantic moist forests": 125, 128 | "Isthmian-Pacific moist forests": 126, 129 | "Jamaican moist forests": 127, 130 | "Japur\u00e1-Solim\u00f5es-Negro moist forests": 128, 131 | "Jian Nan subtropical evergreen forests": 129, 132 | "Juru\u00e1-Purus moist forests": 130, 133 | "Kayah-Karen montane rain forests": 131, 134 | "Kermadec Islands subtropical moist forests": 132, 135 | "Knysna-Amatole montane forests": 133, 136 | "Kwazulu Natal-Cape coastal forests": 134, 137 | "Leeward Islands moist forests": 135, 138 | "Lord Howe Island subtropical forests": 136, 139 | "Louisiade Archipelago rain forests": 137, 140 | "Lower Gangetic Plains moist deciduous forests": 138, 141 | "Luang Prabang montane rain forests": 139, 142 | "Luzon montane rain forests": 140, 143 | "Luzon rain forests": 141, 144 | "Madagascar humid forests": 142, 145 | "Madagascar subhumid forests": 143, 146 | "Madeira-Tapaj\u00f3s moist forests": 144, 147 | "Magdalena-Urab\u00e1 moist forests": 145, 148 | "Magdalena Valley montane forests": 146, 149 | "Malabar Coast moist forests": 147, 150 | "Maldives-Lakshadweep-Chagos Archipelago tropical moist forests": 148, 151 | "Maputaland coastal forests and woodlands": 149, 152 | "Maraj\u00f3 v\u00e1rzea": 150, 153 | "Mara\u00f1\u00f3n dry forests": 151, 154 | "Marquesas tropical moist forests": 152, 155 | "Mascarene forests": 153, 156 | "Mato Grosso tropical dry forests": 154, 157 | "Meghalaya subtropical forests": 155, 158 | "Mentawai Islands rain forests": 156, 159 | "Mindanao-Eastern Visayas rain forests": 157, 160 | "Mindanao montane rain forests": 158, 161 | "Mindoro rain forests": 159, 162 | "Mizoram-Manipur-Kachin rain forests": 160, 163 | "Monte Alegre v\u00e1rzea": 161, 164 | "Mount Cameroon and Bioko montane forests": 162, 165 | "Myanmar coastal rain 
forests": 163, 166 | "Nansei Islands subtropical evergreen forests": 164, 167 | "Napo moist forests": 165, 168 | "Negro-Branco moist forests": 166, 169 | "New Britain-New Ireland lowland rain forests": 167, 170 | "New Britain-New Ireland montane rain forests": 168, 171 | "New Caledonia rain forests": 169, 172 | "Nicobar Islands rain forests": 170, 173 | "Niger Delta swamp forests": 171, 174 | "Nigerian lowland forests": 172, 175 | "Norfolk Island subtropical forests": 173, 176 | "North Western Ghats moist deciduous forests": 174, 177 | "North Western Ghats montane rain forests": 175, 178 | "Northeast Brazil restingas": 176, 179 | "Northeast Congolian lowland forests": 177, 180 | "Northern Annamites rain forests": 178, 181 | "Northern Indochina subtropical forests": 179, 182 | "Northern Khorat Plateau moist deciduous forests": 180, 183 | "Northern New Guinea lowland rain and freshwater swamp forests": 181, 184 | "Northern New Guinea montane rain forests": 182, 185 | "Northern Swahili coastal forests": 183, 186 | "Northern Thailand-Laos moist deciduous forests": 184, 187 | "Northern Triangle subtropical forests": 185, 188 | "Northern Vietnam lowland rain forests": 186, 189 | "Northwest Andean montane forests": 187, 190 | "Northwest Congolian lowland forests": 188, 191 | "Oaxacan montane forests": 189, 192 | "Ogasawara subtropical moist forests": 190, 193 | "Orinoco Delta swamp forests": 191, 194 | "Orissa semi-evergreen forests": 192, 195 | "Palau tropical moist forests": 193, 196 | "Palawan rain forests": 194, 197 | "Pantanos de Centla": 195, 198 | "Pantepui forests & shrublands": 196, 199 | "Peninsular Malaysian montane rain forests": 197, 200 | "Peninsular Malaysian peat swamp forests": 198, 201 | "Peninsular Malaysian rain forests": 199, 202 | "Pernambuco coastal forests": 200, 203 | "Pernambuco interior forests": 201, 204 | "Peruvian Yungas": 202, 205 | "Pet\u00e9n-Veracruz moist forests": 203, 206 | "Puerto Rican moist forests": 204, 207 | "Purus-Madeira moist forests": 205, 208 | "Purus v\u00e1rzea": 206, 209 | "Queensland tropical rain forests": 207, 210 | "Rapa Nui and Sala y G\u00f3mez subtropical forests": 208, 211 | "Red River freshwater swamp forests": 209, 212 | "Rio Negro campinarana": 210, 213 | "Samoan tropical moist forests": 211, 214 | "Santa Marta montane forests": 212, 215 | "S\u00e3o Tom\u00e9, Pr\u00edncipe, and Annob\u00f3n forests": 213, 216 | "Seram rain forests": 214, 217 | "Serra do Mar coastal forests": 215, 218 | "Sierra de los Tuxtlas": 216, 219 | "Sierra Madre de Chiapas moist forests": 217, 220 | "Society Islands tropical moist forests": 218, 221 | "Solim\u00f5es-Japur\u00e1 moist forests": 219, 222 | "Solomon Islands rain forests": 220, 223 | "South China-Vietnam subtropical evergreen forests": 221, 224 | "South China Sea Islands": 222, 225 | "South Taiwan monsoon rain forests": 223, 226 | "South Western Ghats moist deciduous forests": 224, 227 | "South Western Ghats montane rain forests": 225, 228 | "Southeast Papuan rain forests": 226, 229 | "Southern Andean Yungas": 227, 230 | "Southern Annamites montane rain forests": 228, 231 | "Southern New Guinea freshwater swamp forests": 229, 232 | "Southern New Guinea lowland rain forests": 230, 233 | "Southern Swahili coastal forests and woodlands": 231, 234 | "Southwest Amazon moist forests": 232, 235 | "Southwest Borneo freshwater swamp forests": 233, 236 | "Sri Lanka lowland rain forests": 234, 237 | "Sri Lanka montane rain forests": 235, 238 | "Sulawesi montane rain forests": 236, 239 | "Sulu Archipelago rain 
forests": 237, 240 | "Sumatran freshwater swamp forests": 238, 241 | "Sumatran lowland rain forests": 239, 242 | "Sumatran montane rain forests": 240, 243 | "Sumatran peat swamp forests": 241, 244 | "Sundaland heath forests": 242, 245 | "Sundarbans freshwater swamp forests": 243, 246 | "Taiwan subtropical evergreen forests": 244, 247 | "Talamancan montane forests": 245, 248 | "Tapaj\u00f3s-Xingu moist forests": 246, 249 | "Tenasserim-South Thailand semi-evergreen rain forests": 247, 250 | "Tongan tropical moist forests": 248, 251 | "Tonle Sap-Mekong peat swamp forests": 249, 252 | "Tonle Sap freshwater swamp forests": 250, 253 | "Trinidad and Tobago moist forest": 251, 254 | "Trobriand Islands rain forests": 252, 255 | "Tuamotu tropical moist forests": 253, 256 | "Tubuai tropical moist forests": 254, 257 | "Uatum\u00e3-Trombetas moist forests": 255, 258 | "Ucayali moist forests": 256, 259 | "Upper Gangetic Plains moist deciduous forests": 257, 260 | "Vanuatu rain forests": 258, 261 | "Venezuelan Andes montane forests": 259, 262 | "Veracruz moist forests": 260, 263 | "Veracruz montane forests": 261, 264 | "Vogelkop-Aru lowland rain forests": 262, 265 | "Vogelkop montane rain forests": 263, 266 | "Western Congolian swamp forests": 264, 267 | "Western Ecuador moist forests": 265, 268 | "Western Guinean lowland forests": 266, 269 | "Western Java montane rain forests": 267, 270 | "Western Java rain forests": 268, 271 | "Western Polynesian tropical moist forests": 269, 272 | "Windward Islands moist forests": 270, 273 | "Xingu-Tocantins-Araguaia moist forests": 271, 274 | "Yapen rain forests": 272, 275 | "Yucat\u00e1n moist forests": 273, 276 | "Yunnan Plateau subtropical evergreen forests": 274, 277 | "Tocantins/Pindare moist forests": 275, 278 | "Trindade-Martin Vaz Islands tropical forests": 276, 279 | "Sulawesi lowland rain forests": 277, 280 | "East African montane forests": 278, 281 | "Eastern Arc forests": 279, 282 | "Borneo montane rain forests": 280, 283 | "Aegean and Western Turkey sclerophyllous and mixed forests": 281, 284 | "Albany thickets": 282, 285 | "Anatolian conifer and deciduous mixed forests": 283, 286 | "California coastal sage and chaparral": 284, 287 | "California interior chaparral and woodlands": 285, 288 | "California montane chaparral and woodlands": 286, 289 | "Canary Islands dry woodlands and forests": 287, 290 | "Chilean Matorral": 288, 291 | "Coolgardie woodlands": 289, 292 | "Corsican montane broadleaf and mixed forests": 290, 293 | "Crete Mediterranean forests": 291, 294 | "Cyprus Mediterranean forests": 292, 295 | "Eastern Mediterranean conifer-broadleaf forests": 293, 296 | "Esperance mallee": 294, 297 | "Eyre and York mallee": 295, 298 | "Flinders-Lofty montane woodlands": 296, 299 | "Fynbos shrubland": 297, 300 | "Hampton mallee and woodlands": 298, 301 | "Iberian conifer forests": 299, 302 | "Iberian sclerophyllous and semi-deciduous forests": 300, 303 | "Illyrian deciduous forests": 301, 304 | "Italian sclerophyllous and semi-deciduous forests": 302, 305 | "Jarrah-Karri forest and shrublands": 303, 306 | "Mediterranean Acacia-Argania dry woodlands and succulent thickets": 304, 307 | "Mediterranean dry woodlands and steppe": 305, 308 | "Mediterranean woodlands and forests": 306, 309 | "Murray-Darling woodlands and mallee": 307, 310 | "Naracoorte woodlands": 308, 311 | "Northeast Spain and Southern France Mediterranean forests": 309, 312 | "Northwest Iberian montane forests": 310, 313 | "Pindus Mountains mixed forests": 311, 314 | "Renosterveld shrubland": 
312, 315 | "Santa Lucia Montane Chaparral & Woodlands": 313, 316 | "South Apennine mixed montane forests": 314, 317 | "Southeast Iberian shrubs and woodlands": 315, 318 | "Southern Anatolian montane conifer and deciduous forests": 316, 319 | "Southwest Australia savanna": 317, 320 | "Southwest Australia woodlands": 318, 321 | "Southwest Iberian Mediterranean sclerophyllous and mixed forests": 319, 322 | "Tyrrhenian-Adriatic sclerophyllous and mixed forests": 320, 323 | "Afghan Mountains semi-desert": 321, 324 | "Alashan Plateau semi-desert": 322, 325 | "Aldabra Island xeric scrub": 323, 326 | "Arabian sand desert": 324, 327 | "Araya and Paria xeric scrub": 325, 328 | "Atacama desert": 326, 329 | "Saharan Atlantic coastal desert": 327, 330 | "Azerbaijan shrub desert and steppe": 328, 331 | "Badghyz and Karabil semi-desert": 329, 332 | "Baja California desert": 330, 333 | "Baluchistan xeric woodlands": 331, 334 | "Caribbean shrublands": 332, 335 | "Carnarvon xeric shrublands": 333, 336 | "Caspian lowland desert": 334, 337 | "Central Afghan Mountains xeric woodlands": 335, 338 | "Central Asian northern desert": 336, 339 | "Central Asian riparian woodlands": 337, 340 | "Central Asian southern desert": 338, 341 | "Central Mexican matorral": 339, 342 | "Central Persian desert basins": 340, 343 | "Central Ranges xeric scrub": 341, 344 | "Chihuahuan desert": 342, 345 | "Colorado Plateau shrublands": 343, 346 | "Cuban cactus scrub": 344, 347 | "Deccan thorn scrub forests": 345, 348 | "Djibouti xeric shrublands": 346, 349 | "East Arabian fog shrublands and sand desert": 347, 350 | "East Sahara Desert": 348, 351 | "East Saharan montane xeric woodlands": 349, 352 | "Eastern Gobi desert steppe": 350, 353 | "Eritrean coastal desert": 351, 354 | "Gal\u00e1pagos Islands xeric scrub": 352, 355 | "Gariep Karoo": 353, 356 | "Gibson desert": 354, 357 | "Godavari-Krishna mangroves": 355, 358 | "Gobi Lakes Valley desert steppe": 356, 359 | "Great Basin shrub steppe": 357, 360 | "Great Lakes Basin desert steppe": 358, 361 | "Great Sandy-Tanami desert": 359, 362 | "Great Victoria desert": 360, 363 | "Guajira-Barranquilla xeric scrub": 361, 364 | "Gulf of California xeric scrub": 362, 365 | "Hobyo grasslands and shrublands": 363, 366 | "Indus Valley desert": 364, 367 | "Junggar Basin semi-desert": 365, 368 | "Kalahari xeric savanna": 366, 369 | "Kaokoveld desert": 367, 370 | "Kazakh semi-desert": 368, 371 | "Kopet Dag semi-desert": 369, 372 | "La Costa xeric shrublands": 370, 373 | "Madagascar spiny thickets": 371, 374 | "Madagascar succulent woodlands": 372, 375 | "Malpelo Island xeric scrub": 373, 376 | "Meseta Central matorral": 374, 377 | "Mesopotamian shrub desert": 375, 378 | "Mojave desert": 376, 379 | "Motagua Valley thornscrub": 377, 380 | "Nama Karoo shrublands": 378, 381 | "Namaqualand-Richtersveld steppe": 379, 382 | "Namib Desert": 380, 383 | "Namibian savanna woodlands": 381, 384 | "North Arabian desert": 382, 385 | "North Arabian highland shrublands": 383, 386 | "North Saharan Xeric Steppe and Woodland": 384, 387 | "Somali montane xeric woodlands": 385, 388 | "Aravalli west thorn scrub forests": 386, 389 | "Nullarbor Plains xeric shrublands": 387, 390 | "Paraguan\u00e1 xeric scrub": 388, 391 | "Paropamisus xeric woodlands": 389, 392 | "Pilbara shrublands": 390, 393 | "Qaidam Basin semi-desert": 391, 394 | "Red Sea-Arabian Desert shrublands": 392, 395 | "Red Sea coastal desert": 393, 396 | "Registan-North Pakistan sandy desert": 394, 397 | "San Lucan xeric scrub": 395, 398 | "Sechura desert": 396, 
399 | "Simpson desert": 397, 400 | "Snake-Columbia shrub steppe": 398, 401 | "Socotra Island xeric shrublands": 399, 402 | "Sonoran desert": 400, 403 | "South Iran Nubo-Sindian desert and semi-desert": 401, 404 | "South Sahara desert": 402, 405 | "Southwest Arabian Escarpment shrublands and woodlands": 403, 406 | "Southwest Arabian highland xeric scrub": 404, 407 | "St. Peter and St. Paul Rocks": 405, 408 | "Succulent Karoo xeric shrublands": 406, 409 | "Taklimakan desert": 407, 410 | "Tamaulipan matorral": 408, 411 | "Tamaulipan mezquital": 409, 412 | "Tehuac\u00e1n Valley matorral": 410, 413 | "Thar desert": 411, 414 | "Tibesti-Jebel Uweinat montane xeric woodlands": 412, 415 | "Tirari-Sturt stony desert": 413, 416 | "West Sahara desert": 414, 417 | "West Saharan montane xeric woodlands": 415, 418 | "Western Australian Mulga shrublands": 416, 419 | "Wyoming Basin shrub steppe": 417, 420 | "Southwest Arabian coastal xeric shrublands": 418, 421 | "Arabian-Persian Gulf coastal plain desert": 419, 422 | "South Arabian plains and plateau desert": 420, 423 | "Arabian desert": 421, 424 | "Ile Europa and Bassas da India xeric scrub": 422, 425 | "Al-Hajar foothill xeric woodlands and shrublands": 423, 426 | "Al-Hajar montane woodlands and shrublands": 424, 427 | "Alai-Western Tian Shan steppe": 425, 428 | "Altai steppe and semi-desert": 426, 429 | "Amsterdam-Saint Paul Islands temperate grasslands": 427, 430 | "California Central Valley grasslands": 428, 431 | "Canadian Aspen forests and parklands": 429, 432 | "Canterbury-Otago tussock grasslands": 430, 433 | "Central Anatolian steppe": 431, 434 | "Central-Southern US mixed grasslands": 432, 435 | "Central US forest-grasslands transition": 433, 436 | "Central Tallgrass prairie": 434, 437 | "Cross-Timbers savanna-woodland": 435, 438 | "Daurian forest steppe": 436, 439 | "Eastern Anatolian montane steppe": 437, 440 | "Eastern Australia mulga shrublands": 438, 441 | "Edwards Plateau savanna": 439, 442 | "Emin Valley steppe": 440, 443 | "Espinal": 441, 444 | "Faroe Islands boreal grasslands": 442, 445 | "Flint Hills tallgrass prairie": 443, 446 | "Gissaro-Alai open woodlands": 444, 447 | "Humid Pampas": 445, 448 | "Kazakh forest steppe": 446, 449 | "Kazakh steppe": 447, 450 | "Kazakh upland steppe": 448, 451 | "Low Monte": 449, 452 | "Mid-Atlantic US coastal savannas": 450, 453 | "Mongolian-Manchurian grassland": 451, 454 | "Montana Valley and Foothill grasslands": 452, 455 | "Nebraska Sand Hills mixed grasslands": 453, 456 | "Northern Shortgrass prairie": 454, 457 | "Northern Tallgrass prairie": 455, 458 | "Palouse prairie": 456, 459 | "Patagonian steppe": 457, 460 | "Pontic steppe": 458, 461 | "Sayan Intermontane steppe": 459, 462 | "Selenge-Orkhon forest steppe": 460, 463 | "South Siberian forest steppe": 461, 464 | "Southeast Australia temperate savanna": 462, 465 | "Southeast US mixed woodlands and savannas": 463, 466 | "Southeast US conifer savannas": 464, 467 | "Syrian xeric grasslands and shrublands": 465, 468 | "Texas blackland prairies": 466, 469 | "Tian Shan foothill arid steppe": 467, 470 | "Tristan Da Cunha-Gough Islands shrub and grasslands": 468, 471 | "Western shortgrass prairie": 469, 472 | "Willamette Valley oak savanna": 470, 473 | "Alaska Peninsula montane taiga": 471, 474 | "Central Canadian Shield forests": 472, 475 | "Cook Inlet taiga": 473, 476 | "Copper Plateau taiga": 474, 477 | "East Siberian taiga": 475, 478 | "Eastern Canadian forests": 476, 479 | "Eastern Canadian Shield taiga": 477, 480 | "Iceland boreal birch forests 
and alpine tundra": 478, 481 | "Interior Alaska-Yukon lowland taiga": 479, 482 | "Kamchatka-Kurile meadows and sparse forests": 480, 483 | "Kamchatka taiga": 481, 484 | "Mid-Canada Boreal Plains forests": 482, 485 | "Midwest Canadian Shield forests": 483, 486 | "Muskwa-Slave Lake taiga": 484, 487 | "Northeast Siberian taiga": 485, 488 | "Northern Canadian Shield taiga": 486, 489 | "Okhotsk-Manchurian taiga": 487, 490 | "Sakhalin Island taiga": 488, 491 | "Scandinavian and Russian taiga": 489, 492 | "Southern Hudson Bay taiga": 490, 493 | "Trans-Baikal conifer forests": 491, 494 | "Urals montane forest and taiga": 492, 495 | "Watson Highlands taiga": 493, 496 | "West Siberian taiga": 494, 497 | "Northern Cordillera forests": 495, 498 | "Northwest Territories taiga": 496, 499 | "Alberta-British Columbia foothills forests": 497, 500 | "Alps conifer and mixed forests": 498, 501 | "Altai montane forest and forest steppe": 499, 502 | "Arizona Mountains forests": 500, 503 | "Atlantic coastal pine barrens": 501, 504 | "Blue Mountains forests": 502, 505 | "British Columbia coastal conifer forests": 503, 506 | "Caledon conifer forests": 504, 507 | "Carpathian montane forests": 505, 508 | "Central-Southern Cascades Forests": 506, 509 | "Central British Columbia Mountain forests": 507, 510 | "Central Pacific Northwest coastal forests": 508, 511 | "Colorado Rockies forests": 509, 512 | "Da Hinggan-Dzhagdy Mountains conifer forests": 510, 513 | "East Afghan montane conifer forests": 511, 514 | "Eastern Cascades forests": 512, 515 | "Eastern Himalayan subalpine conifer forests": 513, 516 | "Elburz Range forest steppe": 514, 517 | "Fraser Plateau and Basin conifer forests": 515, 518 | "Great Basin montane forests": 516, 519 | "Helanshan montane conifer forests": 517, 520 | "Hengduan Mountains subalpine conifer forests": 518, 521 | "Hokkaido montane conifer forests": 519, 522 | "Honshu alpine conifer forests": 520, 523 | "Khangai Mountains conifer forests": 521, 524 | "Klamath-Siskiyou forests": 522, 525 | "Mediterranean conifer and mixed forests": 523, 526 | "Northeast Himalayan subalpine conifer forests": 524, 527 | "Northern Anatolian conifer and deciduous forests": 525, 528 | "Northern California coastal forests": 526, 529 | "Nujiang Langcang Gorge alpine conifer and mixed forests": 527, 530 | "Okanogan dry forests": 528, 531 | "Piney Woods": 529, 532 | "Puget lowland forests": 530, 533 | "Qilian Mountains conifer forests": 531, 534 | "Qionglai-Minshan conifer forests": 532, 535 | "Sayan montane conifer forests": 533, 536 | "Scandinavian coastal conifer forests": 534, 537 | "Sierra Nevada forests": 535, 538 | "South Central Rockies forests": 536, 539 | "Tian Shan montane conifer forests": 537, 540 | "Wasatch and Uinta montane forests": 538, 541 | "Western Himalayan subalpine conifer forests": 539, 542 | "Queen Charlotte Islands conifer forests": 540, 543 | "Northern Pacific Alaskan coastal forests": 541, 544 | "North Cascades conifer forests": 542, 545 | "Northern Rockies conifer forests": 543, 546 | "Allegheny Highlands forests": 544, 547 | "Appalachian-Blue Ridge forests": 545, 548 | "Appalachian mixed mesophytic forests": 546, 549 | "Appalachian Piedmont forests": 547, 550 | "Appenine deciduous montane forests": 548, 551 | "European Atlantic mixed forests": 549, 552 | "Azores temperate mixed forests": 550, 553 | "Balkan mixed forests": 551, 554 | "Baltic mixed forests": 552, 555 | "Cantabrian mixed forests": 553, 556 | "Caspian Hyrcanian mixed forests": 554, 557 | "Caucasus mixed forests": 555, 558 
| "Celtic broadleaf forests": 556, 559 | "Central Anatolian steppe and woodlands": 557, 560 | "Central China Loess Plateau mixed forests": 558, 561 | "Central European mixed forests": 559, 562 | "Central Korean deciduous forests": 560, 563 | "Changbai Mountains mixed forests": 561, 564 | "Changjiang Plain evergreen forests": 562, 565 | "Chatham Island temperate forests": 563, 566 | "Crimean Submediterranean forest complex": 564, 567 | "Daba Mountains evergreen forests": 565, 568 | "Dinaric Mountains mixed forests": 566, 569 | "East Central Texas forests": 567, 570 | "East European forest steppe": 568, 571 | "Eastern Anatolian deciduous forests": 569, 572 | "Eastern Australian temperate forests": 570, 573 | "Eastern Canadian Forest-Boreal transition": 571, 574 | "Eastern Great Lakes lowland forests": 572, 575 | "Eastern Himalayan broadleaf forests": 573, 576 | "English Lowlands beech forests": 574, 577 | "Euxine-Colchic broadleaf forests": 575, 578 | "Fiordland temperate forests": 576, 579 | "Gulf of St. Lawrence lowland forests": 577, 580 | "Hokkaido deciduous forests": 578, 581 | "Huang He Plain mixed forests": 579, 582 | "Interior Plateau US Hardwood Forests": 580, 583 | "Juan Fern\u00e1ndez Islands temperate forests": 581, 584 | "Madeira evergreen forests": 582, 585 | "Magellanic subpolar forests": 583, 586 | "Manchurian mixed forests": 584, 587 | "Mississippi lowland forests": 585, 588 | "Nelson Coast temperate forests": 586, 589 | "New England-Acadian forests": 587, 590 | "Nihonkai evergreen forests": 588, 591 | "Nihonkai montane deciduous forests": 589, 592 | "New Zealand North Island temperate forests": 590, 593 | "Northeast China Plain deciduous forests": 591, 594 | "Northeast US Coastal forests": 592, 595 | "Northern Triangle temperate forests": 593, 596 | "Northland temperate kauri forests": 594, 597 | "Ozark Highlands mixed forests": 595, 598 | "Ozark Mountain forests": 596, 599 | "Pannonian mixed forests": 597, 600 | "Po Basin mixed forests": 598, 601 | "Pyrenees conifer and mixed forests": 599, 602 | "Qin Ling Mountains deciduous forests": 600, 603 | "Rakiura Island temperate forests": 601, 604 | "Richmond temperate forests": 602, 605 | "Rodope montane mixed forests": 603, 606 | "San F\u00e9lix-San Ambrosio Islands temperate forests": 604, 607 | "Sarmatic mixed forests": 605, 608 | "Sichuan Basin evergreen broadleaf forests": 606, 609 | "New Zealand South Island temperate forests": 607, 610 | "Southeast Australia temperate forests": 608, 611 | "Southern Great Lakes forests": 609, 612 | "Southern Korea evergreen forests": 610, 613 | "Taiheiyo evergreen forests": 611, 614 | "Taiheiyo montane deciduous forests": 612, 615 | "Tarim Basin deciduous forests and steppe": 613, 616 | "Tasmanian Central Highland forests": 614, 617 | "Tasmanian temperate forests": 615, 618 | "Tasmanian temperate rain forests": 616, 619 | "Upper Midwest US forest-savanna transition": 617, 620 | "Ussuri broadleaf and mixed forests": 618, 621 | "Valdivian temperate forests": 619, 622 | "Western European broadleaf forests": 620, 623 | "Western Great Lakes forests": 621, 624 | "Western Himalayan broadleaf forests": 622, 625 | "Western Siberian hemiboreal forests": 623, 626 | "Westland temperate forests": 624, 627 | "Zagros Mountains forest steppe": 625, 628 | "North Atlantic moist mixed forests": 626, 629 | "Altai alpine meadow and tundra": 627, 630 | "Angolan montane forest-grassland": 628, 631 | "Australian Alps montane grasslands": 629, 632 | "Central Andean dry puna": 630, 633 | "Central Andean puna": 631, 
634 | "Central Andean wet puna": 632, 635 | "Papuan Central Range sub-alpine grasslands": 633, 636 | "Central Tibetan Plateau alpine steppe": 634, 637 | "Cordillera Central p\u00e1ramo": 635, 638 | "Cordillera de Merida p\u00e1ramo": 636, 639 | "Eastern Himalayan alpine shrub and meadows": 637, 640 | "Ethiopian montane grasslands and woodlands": 638, 641 | "Ethiopian montane moorlands": 639, 642 | "Ghorat-Hazarajat alpine meadow": 640, 643 | "High Monte": 641, 644 | "Highveld grasslands": 642, 645 | "Hindu Kush alpine meadow": 643, 646 | "Jos Plateau forest-grassland": 644, 647 | "Karakoram-West Tibetan Plateau alpine steppe": 645, 648 | "Khangai Mountains alpine meadow": 646, 649 | "Kopet Dag woodlands and forest steppe": 647, 650 | "Kuh Rud and Eastern Iran montane woodlands": 648, 651 | "Madagascar ericoid thickets": 649, 652 | "Mediterranean High Atlas juniper steppe": 650, 653 | "Mulanje Montane forest-grassland": 651, 654 | "North Tibetan Plateau-Kunlun Mountains alpine desert": 652, 655 | "Northern Andean p\u00e1ramo": 653, 656 | "Northwestern Himalayan alpine shrub and meadows": 654, 657 | "Nyanga-Chimanimani Montane forest-grassland": 655, 658 | "Ordos Plateau steppe": 656, 659 | "Pamir alpine desert and tundra": 657, 660 | "Qilian Mountains subalpine meadows": 658, 661 | "Rwenzori-Virunga montane moorlands": 659, 662 | "Santa Marta p\u00e1ramo": 660, 663 | "Sayan alpine meadows and tundra": 661, 664 | "New Zealand South Island montane grasslands": 662, 665 | "Southeast Tibet shrublands and meadows": 663, 666 | "Southern Andean steppe": 664, 667 | "Southern Rift Montane forest-grassland": 665, 668 | "Sulaiman Range alpine meadows": 666, 669 | "Tian Shan montane steppe and meadows": 667, 670 | "Tibetan Plateau alpine shrublands and meadows": 668, 671 | "Western Himalayan alpine shrub and meadows": 669, 672 | "Yarlung Zanbo arid steppe": 670, 673 | "East African montane moorlands": 671, 674 | "Kinabalu montane alpine meadows": 672, 675 | "Amazon-Orinoco-Southern Caribbean mangroves": 673, 676 | "Bahamian-Antillean mangroves": 674, 677 | "Central African mangroves": 675, 678 | "East African mangroves": 676, 679 | "Guinean mangroves": 677, 680 | "Indochina mangroves": 678, 681 | "Indus River Delta-Arabian Sea mangroves": 679, 682 | "Madagascar mangroves": 680, 683 | "Mesoamerican Gulf-Caribbean mangroves": 681, 684 | "Myanmar Coast mangroves": 682, 685 | "New Guinea mangroves": 683, 686 | "Northern Mesoamerican Pacific mangroves": 684, 687 | "Red Sea mangroves": 685, 688 | "South American Pacific mangroves": 686, 689 | "Southern Africa mangroves": 687, 690 | "Southern Atlantic Brazilian mangroves": 688, 691 | "Southern Mesoamerican Pacific mangroves": 689, 692 | "Sunda Shelf mangroves": 690, 693 | "Sundarbans mangroves": 691, 694 | "Amur meadow steppe": 692, 695 | "Bohai Sea saline meadow": 693, 696 | "Cuban wetlands": 694, 697 | "East African halophytics": 695, 698 | "Enriquillo wetlands": 696, 699 | "Etosha Pan halophytics": 697, 700 | "Everglades flooded grasslands": 698, 701 | "Guayaquil flooded grasslands": 699, 702 | "Inner Niger Delta flooded savanna": 700, 703 | "Lake Chad flooded savanna": 701, 704 | "Makgadikgadi halophytics": 702, 705 | "Nenjiang River grassland": 703, 706 | "Nile Delta flooded savanna": 704, 707 | "Orinoco wetlands": 705, 708 | "Pantanal": 706, 709 | "Paran\u00e1 flooded savanna": 707, 710 | "Rann of Kutch seasonal salt marsh": 708, 711 | "Saharan halophytics": 709, 712 | "Southern Cone Mesopotamian savanna": 710, 713 | "Sudd flooded grasslands": 711, 714 
| "Suiphun-Khanka meadows and forest meadows": 712, 715 | "Tigris-Euphrates alluvial salt marsh": 713, 716 | "Yellow Sea saline meadow": 714, 717 | "Zambezian coastal flooded savanna": 715, 718 | "Zambezian flooded grasslands": 716, 719 | "Angolan mopane woodlands": 717, 720 | "Angolan scarp savanna and woodlands": 718, 721 | "Angolan wet miombo woodlands": 719, 722 | "Arnhem Land tropical savanna": 720, 723 | "Ascension scrub and grasslands": 721, 724 | "Belizian pine savannas": 722, 725 | "Beni savanna": 723, 726 | "Brigalow tropical savanna": 724, 727 | "Campos Rupestres montane savanna": 725, 728 | "Cape York Peninsula tropical savanna": 726, 729 | "Carpentaria tropical savanna": 727, 730 | "Central bushveld": 728, 731 | "Central Zambezian wet miombo woodlands": 729, 732 | "Cerrado": 730, 733 | "Clipperton Island shrub and grasslands": 731, 734 | "Drakensberg Escarpment savanna and thicket": 732, 735 | "Drakensberg grasslands": 733, 736 | "Dry Chaco": 734, 737 | "East Sudanian savanna": 735, 738 | "Einasleigh upland savanna": 736, 739 | "Guianan savanna": 737, 740 | "Guinean forest-savanna": 738, 741 | "Hawai'i tropical high shrublands": 739, 742 | "Hawai'i tropical low shrublands": 740, 743 | "Horn of Africa xeric bushlands": 741, 744 | "Humid Chaco": 742, 745 | "Itigi-Sumbu thicket": 743, 746 | "Kalahari Acacia woodlands": 744, 747 | "Kimberly tropical savanna": 745, 748 | "Limpopo lowveld": 746, 749 | "Llanos": 747, 750 | "Mandara Plateau woodlands": 748, 751 | "Masai xeric grasslands and shrublands": 749, 752 | "Miskito pine forests": 750, 753 | "Mitchell Grass Downs": 751, 754 | "Northern Acacia-Commiphora bushlands and thickets": 752, 755 | "Northern Congolian Forest-Savanna": 753, 756 | "Northwest Hawai'i scrub": 754, 757 | "Sahelian Acacia savanna": 755, 758 | "Serengeti volcanic grasslands": 756, 759 | "Somali Acacia-Commiphora bushlands and thickets": 757, 760 | "Southern Acacia-Commiphora bushlands and thickets": 758, 761 | "Southern Congolian forest-savanna": 759, 762 | "Terai-Duar savanna and grasslands": 760, 763 | "Trans Fly savanna and grasslands": 761, 764 | "Uruguayan savanna": 762, 765 | "Victoria Plains tropical savanna": 763, 766 | "West Sudanian savanna": 764, 767 | "Western Gulf coastal grasslands": 765, 768 | "Zambezian-Limpopo mixed woodlands": 766, 769 | "Zambezian Baikiaea woodlands": 767, 770 | "Dry miombo woodlands": 768, 771 | "Zambezian mopane woodlands": 769, 772 | "St. 
Helena scrub and woodlands": 770, 773 | "Victoria Basin forest-savanna": 771, 774 | "Western Congolian forest-savanna": 772, 775 | "Southwest Arabian montane woodlands and grasslands": 773, 776 | "South Arabian fog woodlands, shrublands, and dune": 774, 777 | "Apure-Villavicencio dry forests": 775, 778 | "Baj\u00edo dry forests": 776, 779 | "Balsas dry forests": 777, 780 | "Bolivian montane dry forests": 778, 781 | "Caatinga": 779, 782 | "Cape Verde Islands dry forests": 780, 783 | "Cauca Valley dry forests": 781, 784 | "Central American dry forests": 782, 785 | "Central Deccan Plateau dry deciduous forests": 783, 786 | "Central Indochina dry forests": 784, 787 | "Chhota-Nagpur dry deciduous forests": 785, 788 | "Chiapas Depression dry forests": 786, 789 | "Chiquitano dry forests": 787, 790 | "Cuban dry forests": 788, 791 | "East Deccan dry-evergreen forests": 789, 792 | "Ecuadorian dry forests": 790, 793 | "Fiji tropical dry forests": 791, 794 | "Hawai'i tropical dry forests": 792, 795 | "Hispaniolan dry forests": 793, 796 | "Irrawaddy dry forests": 794, 797 | "Islas Revillagigedo dry forests": 795, 798 | "Jalisco dry forests": 796, 799 | "Jamaican dry forests": 797, 800 | "Khathiar-Gir dry deciduous forests": 798, 801 | "Lara-Falc\u00f3n dry forests": 799, 802 | "Lesser Sundas deciduous forests": 800, 803 | "Madagascar dry deciduous forests": 801, 804 | "Magdalena Valley dry forests": 802, 805 | "Maracaibo dry forests": 803, 806 | "Maranh\u00e3o Baba\u00e7u forests": 804, 807 | "Marianas tropical dry forests": 805, 808 | "Narmada Valley dry deciduous forests": 806, 809 | "New Caledonia dry forests": 807, 810 | "North Deccan dry deciduous forests": 808, 811 | "Panamanian dry forests": 809, 812 | "Pat\u00eda valley dry forests": 810, 813 | "Puerto Rican dry forests": 811, 814 | "Sierra de la Laguna dry forests": 812, 815 | "Sinaloan dry forests": 813, 816 | "Sin\u00fa Valley dry forests": 814, 817 | "Sonoran-Sinaloan subtropical dry forest": 815, 818 | "South Deccan Plateau dry deciduous forests": 816, 819 | "Southeast Indochina dry evergreen forests": 817, 820 | "Southern Pacific dry forests": 818, 821 | "Southern Vietnam lowland dry forests": 819, 822 | "Sri Lanka dry-zone dry evergreen forests": 820, 823 | "Sumba deciduous forests": 821, 824 | "Timor and Wetar deciduous forests": 822, 825 | "Tumbes-Piura dry forests": 823, 826 | "Veracruz dry forests": 824, 827 | "Yap tropical dry forests": 825, 828 | "Yucat\u00e1n dry forests": 826, 829 | "Zambezian evergreen dry forests": 827, 830 | "Brazilian Atlantic dry forests": 828, 831 | "Trinidad and Tobago dry forest": 829, 832 | "Lesser Antillean dry forests": 830, 833 | "Bermuda subtropical conifer forests": 831, 834 | "Central American pine-oak forests": 832, 835 | "Cuban pine forests": 833, 836 | "Himalayan subtropical pine forests": 834, 837 | "Hispaniolan pine forests": 835, 838 | "Luzon tropical pine forests": 836, 839 | "Northeast India-Myanmar pine forests": 837, 840 | "Sierra de la Laguna pine-oak forests": 838, 841 | "Sierra Madre de Oaxaca pine-oak forests": 839, 842 | "Sierra Madre del Sur pine-oak forests": 840, 843 | "Sierra Madre Occidental pine-oak forests": 841, 844 | "Sierra Madre Oriental pine-oak forests": 842, 845 | "Sumatran tropical pine forests": 843, 846 | "Trans-Mexican Volcanic Belt pine-oak forests": 844, 847 | "Bahamian pineyards": 845 848 | } -------------------------------------------------------------------------------- /stats/realm_stats.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "realm1": { 3 | "Tundra": 36, 4 | "Tropical & Subtropical Moist Broadleaf Forests": 81, 5 | "Mediterranean Forests, Woodlands & Scrub": 5, 6 | "Deserts & Xeric Shrublands": 27, 7 | "Temperate Grasslands, Savannas & Shrublands": 23, 8 | "Boreal Forests/Taiga": 15, 9 | "Temperate Conifer Forests": 24, 10 | "Temperate Broadleaf & Mixed Forests": 21, 11 | "Montane Grasslands & Shrublands": 9, 12 | "Mangroves": 7, 13 | "Flooded Grasslands & Savannas": 8, 14 | "Tropical & Subtropical Grasslands, Savannas & Shrublands": 12, 15 | "Tropical & Subtropical Dry Broadleaf Forests": 33, 16 | "Tropical & Subtropical Coniferous Forests": 11 17 | }, 18 | "realm2": { 19 | "Tundra": 15, 20 | "Tropical & Subtropical Moist Broadleaf Forests": 60, 21 | "Mediterranean Forests, Woodlands & Scrub": 35, 22 | "Deserts & Xeric Shrublands": 70, 23 | "Temperate Grasslands, Savannas & Shrublands": 25, 24 | "Boreal Forests/Taiga": 11, 25 | "Temperate Conifer Forests": 21, 26 | "Temperate Broadleaf & Mixed Forests": 59, 27 | "Montane Grasslands & Shrublands": 36, 28 | "Mangroves": 7, 29 | "Flooded Grasslands & Savannas": 16, 30 | "Tropical & Subtropical Grasslands, Savannas & Shrublands": 42, 31 | "Tropical & Subtropical Dry Broadleaf Forests": 7 32 | } 33 | } -------------------------------------------------------------------------------- /stats/total_area_biome.json: -------------------------------------------------------------------------------- 1 | { 2 | "Deserts & Xeric Shrublands": 26323247.410199884, 3 | "Tropical & Subtropical Grasslands, Savannas & Shrublands": 21391107.38756527, 4 | "Tropical & Subtropical Moist Broadleaf Forests": 19580104.74197682, 5 | "Boreal Forests/Taiga": 15316577.636974096, 6 | "Temperate Broadleaf & Mixed Forests": 12559792.49171971, 7 | "Temperate Grasslands, Savannas & Shrublands": 10571603.918123659, 8 | "Tundra": 8526394.453534998, 9 | "Montane Grasslands & Shrublands": 4872379.374186906, 10 | "Tropical & Subtropical Dry Broadleaf Forests": 3877948.9100976833, 11 | "Temperate Conifer Forests": 3761948.527841549, 12 | "Mediterranean Forests, Woodlands & Scrub": 3296014.700088584, 13 | "Flooded Grasslands & Savannas": 1153800.8216760682, 14 | "Tropical & Subtropical Coniferous Forests": 683071.888304758, 15 | "Mangroves": 332287.848897852 16 | } -------------------------------------------------------------------------------- /tmp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalned/MMEarth-data/32fe297d76681cc9b1791239756f93ce027007b0/tmp.py -------------------------------------------------------------------------------- /utils/biome_data_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | File that has code to read the resolve ecoregions geojson file and extract the biome names and eco-regions. 3 | It also writes the biome names to a json file. 
4 | ''' 5 | 6 | import geojson 7 | from tqdm import tqdm 8 | import json 9 | 10 | def read_geojson(filename): 11 | with open(filename) as f: 12 | gj = geojson.load(f) 13 | return gj 14 | 15 | def get_biome_data(): 16 | path = "../datasets/RESOLVE_ecoregions.geojson" 17 | gj = read_geojson(path) 18 | 19 | data = gj['features'][0]['properties'] 20 | 21 | # getting all the biome names, and the list of eco_regions 22 | print('List of keys: ', data.keys()) 23 | 24 | biome_names = {} 25 | 26 | eco_count = 0 27 | for i in tqdm(range(len(gj['features']))): 28 | if gj['features'][i]['properties']['BIOME_NAME'] not in biome_names.keys(): 29 | biome_names[gj['features'][i]['properties']['BIOME_NAME']] = [] 30 | 31 | biome_names[gj['features'][i]['properties']['BIOME_NAME']].append([gj['features'][i]['properties']['ECO_NAME'], gj['features'][i]['properties']['REALM']]) 32 | eco_count += 1 33 | 34 | print('Total number of biomes', len(biome_names.keys())) 35 | print('Total number of eco-regions', eco_count) 36 | # writing the biome names to a json file 37 | import json 38 | 39 | with open('stats/biome_names.json', 'w') as fp: 40 | json.dump(biome_names, fp) 41 | 42 | 43 | 44 | if __name__ == '__main__': 45 | import os 46 | os.makedirs('stats', exist_ok=True) 47 | get_biome_data() 48 | -------------------------------------------------------------------------------- /utils/chunking_h5.py: -------------------------------------------------------------------------------- 1 | # take an existing h5 file, and create a new one with the same data but by using chunks. 2 | 3 | import h5py 4 | import numpy as np 5 | import os 6 | import argparse 7 | 8 | 9 | def create_h5_file_with_chunks(h5_file_path = '', chunk_size = 1): 10 | """ 11 | Create a new h5 file with the same data as the original file, but with the specified chunk size. 12 | :param h5_file_path: path to the original h5 file. 13 | :param chunk_size: chunk size to use. 
14 | :return: None 15 | """ 16 | # open the original file 17 | h5 = h5py.File(h5_file_path, 'r') 18 | keys = list(h5.keys()) 19 | # create a new file 20 | name = h5_file_path.split('/')[-1][:-3] 21 | new_h5_file_path = os.path.join(os.path.dirname(h5_file_path), name + '_chunked_gzip.h5') 22 | print(new_h5_file_path) 23 | if os.path.exists(new_h5_file_path): 24 | os.remove(new_h5_file_path) 25 | 26 | new_h5 = h5py.File(new_h5_file_path, 'w') 27 | # the dataset is too big to load into memory, so we will iterate over the keys 28 | meta = h5['metadata'] 29 | num_samples = meta.shape[0] 30 | for key in keys: 31 | print('creating dataset for key: ', key) 32 | shape = h5[key].shape 33 | tmp = new_h5.create_dataset(key, shape = h5[key].shape, dtype = h5[key].dtype, chunks = (chunk_size, *shape[1:]), compression = 'gzip') 34 | 35 | # iterate over the samples 36 | for i in range(num_samples): 37 | if i % 1000 == 0: 38 | print(key, i) 39 | # get the sample 40 | sample = h5[key][i] 41 | # write the sample to the new file 42 | tmp[i] = sample 43 | 44 | 45 | 46 | 47 | # close the files 48 | h5.close() 49 | new_h5.close() 50 | 51 | 52 | if __name__ == '__main__': 53 | 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('--h5_file_path', type=str, help='path to the h5 file', required=True) 56 | parser.add_argument('--chunk_size', type=int, help='chunk size to use', default=1) 57 | args = parser.parse_args() 58 | 59 | create_h5_file_with_chunks(h5_file_path = args.h5_file_path, chunk_size = args.chunk_size) -------------------------------------------------------------------------------- /utils/convert_to_h5.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A python script to do the pre-processing and convert the data to h5 format. It will then be ready to be used by the dataloader for training. 
3 | ''' 4 | 5 | 6 | import argparse 7 | import os 8 | import h5py 9 | import numpy as np 10 | import json 11 | import tifffile as tiff 12 | 13 | 14 | 15 | 16 | MODALITIES = { 17 | 'sentinel2': {'dtype': 'uint16', 'n_bands': 13, 'bands': ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8A', 'B8', 'B9', 'B10', 'B11', 'B12']}, 18 | 'sentinel2_cloudmask': {'dtype': 'uint16', 'n_bands': 1, 'bands': ['QA60']}, 19 | 'sentinel2_cloudprod': {'dtype': 'uint16', 'n_bands': 1, 'bands': ['MSK_CLDPRB']}, 20 | 'sentinel2_scl': {'dtype': 'uint8', 'n_bands': 1, 'bands': ['SCL']}, 21 | 'sentinel1': {'dtype': 'float32', 'n_bands': 8, 'bands': ['asc_VV', 'asc_VH', 'asc_HH', 'asc_HV', 'desc_VV', 'desc_VH', 'desc_HH', 'desc_HV']}, 22 | 'aster': {'dtype': 'int16', 'n_bands': 2, 'bands': ['elevation', 'slope']}, 23 | 'era5': {'dtype': 'float32', 'n_bands': 12, 'bands': ['prev_month_avg_temp', 'prev_month_min_temp', 'prev_month_max_temp', 'prev_month_total_precip', 'curr_month_avg_temp', 'curr_month_min_temp', 'curr_month_max_temp', 'curr_month_total_precip', 'year_avg_temp', 'year_min_temp', 'year_max_temp', 'year_total_precip']}, 24 | 'dynamic_world': {'dtype': 'uint8', 'n_bands': 1, 'bands': ['landcover']}, 25 | 'canopy_height_eth': {'dtype': 'int8', 'n_bands': 2, 'bands': ['height', 'std']}, 26 | 'lat': {'dtype': 'float32', 'n_bands': 2, 'bands': ['sin', 'cos']}, 27 | 'lon': {'dtype': 'float32', 'n_bands': 2, 'bands': ['sin', 'cos']}, 28 | 'biome': {'dtype': 'uint8', 'n_bands': 1}, 29 | 'eco_region': {'dtype': 'uint16', 'n_bands': 1}, 30 | 'month': {'dtype': 'float32', 'n_bands': 2, 'bands': ['sin', 'cos']}, 31 | 'esa_worldcover':{ 'dtype': 'uint8', 'n_bands': 1, 'bands': ['map']} 32 | } 33 | 34 | sentinel2_bands = [ 35 | 'B1', 36 | 'B2', 37 | 'B3', 38 | 'B4', 39 | 'B5', 40 | 'B6', 41 | 'B7', 42 | 'B8A', 43 | 'B8', 44 | 'B9', 45 | 'B10', 46 | 'B11', 47 | 'B12', 48 | ] 49 | 50 | # MODALITIES_IN_IMAGE = [ 51 | # 'sentinel2', 52 | # 'sentinel1_asc', 53 | # 'sentinel1_desc', 54 | # 'aster', 55 | # 'dynamic_world', 56 | # 'canopy_height_eth', 57 | # 'esa_worldcover' 58 | 59 | # ] 60 | 61 | variables = {} 62 | remove = [] 63 | 64 | 65 | 66 | def read_data(tile_id, tile_info, data_dir, img_size, exisiting_datasets=None): 67 | tile_info_bands = tile_info['BANDS'] 68 | type = tile_info['S2_type'] 69 | try: 70 | data = tiff.imread(os.path.join(data_dir, tile_id + '.tif')) 71 | except: 72 | return None 73 | return_data_dict = {} 74 | count = 0 # this represents the count of the bands, we use this since all the bands are stacked in the same order 75 | 76 | # creating a center crop of size img_size 77 | start_x = (data.shape[0] - img_size) // 2 78 | start_y = (data.shape[1] - img_size) // 2 79 | if len(data.shape) == 2: 80 | data = np.expand_dims(data, axis=2) 81 | data = data[start_x:start_x + img_size, start_y:start_y + img_size, :] 82 | for modality, modality_info in MODALITIES.items(): 83 | if exisiting_datasets is not None and modality not in exisiting_datasets: 84 | continue 85 | 86 | 87 | ### SENTINEL 2 ### 88 | if modality == 'sentinel2': 89 | placeholder = np.zeros((13, img_size, img_size), dtype='uint16') 90 | count += len(tile_info_bands['sentinel2']) - 3 if type == 'l2a' else len(tile_info_bands['sentinel2']) - 1 91 | for i, band in enumerate(sentinel2_bands): 92 | if band in tile_info_bands['sentinel2']: 93 | placeholder[i] = np.expand_dims(data[:, :, tile_info_bands['sentinel2'].index(band)], 2).transpose(2, 0, 1) 94 | return_data_dict['sentinel2'] = placeholder 95 | 96 | if modality == 'sentinel2_cloudmask': 97 
| if 'QA60' in tile_info_bands['sentinel2']: 98 | return_data_dict['sentinel2_cloudmask'] = np.expand_dims(data[:, :, tile_info_bands['sentinel2'].index('QA60')], 2).transpose(2, 0, 1) 99 | count += 1 100 | else: 101 | return_data_dict['sentinel2_cloudmask'] = np.ones((1, img_size, img_size), dtype='uint16') * 65535 102 | 103 | if modality == 'sentinel2_cloudprod': 104 | 105 | if 'MSK_CLDPRB' in tile_info_bands['sentinel2']: 106 | count += 1 107 | return_data_dict['sentinel2_cloudprod'] = np.expand_dims(data[:, :, tile_info_bands['sentinel2'].index('MSK_CLDPRB')], 2).transpose(2, 0, 1) 108 | else: 109 | return_data_dict['sentinel2_cloudprod'] = np.ones((1, img_size, img_size), dtype='uint16') * 65535 110 | 111 | 112 | if modality == 'sentinel2_scl': 113 | if 'SCL' in tile_info_bands['sentinel2']: 114 | count += 1 115 | return_data_dict['sentinel2_scl'] = np.expand_dims(data[:, :, tile_info_bands['sentinel2'].index('SCL')], 2).transpose(2, 0, 1) 116 | else: 117 | return_data_dict['sentinel2_scl'] = np.ones((1, img_size, img_size), dtype='uint8') * 255 118 | 119 | 120 | 121 | ### SENTINEL 1 ### 122 | if modality == 'sentinel1': 123 | bands_map = {'VV': 0, 'VH': 1, 'HH': 2, 'HV': 3} 124 | orbit_map = {'asc': 0, 'desc': 4} 125 | tmp = np.ones((8, img_size, img_size), dtype='float32') * float('-inf') 126 | for orbit in ['asc', 'desc']: 127 | if tile_info_bands[f'sentinel1_{orbit}'] is not None: 128 | bands_img = tile_info_bands[f'sentinel1_{orbit}'] 129 | count += len(bands_img) 130 | for i, band in enumerate(bands_img): 131 | tmp[orbit_map[orbit] + bands_map[band]] = data[:, :, count - len(bands_img) + i] 132 | 133 | return_data_dict['sentinel1'] = tmp 134 | 135 | ### ASTER ### 136 | if modality == 'aster': 137 | count += len(tile_info_bands['aster']) 138 | return_data_dict['aster'] = data[:, :, count - len(tile_info_bands['aster']):count].transpose(2, 0, 1) 139 | 140 | ### ERA5 ### 141 | if modality == 'era5': 142 | 143 | era_data = tile_info['era5'] 144 | 145 | return_data_dict['era5'] = np.stack([era_data['month1'] + era_data['month2'] + era_data['year']], axis=0).astype('float32') 146 | 147 | ### DYNAMIC WORLD ### 148 | if modality == 'dynamic_world': 149 | try: 150 | count += len(tile_info_bands['dynamic_world']) 151 | return_data_dict['dynamic_world'] = np.expand_dims(data[:, :, count - len(tile_info_bands['dynamic_world'])], axis = 0) 152 | except Exception as e: 153 | print('dynamic_world not found') 154 | return_data_dict['dynamic_world'] = np.zeros((1, img_size, img_size), dtype='uint8') 155 | 156 | 157 | ### CANOPY HEIGHT ### 158 | if modality == 'canopy_height_eth': 159 | count += len(tile_info_bands['canopy_height_eth']) 160 | tmp = data[:, :, count - len(tile_info_bands['canopy_height_eth']):count].transpose(2, 0, 1) 161 | if tmp.shape[0] == 0: 162 | return_data_dict['canopy_height_eth'] = np.ones((2, img_size, img_size), dtype='int8') * 255 163 | else: 164 | return_data_dict['canopy_height_eth'] = data[:, :, count - len(tile_info_bands['canopy_height_eth']):count].transpose(2, 0, 1) 165 | 166 | 167 | 168 | ### LATITUDE ### 169 | if modality == 'lat': 170 | return_data_dict['lat'] = np.stack([np.sin(np.deg2rad(tile_info['lat'])), np.cos(np.deg2rad(tile_info['lat']))], axis=0).astype('float32') 171 | 172 | 173 | ### LONGITUDE ### 174 | if modality == 'lon': 175 | return_data_dict['lon'] = np.stack([np.sin(np.deg2rad(tile_info['lon'])), np.cos(np.deg2rad(tile_info['lon']))], axis=0).astype('float32') 176 | 177 | ### BIOME ### 178 | if modality == 'biome': 179 | biome = 
tile_info['biome'] 180 | one_hot = np.zeros(14) 181 | one_hot[biome] = 1 182 | return_data_dict['biome'] = one_hot.astype('uint8') 183 | 184 | ### ECO-REGION ### 185 | if modality == 'eco_region': 186 | eco_region = tile_info['eco_region'] 187 | one_hot = np.zeros(846) 188 | one_hot[eco_region] = 1 189 | return_data_dict['eco_region'] = one_hot.astype('uint16') 190 | 191 | ### MONTH ### 192 | if modality == 'month': 193 | month = tile_info['S2_DATE'].split('-')[1] 194 | month = int(month) 195 | return_data_dict['month'] = np.stack([np.sin(2 * np.pi * month / 12), np.cos(2 * np.pi * month / 12)], axis=0).astype('float32') 196 | 197 | ## ESA WORLD COVER ### 198 | if modality == 'esa_worldcover': 199 | try: 200 | count += len(tile_info_bands['esa_worldcover']) 201 | return_data_dict['esa_worldcover'] = np.expand_dims(data[:, :, count - len(tile_info_bands['esa_worldcover'])], axis = 0) 202 | 203 | except Exception as e: 204 | print('esa_worldcover not found') 205 | print('exception: ',e) 206 | exit() 207 | return_data_dict['esa_worldcover'] = np.ones((1, img_size, img_size), dtype='uint8') * 255 208 | 209 | return return_data_dict 210 | 211 | 212 | def main(args): 213 | 214 | mode = args.mode 215 | print('Mode: ', mode) 216 | if mode == 'create': 217 | data_dir = args.data_dir 218 | data_dir = os.path.join(data_dir, 'merged') 219 | img_size = args.image_size 220 | 221 | # we first check if the output file already exists, if it does, we delete it 222 | if os.path.exists(args.output_file): 223 | os.remove(args.output_file) 224 | 225 | hdf5_file = h5py.File(args.output_file, 'a') 226 | 227 | num_tiles = len(os.listdir(data_dir)) 228 | 229 | tile_info = json.load(open(args.tile_info, 'r')) 230 | 231 | 232 | # we calculate the real number of tiles by verifying how many tiles 233 | # have number of bands equal to the sum of the bands in the tile_info.json file 234 | 235 | num_tiles = 0 236 | for i, tile_id in enumerate(tile_info): 237 | count_t = 0 238 | for b in ['sentinel2', 'sentinel1_asc', 'sentinel1_desc', 'aster', 'dynamic_world', 'canopy_height_eth', 'esa_worldcover']: 239 | if b in tile_info[tile_id]['BANDS']: 240 | count_t += len(tile_info[tile_id]['BANDS'][b]) if tile_info[tile_id]['BANDS'][b] is not None else 0 241 | 242 | try: 243 | data = tiff.imread(os.path.join(data_dir, tile_id + '.tif')) 244 | except: 245 | # sometimes the data is not downloaded, so we skip it 246 | continue 247 | if data.shape[-1] == count_t: 248 | num_tiles += 1 249 | else: 250 | # we store the lat lon in a csv file to check the tiles that have not been downloaded 251 | print('Tile shape mismatch: ', tile_id) 252 | lat, lon = tile_info[tile_id]['lat'], tile_info[tile_id]['lon'] 253 | # with open(args.missing_tiles, 'a') as f: 254 | # f.write(f'{tile_id},{lat},{lon}\n') 255 | 256 | # num_tiles = len(tile_info) 257 | print('Number of tiles: ', num_tiles) 258 | print('Number of entries in tile_info: ', len(tile_info)) 259 | print('Number of tiles skipped due to mismatch: ', len(tile_info) - num_tiles) 260 | # exit() 261 | 262 | 263 | 264 | # creating a dataset for each modality 265 | for modality, modality_info in MODALITIES.items(): 266 | if modality == 'lat' or modality == 'lon' or modality == 'month': 267 | variables[modality] = hdf5_file.create_dataset(modality, shape=(num_tiles, 2), dtype=modality_info['dtype'], compression='gzip', chunks=(1, 2)) 268 | elif modality == 'biome': 269 | variables[modality] = hdf5_file.create_dataset(modality, shape=(num_tiles, 14), dtype=modality_info['dtype'], compression='gzip', 
chunks=(1, 14)) 270 | elif modality == 'eco_region': 271 | variables[modality] = hdf5_file.create_dataset(modality, shape=(num_tiles, 846), dtype=modality_info['dtype'], compression='gzip', chunks=(1, 846)) 272 | elif modality == 'era5': 273 | variables[modality] = hdf5_file.create_dataset(modality, shape=(num_tiles, modality_info['n_bands']), dtype=modality_info['dtype'], compression='gzip', chunks=(1, modality_info['n_bands'])) 274 | else: 275 | variables[modality] = hdf5_file.create_dataset(modality, shape=(num_tiles, modality_info['n_bands'], img_size, img_size), dtype=modality_info['dtype'], compression='gzip', chunks=(1, modality_info['n_bands'], img_size, img_size)) 276 | 277 | 278 | # create a new meta data with tile_id and s2 type which is either l2a or l1c 279 | metadata_dt = np.dtype([('tile_id', 'S100'), ('S2_type', 'S10')]) # string of length 100 280 | ds_metadata = hdf5_file.create_dataset('metadata', shape=(num_tiles,), dtype=metadata_dt, compression='gzip', chunks=(1,)) 281 | 282 | # metadata_dt = np.dtype([('tile_id', 'S100')]) # string of length 100 283 | # ds_metadata = hdf5_file.create_dataset('metadata', shape=(num_tiles,), dtype=metadata_dt, compression='gzip', chunks=(1,)) 284 | 285 | j = 0 286 | count_s = 0 287 | for i, tile_id in enumerate(tile_info): 288 | 289 | print(f'Processing tile {i}/{num_tiles}, {tile_id}') 290 | count_t = 0 291 | 292 | for b in ['sentinel2', 'sentinel1_asc', 'sentinel1_desc', 'aster', 'dynamic_world', 'canopy_height_eth', 'esa_worldcover']: 293 | if b in tile_info[tile_id]['BANDS']: 294 | count_t += len(tile_info[tile_id]['BANDS'][b]) if tile_info[tile_id]['BANDS'][b] is not None else 0 295 | 296 | try: 297 | data = tiff.imread(os.path.join(data_dir, tile_id + '.tif')) 298 | except: 299 | print('Skipping tile: ', tile_id) 300 | continue 301 | 302 | if data.shape[-1] != count_t: 303 | print('mismatch') 304 | count_s += 1 305 | continue 306 | 307 | 308 | data_ = read_data(tile_id, tile_info[tile_id], data_dir, img_size) 309 | if data_ is None: 310 | print('Skipping tile: ', tile_id) 311 | continue 312 | for modality, _ in MODALITIES.items(): 313 | try: 314 | variables[modality][j] = data_[modality] 315 | except Exception as e: 316 | print('Error in modality: ', modality) 317 | print(e) 318 | exit() 319 | # breakpoint() 320 | # ds_metadata[j] = tile_id 321 | ds_metadata[j] = (tile_id, tile_info[tile_id]['S2_type']) 322 | j += 1 323 | # exit() ## testing 324 | hdf5_file.close() 325 | print('Done!') 326 | print('number of tiles: ', num_tiles) 327 | 328 | print('Number of tiles skipped due to mismatch: ', count_s) 329 | else: 330 | # we are now merging 2 h5 files 331 | if os.path.exists(args.output_file): 332 | print('Output file already exists. 
') 333 | 334 | img_size = args.image_size 335 | 336 | file1 = h5py.File(args.path1, 'r') 337 | file2 = h5py.File(args.path2, 'r') 338 | tile_info1 = json.load(open(args.path1_tile_info, 'r')) 339 | tile_info2 = json.load(open(args.path2_tile_info, 'r')) 340 | 341 | size = len(file1['metadata']) + len(file2['metadata']) 342 | 343 | hdf5_file = h5py.File(args.output_path, 'a') 344 | 345 | # creating a dataset for each modality 346 | for modality, modality_info in MODALITIES.items(): 347 | if modality == 'lat' or modality == 'lon' or modality == 'month': 348 | variables[modality] = hdf5_file.create_dataset(modality, shape=(size, 2), dtype=modality_info['dtype']) 349 | elif modality == 'biome': 350 | variables[modality] = hdf5_file.create_dataset(modality, shape=(size, 14), dtype=modality_info['dtype']) 351 | elif modality == 'eco_region': 352 | variables[modality] = hdf5_file.create_dataset(modality, shape=(size, 846), dtype=modality_info['dtype']) 353 | elif modality == 'era5': 354 | variables[modality] = hdf5_file.create_dataset(modality, shape=(size, modality_info['n_bands']), dtype=modality_info['dtype']) 355 | else: 356 | variables[modality] = hdf5_file.create_dataset(modality, shape=(size, modality_info['n_bands'], img_size, img_size), dtype=modality_info['dtype']) 357 | 358 | metadata_dt = np.dtype([('tile_id', 'S100')]) # string of length 100 359 | ds_metadata = hdf5_file.create_dataset('metadata', shape=(size,), dtype=metadata_dt) 360 | 361 | j = 0 362 | for i in range(len(file1['metadata'])): 363 | print(f'Processing tile {i}/{len(file1["metadata"])}') 364 | # we append the data from the first file, and then the data from the second file 365 | for modality, _ in MODALITIES.items(): 366 | variables[modality][j] = file1[modality][i] 367 | ds_metadata[j] = file1['metadata'][i] 368 | j += 1 369 | 370 | for i in range(len(file2['metadata'])): 371 | print(f'Processing tile {i}/{len(file2["metadata"])}') 372 | # we append the data from the first file, and then the data from the second file 373 | for modality, _ in MODALITIES.items(): 374 | variables[modality][j] = file2[modality][i] 375 | ds_metadata[j] = file2['metadata'][i] 376 | j += 1 377 | 378 | hdf5_file.close() 379 | print('Done!') 380 | 381 | # we also need to merge the tile_info files 382 | tile_info = {} 383 | for i in range(len(file1['metadata'])): 384 | tile = file1['metadata'][i][0].decode('utf-8') 385 | tile_info[tile] = tile_info1[tile] 386 | for i in range(len(file2['metadata'])): 387 | tile = file2['metadata'][i][0].decode('utf-8') 388 | tile_info[tile] = tile_info2[tile] 389 | with open(args.output_path.split('.')[0] + '_tile_info.json', 'w') as f: 390 | json.dump(tile_info, f) 391 | print('number of tiles: ', size) 392 | 393 | 394 | if __name__ == '__main__': 395 | parser = argparse.ArgumentParser(description='Convert the data to h5 format') 396 | parser.add_argument('--mode', type=str, required=True, help='append or create', choices= ['merge', 'create']) 397 | 398 | # args for create mode 399 | # required args 400 | parser.add_argument('--data_dir', type=str, default='', help='path to the data directory') 401 | # optional args 402 | parser.add_argument('--tile_info', type=str, default='', help='path to the tile info json file') 403 | parser.add_argument('--output_file', type=str, default='', help='path to the output h5 file') 404 | parser.add_argument('--missing_tiles', type=str, default='', help='path to the csv file containing the missing tiles') 405 | parser.add_argument('--image_size', type=int, default=128, 
help='size of the image') 406 | 407 | 408 | # args for merge mode 409 | # required args 410 | parser.add_argument('--data_dir1', type=str, default='', help='path to the first folder') 411 | parser.add_argument('--data_dir2', type=str, default='', help='path to the first folder') 412 | parser.add_argument('--output_path', type=str, help='path to the output h5 file') 413 | # optional args 414 | parser.add_argument('--path1', type=str, default='', help='path to the first h5 file') 415 | parser.add_argument('--path1_tile_info', type=str, default='', help='path to the tile info json file for the first h5 file') 416 | parser.add_argument('--path2', type=str, default='', help='path to the second h5 file') 417 | parser.add_argument('--path2_tile_info', type=str, default='', help='path to the tile info json file for the second h5 file') 418 | 419 | 420 | args = parser.parse_args() 421 | if args.mode == 'merge': 422 | assert args.output_path != '', 'Please provide the output path' 423 | assert args.data_dir1 != '', 'Please provide the path to the first folder' 424 | assert args.data_dir2 != '', 'Please provide the path to the second folder' 425 | 426 | name1 = args.data_dir1.split('/')[-1] if args.data_dir1[-1] != '/' else args.data_dir1.split('/')[-2] 427 | name2 = args.data_dir2.split('/')[-1] if args.data_dir2[-1] != '/' else args.data_dir2.split('/')[-2] 428 | if args.path1 == '': 429 | args.path1 = os.path.join(args.data_dir1, name1 + '.h5') 430 | if args.path1_tile_info == '': 431 | args.path1_tile_info = os.path.join(args.data_dir1, name1 + '_tile_info.json') 432 | if args.path2 == '': 433 | args.path2 = os.path.join(args.data_dir2, name2 + '.h5') 434 | if args.path2_tile_info == '': 435 | args.path2_tile_info = os.path.join(args.data_dir2, name2 + '_tile_info.json') 436 | 437 | if args.mode == 'create': 438 | assert args.data_dir != '', 'Please provide the path to the data directory' 439 | name = args.data_dir.split('/')[-1] if args.data_dir[-1] != '/' else args.data_dir.split('/')[-2] 440 | if args.tile_info == '': 441 | args.tile_info = os.path.join(args.data_dir, name + '_tile_info.json') 442 | if args.output_file == '': 443 | args.output_file = os.path.join(args.data_dir, name + '.h5') 444 | if args.missing_tiles == '': 445 | args.missing_tiles = os.path.join(args.data_dir, name + 'missing_tiles.csv') 446 | 447 | 448 | 449 | main(args) -------------------------------------------------------------------------------- /utils/normalization.py: -------------------------------------------------------------------------------- 1 | # this function reads the h5 file and computes the mean and std of each band 2 | # including the min and max, and saves it in a json file. 
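# A minimal illustrative sketch (not part of this file) of the accumulation scheme used below:
# per-channel sums S, sums of squares Q and valid-pixel counts n are kept while iterating over a
# subset of images, pixels equal to the no-data value are masked out, and the statistics follow as
# mean = S / n and std = sqrt(Q / n - mean**2). All names here are illustrative only.
import numpy as np

def running_band_stats(images, no_data):
    # images: sequence of (C, H, W) arrays; no_data: value to ignore (cf. NO_DATA_VAL below)
    C = images[0].shape[0]
    S = np.zeros(C, dtype=np.float64)
    Q = np.zeros(C, dtype=np.float64)
    n = np.zeros(C, dtype=np.float64)
    for img in images:
        img = np.float64(img)
        valid = img != no_data
        S += np.sum(img, axis=(1, 2), where=valid)
        Q += np.sum(img ** 2, axis=(1, 2), where=valid)
        n += valid.sum(axis=(1, 2))
    mean = S / n
    std = np.sqrt(Q / n - mean ** 2)
    return mean, std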
3 | 4 | 5 | import os 6 | import json 7 | import numpy as np 8 | import h5py 9 | from math import inf 10 | 11 | 12 | NO_DATA_VAL = { 13 | 'sentinel2': 0, 14 | 'sentinel2_cloudmask': 65535, 15 | 'sentinel2_cloudprod': 65535, 16 | 'sentinel2_scl': 255, 17 | 'sentinel1': float('-inf'), 18 | 'aster': float('-inf'), 19 | 'canopy_height_eth': 255, 20 | 'dynamic_world': 0, 21 | 'esa_worldcover': 255, 22 | 'lat': float('-inf'), 23 | 'lon': float('-inf'), 24 | 'month': float('-inf'), 25 | 'era5': float('inf') 26 | } 27 | 28 | # DATA_PATH = "/home/qbk152/vishal/global-lr/data/data_1M_130/data_1M_130.h5" 29 | # TILE_INFO = "/home/qbk152/vishal/global-lr/data/data_1M_130/data_1M_130_tile_info.json" 30 | # SUBSET_SIZE = 100000 ##### define subset size here, we only compute mean and std for a subset 31 | # STORE_PATH = "/home/qbk152/vishal/global-lr/data/data_1M_130/data_1M_130_band_stats.json" 32 | 33 | # DATA_PATH = "/projects/dereeco/data/global-lr/data_1M_130/data_1M_130.h5" 34 | # TILE_INFO = "/projects/dereeco/data/global-lr/data_1M_130/data_1M_130_tile_info.json" 35 | # SUBSET_SIZE = 100000 ##### define subset size here, we only compute mean and std for a subset 36 | # STORE_PATH = "/projects/dereeco/data/global-lr/data_1M_130/data_1M_130_band_stats.json" 37 | 38 | 39 | 40 | def compute_band_stats(data_folder = '', tile_info = '', store_path = ''): 41 | SUBSET_SIZE = 100000 ##### define subset size here, we only compute mean and std for a subset 42 | 43 | 44 | if data_folder == '': 45 | raise ValueError("Please provide the path to the data folder") 46 | 47 | name = data_folder.split('/')[-1] if data_folder[-1] != '/' else data_folder.split('/')[-2] 48 | data_path = os.path.join(data_folder, name + '.h5') 49 | tile_info = os.path.join(data_folder, name + '_tile_info.json') if tile_info == '' else tile_info 50 | store_path = os.path.join(data_folder, name + '_band_stats.json') if store_path == '' else store_path 51 | 52 | # since the number of images are large, we compute the rolling mean and std 53 | 54 | # read the tile info 55 | with open(tile_info, 'r') as f: 56 | tile_info = json.load(f) 57 | 58 | # read the h5 file 59 | f = h5py.File(data_path, 'r') 60 | 61 | meta = f['metadata'] 62 | bands = list(i for i in f.keys() if i != 'metadata') 63 | print(bands) 64 | 65 | 66 | print("number of images: ", len(meta)) 67 | 68 | return_dict = {} 69 | 70 | 71 | 72 | 73 | for band in bands: 74 | if band in ['lat', 'lon', 'month', 'era5']: 75 | print('computing stats for band: ', band) 76 | if band == 'era5': 77 | num_images = np.count_nonzero(~np.isnan(f[band]), axis=0) 78 | else: 79 | num_images = len(meta) 80 | data = f[band] 81 | mean = np.nansum(data, axis=0) / num_images 82 | std = np.sqrt(np.nansum((data - mean)**2, axis = 0) / num_images) 83 | min_val = np.nanmin(data, axis=0) 84 | max_val = np.nanmax(data, axis=0) 85 | return_dict[band] = { 86 | 'mean': list(mean.astype(float)), 87 | 'std': list(std.astype(float)), 88 | 'min': list(min_val.astype(float)), 89 | 'max': list(max_val.astype(float)) 90 | } 91 | print(return_dict[band]) 92 | continue 93 | 94 | 95 | if band not in ['sentinel2_cloudmask', 'sentinel2_cloudprod', 'sentinel2', 'sentinel1', 'aster', 'canopy_height_eth']: 96 | continue 97 | 98 | 99 | print('computing stats for band: ', band) 100 | num_images = len(meta) 101 | 102 | subset_size = min(SUBSET_SIZE, num_images) 103 | C = f[band].shape[1] 104 | channel_sums = np.zeros(C, dtype=np.float64) 105 | channel_sums_squared = np.zeros(C, dtype=np.float64) 106 | count_ = np.zeros(C, 
dtype=np.float64) 107 | min_val = np.ones(C, dtype=np.float64)*float('inf') 108 | max_val = np.ones(C, dtype=np.float64)*float('-inf') 109 | max_range = 1.7e308 110 | 111 | 112 | inf_values = 0 113 | # set numpy seed 114 | np.random.seed(0) 115 | indices = np.random.randint(0, num_images, size=subset_size) 116 | 117 | if 'sentinel2' in band: 118 | channel_sums_l2a = np.zeros(C, dtype=np.float64) 119 | channel_sums_squared_l2a = np.zeros(C, dtype=np.float64) 120 | count_l2a = np.zeros(C, dtype=np.float64) 121 | channel_sums_l1c = np.zeros(C, dtype=np.float64) 122 | channel_sums_squared_l1c = np.zeros(C, dtype=np.float64) 123 | count_l1c = np.zeros(C, dtype=np.float64) 124 | min_val_l2a = np.ones(C, dtype=np.float64)*float('inf') 125 | max_val_l2a = np.ones(C, dtype=np.float64)*float('-inf') 126 | min_val_l1c = np.ones(C, dtype=np.float64)*float('inf') 127 | max_val_l1c = np.ones(C, dtype=np.float64)*float('-inf') 128 | max_range = 1.7e308 129 | 130 | for idx, i in enumerate(indices): 131 | name = meta[i][0].decode('utf-8') 132 | if tile_info[name]['S2_type'] == "l2a": 133 | image = np.float64(f[band][i]) 134 | channel_sums_l2a += np.sum(image, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 135 | channel_sums_squared_l2a += np.sum(image**2, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 136 | count_l2a += np.sum(image != NO_DATA_VAL[band], axis=(1, 2)) 137 | tmp_img = np.where(image == NO_DATA_VAL[band], np.nan, image) 138 | min_val_l2a = np.nanmin([min_val_l2a, np.nanmin(tmp_img, axis=(1, 2))], axis=0) 139 | max_val_l2a = np.nanmax([max_val_l2a, np.nanmax(tmp_img, axis=(1, 2))], axis=0) 140 | 141 | 142 | 143 | elif tile_info[name]['S2_type'] == "l1c": 144 | image = np.float64(f[band][i]) 145 | channel_sums_l1c += np.sum(image, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 146 | channel_sums_squared_l1c += np.sum(image**2, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 147 | count_l1c += np.sum(image != NO_DATA_VAL[band], axis=(1, 2)) 148 | tmp_img = np.where(image == NO_DATA_VAL[band], np.nan, image) 149 | min_val_l1c = np.nanmin([min_val_l1c, np.nanmin(tmp_img, axis=(1, 2))], axis=0) 150 | max_val_l1c = np.nanmax([max_val_l1c, np.nanmax(tmp_img, axis=(1, 2))], axis=0) 151 | 152 | mean_l2a = channel_sums_l2a/count_l2a 153 | std_l2a = np.sqrt((channel_sums_squared_l2a/count_l2a) - mean_l2a**2) 154 | mean_l1c = channel_sums_l1c/count_l1c 155 | std_l1c = np.sqrt((channel_sums_squared_l1c/count_l1c) - mean_l1c**2) 156 | return_dict[band + "_l2a"] = { 157 | 'mean': list(mean_l2a.astype(float)), 158 | 'std': list(std_l2a.astype(float)), 159 | 'min': list(min_val_l2a.astype(float)), 160 | 'max': list(max_val_l2a.astype(float)) 161 | } 162 | return_dict[band + "_l1c"] = { 163 | 'mean': list(mean_l1c.astype(float)), 164 | 'std': list(std_l1c.astype(float)), 165 | 'min': list(min_val_l1c.astype(float)), 166 | 'max': list(max_val_l1c.astype(float)) 167 | } 168 | print(return_dict[band + "_l2a"]) 169 | print(return_dict[band + "_l1c"]) 170 | 171 | else: 172 | 173 | for i in indices: 174 | 175 | image = np.float64(f[band][i]) 176 | tmp = np.sum(image, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 177 | 178 | channel_sums += tmp 179 | # if band == 'sentinel1': 180 | # channel_sums_squared += np.sum(image**2, axis=(1, 2), where=(image != NO_DATA_VAL[band] or image != float('-inf'))) 181 | # else: 182 | channel_sums_squared += np.sum(image**2, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 183 | if np.any(channel_sums_squared > max_range): 184 | print("channel_sums_squared: ", 
channel_sums_squared) 185 | print("index", i) 186 | raise OverflowError("channel_sums_squared is greater than max_range") 187 | 188 | # computing min and max 189 | # replace all no data with nan 190 | tmp_img = np.where(image == NO_DATA_VAL[band], np.nan, image) 191 | min_val = np.nanmin([min_val, np.nanmin(tmp_img, axis=(1, 2))], axis=0) 192 | max_val = np.nanmax([max_val, np.nanmax(tmp_img, axis=(1, 2))], axis=0) 193 | count_ += np.sum(image != NO_DATA_VAL[band], axis=(1, 2)) 194 | 195 | 196 | mean = channel_sums/count_ 197 | std = np.sqrt((channel_sums_squared/count_) - mean**2) 198 | 199 | return_dict[band] = { 200 | 'mean': list(mean.astype(float)), 201 | 'std': list(std.astype(float)), 202 | 'min': list(min_val.astype(float)), 203 | 'max': list(max_val.astype(float)) 204 | } 205 | 206 | print(return_dict[band]) 207 | # exit() 208 | 209 | with open(store_path, 'w') as f: 210 | json.dump(return_dict, f) 211 | 212 | 213 | 214 | if __name__ == "__main__": 215 | # DATA_PATH/TILE_INFO/STORE_PATH above are commented out, so read the data folder from the command line 216 | import sys 217 | compute_band_stats(data_folder = sys.argv[1]) -------------------------------------------------------------------------------- /utils/splits.py: -------------------------------------------------------------------------------- 1 | # a file that has functions to create the train, val, and test splits from a h5 file. 2 | # we only create a new file, with indices of the train, val, and test splits. 3 | 4 | 5 | import h5py 6 | import numpy as np 7 | import os 8 | import json 9 | 10 | # data_path = "/home/qbk152/vishal/global-lr/data/data_100k_130.h5" 11 | # tile_info = "/home/qbk152/vishal/global-lr/data/data_100k_130_tile_info.json" 12 | # store_path = "/home/qbk152/vishal/global-lr/data/data_100k_130_splits.json" 13 | 14 | 15 | def create_splits(data_folder = '', tile_info = '', store_path = '', train_split=1.0, val_split=0.0, test_split=0): 16 | 17 | if data_folder == '': 18 | raise ValueError("Please provide the path to the data folder") 19 | name = data_folder.split('/')[-1] if data_folder[-1] != '/' else data_folder.split('/')[-2] 20 | data_path = os.path.join(data_folder, name + '.h5') 21 | tile_info = os.path.join(data_folder, name + '_tile_info.json') if tile_info == '' else tile_info 22 | store_path = os.path.join(data_folder, name + '_splits.json') if store_path == '' else store_path 23 | 24 | 25 | # read the tile info 26 | with open(tile_info, 'r') as f: 27 | tile_info = json.load(f) 28 | 29 | # read the h5 file 30 | f = h5py.File(data_path, 'r') 31 | 32 | meta = f['metadata'] 33 | bands = list(i for i in f.keys() if i != 'metadata') 34 | print(bands) 35 | 36 | 37 | print("number of images: ", len(meta)) 38 | 39 | # create the splits 40 | num_images = len(meta) 41 | num_train = int(train_split * num_images) 42 | num_val = int(val_split * num_images) 43 | num_test = num_images - num_train - num_val 44 | 45 | # create the indices 46 | indices = np.arange(num_images) 47 | np.random.shuffle(indices) 48 | 49 | train_indices = indices[:num_train] 50 | val_indices = indices[num_train:num_train + num_val] 51 | test_indices = indices[num_train + num_val:] 52 | 53 | 54 | # create the splits 55 | splits = { 56 | 'train': train_indices.tolist(), 57 | 'val': val_indices.tolist(), 58 | 'test': test_indices.tolist() 59 | } 60 | 61 | # store the splits in a json file 62 | with open(store_path, 'w') as f: 63 | json.dump(splits, f) 64 | 65 | 66 | # # create the splits 67 | # train_split = {} 68 | # val_split = {} 69 | # test_split = {} 70 | 71 | # for band in bands: 72 | # print('creating splits for band: ', band) 73 | # data =
f[band] 74 | # train_split[band] = data[train_indices] 75 | # val_split[band] = data[val_indices] 76 | # test_split[band] = data[test_indices] 77 | 78 | # # create the metadata splits 79 | # train_meta = meta[train_indices] 80 | # val_meta = meta[val_indices] 81 | # test_meta = meta[test_indices] 82 | 83 | # # create the output file 84 | # f_out = h5py.File(store_path, 'w') 85 | 86 | # # write the metadata 87 | # train_meta_out = f_out.create_dataset('train_metadata', data=train_meta) 88 | # val_meta_out = f_out.create_dataset('val_metadata', data=val_meta) 89 | # test_meta_out = f_out.create_dataset('test_metadata', data=test_meta) 90 | 91 | # # write the data 92 | # for band in bands: 93 | # print('writing band: ', band) 94 | # train_out = f_out.create_dataset('train_' + band, data=train_split[band]) 95 | # val_out = f_out.create_dataset('val_' + band, data=val_split[band]) 96 | # test_out = f_out.create_dataset('test_' + band, data=test_split[band]) 97 | 98 | # # close the files 99 | # f.close() 100 | # f_out.close() 101 | 102 | 103 | if __name__ == '__main__': 104 | create_splits(data_path, tile_info, store_path) -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import geojson 3 | import logging 4 | import ee 5 | import os 6 | import glob 7 | import config.ee_init 8 | import json 9 | 10 | 11 | def read_geojson(path): 12 | ''' 13 | Reads the geojson file 14 | ''' 15 | with open(path) as f: 16 | gj = geojson.load(f) 17 | return gj 18 | 19 | def merge_dicts(in_path, out_path = 'data/data_100k_130_tile_info.json'): 20 | ''' 21 | Merges the dictionaries from the tile_info json files. 22 | ''' 23 | 24 | # reading all the tile json files 25 | tile_info_dict = {} 26 | for tile_name in glob.glob(f'{in_path}/tile_info_*'): 27 | tmp = read_geojson(tile_name) 28 | tile_info_dict = tmp | tile_info_dict 29 | 30 | # writing the merged dictionary to a file 31 | print(len(tile_info_dict)) 32 | with open(out_path, 'w') as f: 33 | geojson.dump(tile_info_dict, f) 34 | 35 | # remove the individual tile_info files 36 | # for tile_name in glob.glob('data/tile_info/tile_info_*'): 37 | # os.remove(tile_name) 38 | 39 | def create_missing_tiles_geojson(): 40 | ''' 41 | Creates the missing tiles geojson file. 42 | ''' 43 | 44 | missing_tiles_csv = 'data/missing_tiles_1M.csv' 45 | tile_geojson = '/home/qbk152/vishal/global-lr/tile_polygons/uni_biomes_only/tiles_1M_130.geojson' 46 | 47 | geojson = { 48 | 'type': 'FeatureCollection', 49 | 'features': [] 50 | } 51 | 52 | # reading the tile geojson file 53 | gj = read_geojson(tile_geojson) 54 | features = gj['features'] 55 | 56 | # reading the missing tiles csv file 57 | for line in open(missing_tiles_csv): 58 | line = line.strip() 59 | line = line.split(',') 60 | tile_name = line[0] 61 | print('Processing tile: ', tile_name) 62 | 63 | # finding the tile in the tile geojson file 64 | for feature in features: 65 | if feature['properties']['tile_id'] == tile_name: 66 | geojson['features'].append(feature) 67 | break 68 | 69 | # writing the geojson file 70 | print('Total missing tiles: ', len(geojson['features'])) 71 | with open('data/missing_tiles_1M.geojson', 'w') as f: 72 | json.dump(geojson, f) 73 | 74 | 75 | 76 | 77 | 78 | 79 | def update_tile_info(tile, ee_set_, tile_info = None): 80 | ''' 81 | Updates the tile information in the geojson file. 
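# A minimal illustrative sketch (not part of utils.py, the values below are made up) of the
# dict-union merge this function performs on the band information: `existing_bands | new_bands`
# keeps all keys from both dicts, and on a key collision the right-hand operand (the newly
# downloaded bands) wins.
existing_bands = {'sentinel2': ['B2', 'B3', 'B4'], 'aster': ['elevation', 'slope']}
new_bands = {'sentinel1_asc': ['VV', 'VH'], 'aster': ['elevation']}
merged = existing_bands | new_bands     # Python 3.9+ dict union, as used in update_tile_info and merge_dicts
print(merged['aster'])                  # ['elevation'] -- the right-hand side overrides on collision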
82 | ''' 83 | 84 | if tile_info is not None: 85 | # this implies that there already exists a tile_info file, and we need to read the information from that file and update it with the new information which for now is only the bands 86 | id = ee_set_.id 87 | existing_bands = tile_info['BANDS'] 88 | new_bands = ee_set_.img_bands 89 | bands = existing_bands | new_bands 90 | tile_info['BANDS'] = bands 91 | 92 | # # HARDCODED: adding the era5 data to the tile_info 93 | # if len(ee_set_.era5_data) > 0: 94 | # tile_info['era5'] = ee_set_.era5_data 95 | return tile_info 96 | else: 97 | return_dict = {} 98 | return_dict['S2_DATE'] = ee_set_.s2_date 99 | # return_dict['S2_IMAGEID'] = ee_set_.s2_imageid 100 | return_dict['S2_type'] = ee_set_.s2_type 101 | return_dict['CRS'] = ee_set_.crs 102 | return_dict['lat'] = ee_set_.lat 103 | return_dict['lon'] = ee_set_.lon 104 | return_dict['biome'] = ee_set_.biome 105 | return_dict['eco_region'] = ee_set_.eco_region 106 | return_dict['NO_DATA'] = ee_set_.no_data 107 | return_dict['BANDS'] = ee_set_.img_bands 108 | if len(ee_set_.era5_data) > 0: 109 | return_dict['era5'] = ee_set_.era5_data 110 | return return_dict 111 | 112 | 113 | 114 | def get_points_filter(roi, buffer_size=0): 115 | pnt_roi = roi.buffer(buffer_size, ee.ErrorMargin(1)).bounds() 116 | coord_list = ee.List(pnt_roi.coordinates().get(0)) 117 | b_left = ee.Geometry.Point(coord_list.get(0)) 118 | b_right = ee.Geometry.Point(coord_list.get(1)) 119 | t_right = ee.Geometry.Point(coord_list.get(2)) 120 | t_left = ee.Geometry.Point(coord_list.get(3)) 121 | 122 | points_filter = ee.Filter.And( 123 | ee.Filter.geometry(b_right), 124 | ee.Filter.geometry(t_left) 125 | ) 126 | 127 | return points_filter 128 | 129 | 130 | def get_ee_task_list(): 131 | ''' 132 | Gets the list of all the tasks in the EE project. 133 | ''' 134 | tasks = [] 135 | task_list = ee.data.getTaskList() 136 | for task in task_list: 137 | if task['state'] in ['RUNNING', 'READY']: 138 | tasks.append(task['id']) 139 | return tasks 140 | 141 | 142 | def read_txt(path): 143 | ''' 144 | Reads the txt file and returns a list of the lines in the file. 145 | ''' 146 | string_list = [] 147 | with open(path, 'r') as f: 148 | for line in f: 149 | string_list.append(line.strip()) 150 | return string_list 151 | 152 | def write_json(path, data): 153 | ''' 154 | Writes the data to the json file. 155 | ''' 156 | with open(path, 'w') as f: 157 | json.dump(data, f) 158 | 159 | def read_json(path): 160 | ''' 161 | Reads the json file and returns the data. 162 | ''' 163 | with open(path, 'r') as f: 164 | data = json.load(f) 165 | return data 166 | 167 | 168 | if __name__ == '__main__': 169 | merge_dicts('/home/qbk152/vishal/global-lr/data/data_300k_130_tile_info.json') 170 | # create_missing_tiles_geojson() --------------------------------------------------------------------------------
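A minimal sketch (not part of the repository; the file name below is hypothetical) of how the non-image modalities written by utils/convert_to_h5.py can be decoded when reading the resulting h5 file: month, lat and lon are stored as sin/cos pairs, while biome and eco_region are stored as one-hot vectors.

import h5py
import numpy as np

with h5py.File('data_1M_128.h5', 'r') as f:                      # hypothetical file name
    sin_m, cos_m = f['month'][0]                                  # cyclic encoding sin(2*pi*m/12), cos(2*pi*m/12)
    month = int(round(np.arctan2(sin_m, cos_m) * 12 / (2 * np.pi))) % 12
    month = 12 if month == 0 else month                           # month 12 wraps around to angle 0
    biome = int(np.argmax(f['biome'][0]))                         # one-hot over 14 biome classes
    eco_region = int(np.argmax(f['eco_region'][0]))               # one-hot over 846 eco-regions
    lat = np.degrees(np.arctan2(*f['lat'][0]))                    # lat/lon stored as (sin, cos) of the angle
    print(month, biome, eco_region, lat)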