├── .DS_Store ├── .gitignore ├── LICENSE ├── LICENSE-data ├── README.md ├── bash_scripts ├── data_100k_128.sh ├── data_1M_128.sh └── data_1M_64.sh ├── config ├── .DS_Store ├── config_data.yaml ├── config_tiles.yaml └── ee_init.py ├── create_tiles_polygon.py ├── data_exp ├── data_exp.py ├── density_maps.py └── view_h5.py ├── ee_utils ├── .DS_Store └── ee_data.py ├── main_download.py ├── post_download.py ├── redownload.py ├── requirements.txt ├── slurm_scripts ├── slurm_create_tiles.sh ├── slurm_download_parallel.sh ├── slurm_download_seq.sh ├── slurm_redownload_parallel.sh └── slurm_temp.sh ├── stats ├── biome_labels.json ├── biome_names.json ├── biome_stats.json ├── eco_labels.json ├── realm_stats.json ├── total_area_biome.json └── total_area_eco_region.json ├── tmp.py └── utils ├── biome_data_utils.py ├── chunking_h5.py ├── convert_to_h5.py ├── normalization.py ├── splits.py └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalned/MMEarth-data/32fe297d76681cc9b1791239756f93ce027007b0/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | outputs/ 3 | 1M_v001_plots/ 4 | data/ 5 | # data_*/ 6 | tests/ 7 | tile_polygons/ 8 | data_1000/ 9 | tile_info_* 10 | tiles_* 11 | tile_info/ 12 | __pycache__/ 13 | config/__pycache__/ 14 | ee_utils/__pycache__/ 15 | utils/__pycache__/ 16 | .DS_Store 17 | ee_utils/.DS_Store 18 | config/.DS_Store 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Vishal Nedungadi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE-data: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. 
Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. 
Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. 
Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 
216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 
296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. 
Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public 379 | licenses. Notwithstanding, Creative Commons may elect to apply one of 380 | its public licenses to material it publishes and in those instances 381 | will be considered the “Licensor.” The text of the Creative Commons 382 | public licenses is dedicated to the public domain under the CC0 Public 383 | Domain Dedication. Except for the limited purpose of indicating that 384 | material is shared under a Creative Commons public license or as 385 | otherwise permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the 393 | public licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. 396 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![MMEarth-logo](https://github.com/vishalned/MMEarth-data/assets/27778126/09675b82-ff9e-43be-9160-3267b948e941) 3 | 4 | 5 | 6 | 7 | 8 | # MMEarth - Data Downloading 9 | [![Project Website](https://img.shields.io/badge/Project%20Website-8A2BE2)](https://vishalned.github.io/mmearth) 10 | [![Paper](https://img.shields.io/badge/arXiv-2405.02771-blue)](https://arxiv.org/abs/2405.02771) 11 | [![Code - Models](https://img.shields.io/badge/Code%20--%20Model-darkgreen)](https://github.com/vishalned/MMEarth-train/tree/main) 12 | 13 | 14 | This repository contains scripts to download the data presented in the paper [MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial Representation Learning](https://arxiv.org/abs/2405.02771). The scripts are used to download large scale satellite data from different sensors and satellites (Sentinel-2, Sentinel-1, ERA5 - temperature & precipitation, Aster GDEM etc) which we call modalities. The data is downloaded from [Google Earth Engine](https://earthengine.google.com/). 15 | 16 | ## 📢 Latest Updates 17 | :fire::fire::fire: Last Updated on 2024.11.07 :fire::fire::fire: 18 | - MMEarth has been added to the [TorchGeo](https://torchgeo.readthedocs.io/en/latest/api/datasets.html#mmearth) datasets class. 19 | - **Paper accepted to ECCV 2024 !!** 20 | - Updated datasets to version v001. 21 | - Dataset fix: Removed duplicates and corrected ERA5 yearly statistics. 22 | - Fixed downloading scripts. 23 | 24 | 25 | ## Table of contents 26 | 1. [Data Download](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#data-download) 27 | 2. [Data Loading](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#data-loading) 28 | 3. [Getting Started](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#getting-started) 29 | 4. 
[Data Stacks](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#data-stacks) 30 | 5. [Code Structure](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#code-structure) 31 | 6. [Slurm Execution](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#slurm-execution) 32 | 7. [Citation](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#citation) 33 | 34 | ## Data Download 35 | The MMEarth data can be downloaded using the following links. To enable easier development with multi-modal data, we also provide 2 more "taster" datasets along with the original MMEarth data. The license for the data is [CC BY 4.0](https://github.com/vishalned/MMEarth-data/blob/main/LICENSE-data). 36 | 37 | :bangbang: **UPDATE: The new Version 001 data is ready to download.** 38 | 39 | | **Dataset** | **Image Size** | **Number of Tiles** | **Dataset size** | **Data Link** | **Bash Script** | 40 | | :---: | :---: | :---: | :---: | :---: | :---: | 41 | | MMEarth | 128x128 | 1.2M | 597GB | [download](https://sid.erda.dk/sharelink/ChL1BoVEyH) | [bash](https://github.com/vishalned/MMEarth-data/blob/main/bash_scripts/data_1M_128.sh)| 42 | | MMEarth64 | 64x64 | 1.2M | 152GB | [download](https://sid.erda.dk/sharelink/bX5JzPuwJF) | [bash](https://github.com/vishalned/MMEarth-data/blob/main/bash_scripts/data_1M_64.sh)| 43 | | MMEarth100k | 128x128 | 100k | 48GB | [download](https://sid.erda.dk/sharelink/CoaUojVXzu) | [bash](https://github.com/vishalned/MMEarth-data/blob/main/bash_scripts/data_100k_128.sh)| 44 | 45 | All 3 datasets have a similar structure, as shown below: 46 | 47 | . 48 | ├── data_1M_v001/ # root data directory 49 | │ ├── data_1M_v001.h5 # h5 file containing the data 50 | │ ├── data_1M_v001_band_stats.json # json file containing information about the bands present in the h5 file for each data stack 51 | │ ├── data_1M_v001_splits.json # json file containing information for train, val, test splits 52 | │ └── data_1M_v001_tile_info.json # json file containing additional meta information of each tile that was downloaded. 53 | 54 | 55 | ## Data Loading 56 | A sample Jupyter Notebook that shows an example of loading the data using PyTorch is [here](https://github.com/vishalned/MMEarth-train/blob/main/examples/data_loader_example.ipynb). Alternatively, the dataloader has also been added to [TorchGeo](https://torchgeo.readthedocs.io/en/latest/api/datasets.html#mmearth). 57 | 58 | ## Getting Started 59 | To get started with this repository, you can install the dependencies and packages with this command: 60 | 61 | ```sh 62 | pip install -r requirements.txt 63 | ``` 64 | 65 | Once this is done, you need to set up gcloud and earthengine to make the code work. Follow the steps below: 66 | - Earthengine requires the initialization of gcloud, so install gcloud by following the instructions from [here](https://cloud.google.com/sdk/docs/install). 67 | - Setting up earthengine on your local machine: run `earthengine authenticate`. 68 | - Setting up earthengine on a remote cluster: `earthengine authenticate` often doesn't work directly, since you will get multiple links to click, and these links 69 | won't work when opened in the browser on your local machine. Hence, run `earthengine authenticate --quiet`. Follow the instructions in your terminal 70 | and everything should work. An additional step is to add the project name in every file that initializes Earth Engine with `ee.Initialize(project='$PROJECT_NAME')` (a minimal sketch is shown below).
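As a reference, here is a minimal initialization sketch (it mirrors `config/ee_init.py`; the project name is a placeholder and should be replaced with the Google Cloud project you authenticated with):

```python
import ee

# Initialize Earth Engine with your own Google Cloud project.
# 'my-ee-project' is a placeholder, not a real project used by this repository.
ee.Initialize(project='my-ee-project')

# For large download jobs, the high-volume endpoint can optionally be used:
# ee.Initialize(project='my-ee-project', opt_url='https://earthengine-highvolume.googleapis.com')
```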
71 | 72 | ## Data Stacks 73 | This repository allows downloading data from various sensors. Currently, the code is written to download the following sensors/modalities: 74 | - [Sentinel-2](https://developers.google.com/earth-engine/datasets/catalog/sentinel-2) 75 | - [Sentinel-1](https://developers.google.com/earth-engine/datasets/catalog/COPERNICUS_S1_GRD) 76 | - [ERA5 (Temperature and precipitation)](https://developers.google.com/earth-engine/datasets/catalog/ECMWF_ERA5_LAND_MONTHLY_AGGR) 77 | - [Aster GDEM (Elevation and Slope)](https://gee-community-catalog.org/projects/aster/) 78 | - [Dynamic world (LULC)](https://developers.google.com/earth-engine/datasets/catalog/GOOGLE_DYNAMICWORLD_V1) 79 | - [Canopy Height](https://gee-community-catalog.org/projects/canopy/) 80 | - [ESA WorldCover](https://developers.google.com/earth-engine/datasets/catalog/ESA_WorldCover_v100) 81 | 82 | 83 | 84 | ## Code Structure 85 | The data downloading happens only once you have a geojson file with all the tiles you want to download. Here, tiles represent the ROIs (or polygons) for each location you want. Once you have the tiles, the data stacks (data for each modality) are downloaded for each tile in the geojson. The data can be downloaded by following this broad structure, and each of these steps is further explained below: 86 | * creating tiles (small ROIs sampled globally) 87 | * downloading data stacks for each of the tiles 88 | * post processing of the downloaded data 89 | * redownloading (if needed) 90 | 91 | #### Creating Tiles 92 | - `create_tiles_polygon.py` is the file used to create the tiles. The corresponding config is `config/config_tiles.yaml`. For a global sample, the various sampling techniques are based on the biomes and ecoregions from the [RESOLVE ECOREGIONS](https://developers.google.com/earth-engine/datasets/catalog/RESOLVE_ECOREGIONS_2017). 93 | - In the config you can set the size of the tile in meters, along with the number of tiles to download and the sampling method (how to sample the tiles in a region). 94 | 95 | #### Downloading Data Stacks 96 | - `main_download.py` is the main script to download the data. The corresponding config is `config/config_data.yaml`. The config file contains various parameters to be set regarding the different modalities and paths. Based on the geojson file created in the previous step, this script downloads the data stacks for each tile. 97 | - The `ee_utils/ee_data.py` file contains custom functions for retrieving each modality in the data stack from GEE. It merges all these modalities into one array and exports it as a GeoTIFF file. The band information and other tile information is stored in a json file (`tile_info.json`). 98 | 99 | #### Post Processing 100 | - The `post_download.py` file performs 4 operations sequentially: 101 | - Merging multiple `tile_info.json` files (these files are created when downloading in parallel using slurm - explained further below) 102 | - Converting the GeoTIFFs to a single hdf5 file (a short sketch of inspecting this file is shown after the Redownload step below). 103 | - Obtaining statistics for each band (used for normalization purposes). 104 | - Computing the splits (train, val splits - only if needed). 105 | 106 | 107 | #### Redownload 108 | - `redownload.py` is the file that can be used to redownload any tiles that failed to download. Sometimes when downloading the data stacks, the script can skip tiles for various reasons (lack of a Sentinel-2 reference image, network issues, GEE issues). Hence, if needed, we have an option to redownload these tiles (an alternative is to just download more tiles than needed).
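After post-processing, the dataset is a single HDF5 file plus the `*_band_stats.json`, `*_splits.json`, and `*_tile_info.json` files described in the Data Download section. Below is a minimal sketch of inspecting that file with `h5py`; the directory path is a placeholder, and the dataset keys shown (`metadata`, `aster`) follow the modality names used in `data_exp/data_exp.py`.

```python
import json
import h5py

data_dir = "./data_1M_v001"  # placeholder: wherever the bash script saved the data

with h5py.File(f"{data_dir}/data_1M_v001.h5", "r") as f:
    print(list(f.keys()))                 # one dataset per modality, plus 'metadata'
    meta = f["metadata"]
    tile_id = meta[0][0].decode("utf-8")  # tile ids are stored as byte strings
    elevation_slope = f["aster"][0]       # Aster GDEM stack for the first tile
    print(tile_id, elevation_slope.shape)

# per-band statistics, used for normalization
with open(f"{data_dir}/data_1M_v001_band_stats.json") as fp:
    band_stats = json.load(fp)
```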
109 | 110 | 111 | (**NOTE**: The files are executed by making use of SLURM. More information on this is provided in the [Slurm Execution](https://github.com/vishalned/MMEarth-data?tab=readme-ov-file#slurm-execution) section.) 112 | 113 | ## Slurm Execution 114 | 115 | MMEarth-data 116 | 117 | 118 | **Downloading Data Stacks:** GEE provides a function called `getDownloadUrl()` that allows you to export images as GeoTIFF files. We extend this by merging all modalities for a single location into one image and exporting this as a single GeoTIFF file. To further speed up the data downloading, we make use of parallel processing using SLURM. The figure above gives an idea of how this is done. The tile information (tile GeoJSON) contains the location and other metadata for the N tiles we need to download. Each of 40 slurm jobs downloads N/40 tiles (we set the maximum number of jobs to 40, since this is the maximum number of concurrent requests allowed by the GEE API). 119 | 120 | To run the slurm parallel download, execute the following command: 121 | ```sh 122 | sbatch slurm_scripts/slurm_download_parallel.sh 123 | ``` 124 | 125 | 126 | ## Citation 127 | Please cite our paper if you use this code or any of the provided data. 128 | 129 | Vishal Nedungadi, Ankit Kariryaa, Stefan Oehmcke, Serge Belongie, Christian Igel, & Nico Lang (2024). MMEarth: Exploring Multi-Modal Pretext Tasks For Geospatial Representation Learning. 130 | ``` 131 | @inproceedings{nedungadi2024mmearth, 132 | title={MMEarth: Exploring multi-modal pretext tasks for geospatial representation learning}, 133 | author={Nedungadi, Vishal and Kariryaa, Ankit and Oehmcke, Stefan and Belongie, Serge and Igel, Christian and Lang, Nico}, 134 | booktitle={European Conference on Computer Vision}, 135 | pages={164--182}, 136 | year={2024}, 137 | organization={Springer} 138 | } 139 | ``` 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /bash_scripts/data_100k_128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # URLs of the files to download 4 | file_urls=( 5 | "https://sid.erda.dk/share_redirect/CoaUojVXzu/data_100k_v001_band_stats.json" 6 | "https://sid.erda.dk/share_redirect/CoaUojVXzu/data_100k_v001_tile_info.json" 7 | "https://sid.erda.dk/share_redirect/CoaUojVXzu/data_100k_v001_splits.json" 8 | "https://sid.erda.dk/share_redirect/CoaUojVXzu/data_100k_v001.h5" 9 | "https://sid.erda.dk/share_redirect/CoaUojVXzu/LICENSE-data" 10 | ) 11 | 12 | # Destination folder to save the downloaded files 13 | destination_folder="./data_100k_v001/" 14 | 15 | # Create the destination folder if it doesn't exist 16 | mkdir -p "$destination_folder" 17 | 18 | # Loop through each URL and download the corresponding file 19 | for url in "${file_urls[@]}"; do 20 | # Extract filename from URL 21 | filename=$(basename "$url") 22 | # Download the file using curl 23 | curl -o "${destination_folder}${filename}" "$url" 24 | # Check if the download was successful 25 | if [ $? -eq 0 ]; then 26 | echo "File '${filename}' downloaded successfully." 27 | else 28 | echo "Failed to download the file '${filename}'."
29 | fi 30 | done 31 | 32 | -------------------------------------------------------------------------------- /bash_scripts/data_1M_128.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # URLs of the files to download 4 | file_urls=( 5 | "https://sid.erda.dk/share_redirect/ChL1BoVEyH/data_1M_v001_band_stats.json" 6 | "https://sid.erda.dk/share_redirect/ChL1BoVEyH/data_1M_v001_tile_info.json" 7 | "https://sid.erda.dk/share_redirect/ChL1BoVEyH/data_1M_v001_splits.json" 8 | "https://sid.erda.dk/share_redirect/ChL1BoVEyH/data_1M_v001.h5" 9 | "https://sid.erda.dk/share_redirect/ChL1BoVEyH/LICENSE-data" 10 | ) 11 | 12 | # Destination folder to save the downloaded files 13 | destination_folder="./data_1M_v001/" 14 | 15 | # Create the destination folder if it doesn't exist 16 | mkdir -p "$destination_folder" 17 | 18 | # Loop through each URL and download the corresponding file 19 | for url in "${file_urls[@]}"; do 20 | # Extract filename from URL 21 | filename=$(basename "$url") 22 | # Download the file using curl 23 | curl -o "${destination_folder}${filename}" "$url" 24 | # Check if the download was successful 25 | if [ $? -eq 0 ]; then 26 | echo "File '${filename}' downloaded successfully." 27 | else 28 | echo "Failed to download the file '${filename}'." 29 | fi 30 | done 31 | 32 | -------------------------------------------------------------------------------- /bash_scripts/data_1M_64.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # URLs of the files to download 4 | file_urls=( 5 | "https://sid.erda.dk/share_redirect/bX5JzPuwJF/data_1M_v001_64_band_stats.json" 6 | "https://sid.erda.dk/share_redirect/bX5JzPuwJF/data_1M_v001_64_tile_info.json" 7 | "https://sid.erda.dk/share_redirect/bX5JzPuwJF/data_1M_v001_64_splits.json" 8 | "https://sid.erda.dk/share_redirect/bX5JzPuwJF/data_1M_v001_64.h5" 9 | "https://sid.erda.dk/share_redirect/bX5JzPuwJF/LICENSE-data" 10 | ) 11 | 12 | # Destination folder to save the downloaded files 13 | destination_folder="./data_1M_v001_64/" 14 | 15 | # Create the destination folder if it doesn't exist 16 | mkdir -p "$destination_folder" 17 | 18 | # Loop through each URL and download the corresponding file 19 | for url in "${file_urls[@]}"; do 20 | # Extract filename from URL 21 | filename=$(basename "$url") 22 | # Download the file using curl 23 | curl -o "${destination_folder}${filename}" "$url" 24 | # Check if the download was successful 25 | if [ $? -eq 0 ]; then 26 | echo "File '${filename}' downloaded successfully." 27 | else 28 | echo "Failed to download the file '${filename}'." 
29 | fi 30 | done 31 | 32 | -------------------------------------------------------------------------------- /config/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalned/MMEarth-data/32fe297d76681cc9b1791239756f93ce027007b0/config/.DS_Store -------------------------------------------------------------------------------- /config/config_data.yaml: -------------------------------------------------------------------------------- 1 | 2 | # general config 3 | export_folder: /projects/dereeco/data/global-lr/data_1M_v001_era5/ # name of the main folder inside the bucket or the local folder 4 | # bucket: global-dataset # name of the bucket 5 | start_from: 0 # start from the 0th tile or start from a custom tile (useful if the script fails and you want to start from where it left off) 6 | end_at: 1000 # end at the 1000th tile or end at a custom tile (useful if the script fails and you want to start from where it left off) 7 | log: INFO # log level #DEBUG, INFO, ERROR 8 | tiles_path: '/projects/dereeco/data/global-lr/geojson_files/tiles_1M_v001.geojson' #1000 tiles 9 | tile_info_path: '/projects/dereeco/data/global-lr/data_1M_v001_era5/data' # this is the path that contains all the tile info - useful if you want to start with a new data apart from s2 10 | seed: 42 # seed for random image selection in S2 11 | 12 | 13 | # dataset config 14 | 15 | # dataset to download 16 | # THINGS TO NOTE: for this version of the code, you need to put sentinel2 first if you want to download it along with other datasets. This is because s2 is the base dataset and we use it to get the tile information. 17 | # Incase you have already downloaded s2, then you can put any dataset first, and it downloads the other datasets. The name of the dataset should be the same name as the function call. 
Here are the names of the functions: 18 | # sentinel2, sentinel1, srtm, era5 19 | # datasets: ["sentinel2", "sentinel1", "aster", "era5", "dynamic_world", "canopy_height_eth", "esa_worldcover"] 20 | datasets: ["era5"] 21 | 22 | 23 | # config for the data 24 | # make sure there is no space in the 'name' field 25 | sentinel2: 26 | name: "sentinel2" 27 | BANDS: [["B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8A", "B8", "B9", "B11", "B12", "SCL", "MSK_CLDPRB", "QA60"], 28 | ["B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8A", "B8", "B9", "B10", "B11", "B12", "QA60"]] 29 | collection: ["COPERNICUS/S2_SR_HARMONIZED", 30 | "COPERNICUS/S2_HARMONIZED"] 31 | 32 | sentinel1: 33 | name: "sentinel1" 34 | BANDS: ["VV", "VH", "HH", "HV"] # we download all the bands and both orbits 35 | collection: "COPERNICUS/S1_GRD" 36 | 37 | aster: 38 | name: "aster" 39 | BANDS: ["b1"] 40 | collection: "projects/sat-io/open-datasets/ASTER/GDEM" 41 | 42 | era5: 43 | name: "era5" 44 | BANDS: ["temperature_2m", "temperature_2m_min", "temperature_2m_max", "total_precipitation_sum"] 45 | collection: "ECMWF/ERA5_LAND/MONTHLY_AGGR" 46 | 47 | 48 | dynamic_world: 49 | name: "dynamic_world" 50 | BANDS: ["label"] 51 | collection: "GOOGLE/DYNAMICWORLD/V1" 52 | 53 | canopy_height_eth: 54 | name: "canopy_height_eth" 55 | COLLECTIONS: ["users/nlang/ETH_GlobalCanopyHeight_2020_10m_v1", "users/nlang/ETH_GlobalCanopyHeightSD_2020_10m_v1"] 56 | 57 | esa_worldcover: 58 | name: "esa_worldcover" 59 | BANDS: ["Map"] 60 | collection: "ESA/WorldCover/v100" 61 | 62 | 63 | 64 | 65 | 66 | # do not change anything below this line, the main script will automatically update these values 67 | update_geojson: False 68 | read_tile_info: False 69 | defaults: 70 | - hydra/job_logging: disabled # by default hydra has a logging config file, comment this line if you want to use that instead. That will print the logs to the console, and also save it to a file -------------------------------------------------------------------------------- /config/config_tiles.yaml: -------------------------------------------------------------------------------- 1 | 2 | biome_names_path: '/home/qbk152/vishal/global-lr/stats/biome_names.json' # path to the file containing the names of the biomes 3 | tiles_geojson_path: '/home/qbk152/vishal/global-lr/tiles_1M_v001.geojson' # path to the file containing the tiles 4 | failed_eco_regions_path: '/home/qbk152/vishal/global-lr/failed_eco_regions.txt' # path to the file containing the eco regions that failed to download 5 | 6 | # files that contain the total area of each biome and eco-region (these are precomputed) 7 | area_biome_path: '/home/qbk152/vishal/global-lr/stats/total_area_biome.json' 8 | area_eco_path: '/home/qbk152/vishal/global-lr/stats/total_area_eco_region.json' 9 | 10 | 11 | tile_size: 1300 # 1.3km 12 | num_of_images: 1400000 # always set this number more than the number of tiles required. 
This is because some tiles might fail to download 13 | num_of_biomes: 14 # number of biomes in the world (do not change) 14 | 15 | # uniform sampling type: 16 | # 0: uniform across biomes without equal sampling within each eco-region inside a biome 17 | # 1: uniform across biomes and equal sampling within each eco-region inside a biome 18 | # 2: uniform across eco-regions 19 | uniform_type: 0 20 | 21 | 22 | -------------------------------------------------------------------------------- /config/ee_init.py: -------------------------------------------------------------------------------- 1 | import ee 2 | 3 | # ee.Initialize(project='global-rl-2', opt_url='https://earthengine-highvolume.googleapis.com') 4 | ee.Initialize(project='global-rl-2') -------------------------------------------------------------------------------- /create_tiles_polygon.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A script to create a geojson with the tiles from various ecoregions or biomes of the world based on the CONFIG file 3 | ''' 4 | 5 | import hashlib 6 | import ee 7 | from datetime import datetime 8 | import json 9 | # Initialize Earth Engine 10 | ee.Initialize(project = 'global-rl-2') 11 | import hydra 12 | from omegaconf import DictConfig, OmegaConf 13 | import random 14 | 15 | 16 | 17 | @hydra.main(config_path='config', config_name='config_tiles') 18 | def main(cfg: DictConfig) -> None: 19 | print(OmegaConf.to_yaml(cfg)) 20 | 21 | eco_region = ee.FeatureCollection("RESOLVE/ECOREGIONS/2017") 22 | 23 | NUM_IMAGES = cfg.num_of_images 24 | if cfg.uniform_type == 1: 25 | print('------- Uniform across biomes --------') 26 | NUM_IMAGES_PER_BIOME = NUM_IMAGES // cfg.num_of_biomes 27 | elif cfg.uniform_type == 2: 28 | print('------- Uniform across eco-regions --------') 29 | elif cfg.uniform_type == 0: 30 | print('------- Uniform across biomes only --------') 31 | NUM_IMAGES_PER_BIOME = NUM_IMAGES // cfg.num_of_biomes 32 | area_biome = json.load(open(cfg.area_biome_path)) # key matches area_biome_path in config_tiles.yaml 33 | area_eco = json.load(open(cfg.area_eco_path)) 34 | 35 | # getting the list of biomes 36 | biome_names = json.load(open(cfg.biome_names_path)) 37 | 38 | tile_id_count = 0 # we use a simple number to keep track of the tile_id. This is just a number that is incremented by 1 for each tile 39 | 40 | tiles_list = [] 41 | failed_eco_regions = [] 42 | biomes = list(biome_names.keys())[0:-1] # skipping the last one because it is rock and ice 43 | 44 | # biome loop 45 | for j, biome in enumerate(biomes): 46 | print(f'Biome {j+1}/14: {biome}') 47 | if cfg.uniform_type == 1 or cfg.uniform_type == 0: 48 | print('Number of images per biome: ', NUM_IMAGES_PER_BIOME) 49 | print('Number of eco-regions: ', len(biome_names[biome])) 50 | 51 | # eco region loop 52 | for i, eco in enumerate(biome_names[biome]): 53 | try: 54 | eco_region_name, realm = eco[0], eco[1] 55 | print(f'Eco-region {i}/{len(biome_names[biome])}: {eco_region_name} ') 56 | if cfg.uniform_type == 1: 57 | num_of_tiles = NUM_IMAGES_PER_BIOME // len(biome_names[biome]) 58 | elif cfg.uniform_type == 2: 59 | num_of_tiles = NUM_IMAGES // 846 # 846 is the total number of eco-regions in the RESOLVE ecoregions dataset 60 | elif cfg.uniform_type == 0: 61 | num_of_tiles = int(NUM_IMAGES_PER_BIOME * (area_eco[eco_region_name] / area_biome[biome])) 62 | 63 | print('Number of tiles in the eco-region: ', num_of_tiles) 64 | 65 | # gee only allows max of 5000 features to be exported at a time.
So we need to split the eco-regions into smaller batches 66 | num_while_loops = 0 # a variable to keep track of the number of while loops we have done 67 | while num_of_tiles > 0: 68 | 69 | tiles_to_export = min(num_of_tiles, 5000) 70 | 71 | print('Tiles to export inside the while loop: ', tiles_to_export) 72 | 73 | 74 | single_region = eco_region.filter(ee.Filter.eq('ECO_NAME', eco_region_name)) 75 | 76 | # the following 2 lines is just to generate a number based on a string. This ensure that the number is the same for the same string. 77 | # we mod it by 10^5 to keep it small. 78 | coord_string = f"{i}{eco_region_name}{tiles_to_export}{num_while_loops}" 79 | seed = int(hashlib.sha256(coord_string.encode('utf-8')).hexdigest(), 16) % 10**5 80 | 81 | # adding a line to make the seed new as compared to the previous one 82 | # seed += 42 83 | random_points = ee.FeatureCollection.randomPoints(single_region, tiles_to_export, seed) 84 | 85 | tiles = random_points.map(lambda point: point.buffer(cfg.tile_size / 2).bounds()) 86 | tile_features = tiles.getInfo()['features'] 87 | 88 | for idx in range(len(tile_features)): 89 | tile_features[idx]['properties'] = { 90 | 'tile_id': f"{tile_id_count}", 91 | 'biome': biome, 92 | 'eco_region': eco_region_name, 93 | 94 | } 95 | tiles_list.append(tile_features[idx]) 96 | tile_id_count += 1 # incrementing the tile_id 97 | 98 | # shuffling the tiles_list 99 | random.shuffle(tiles_list) 100 | geojson_collection = { 101 | 'type': 'FeatureCollection', 102 | 'features': tiles_list 103 | } 104 | 105 | with open(cfg.tiles_geojson_path, 'w') as f: 106 | json.dump(geojson_collection, f) 107 | 108 | num_of_tiles -= tiles_to_export 109 | num_while_loops += 1 110 | 111 | except ee.ee_exception.EEException as e: 112 | print('Could not get this eco-region. 
Skipping...') 113 | print(e) 114 | failed_eco_regions.append(eco_region_name) 115 | continue 116 | 117 | 118 | with open(cfg.failed_eco_regions_path, 'w') as f: 119 | f.write('\n'.join(failed_eco_regions)) 120 | 121 | if __name__ == '__main__': 122 | main() 123 | 124 | -------------------------------------------------------------------------------- /data_exp/data_exp.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import json 3 | import numpy as np 4 | import h5py 5 | 6 | 7 | # tile_path = "/home/qbk152/vishal/global-lr/data/data_1M_130_tile_info.json" 8 | # tile_path = "/home/qbk152/vishal/global-lr/data/data_tmp_tile_info.json" 9 | # data_path = '/home/qbk152/vishal/global-lr/data/tmp.h5' 10 | # tile_info = json.load(open(tile_path)) 11 | 12 | def month_only(args): 13 | ''' 14 | Plot the number of tiles per month 15 | ''' 16 | 17 | tile_info = json.load(open(args.tile_info_path)) 18 | month = np.arange(1, 13) 19 | month_counts = np.zeros(12) 20 | for tile in tile_info: 21 | month_counts[int(tile_info[tile]['S2_DATE'].split('-')[1]) - 1] += 1 22 | 23 | plt.bar(month, month_counts) 24 | plt.xlabel('Month') 25 | plt.ylabel('Number of tiles') 26 | plt.title('Number of tiles per month') 27 | 28 | plt.savefig(os.path.join(args.store_path, 'month_counts.png')) 29 | 30 | plt.clf() 31 | # stats about which months in a year are present in the dataset 32 | 33 | def s2_type(args): 34 | ''' 35 | Plot the number of tiles per month per year 36 | ''' 37 | tile_info = json.load(open(args.tile_info_path)) 38 | month = np.arange(1, 12*4 + 1) 39 | month_counts_l1c = np.zeros(12*4) 40 | month_counts_l2a = np.zeros(12*4) 41 | 42 | for tile in tile_info: 43 | m = int(tile_info[tile]['S2_DATE'].split('-')[1]) 44 | y = int(tile_info[tile]['S2_DATE'].split('-')[0]) 45 | if tile_info[tile]['S2_type'] == 'l1c': 46 | month_counts_l1c[(y - 2017) * 12 + m - 1] += 1 47 | else: 48 | month_counts_l2a[(y - 2017) * 12 + m - 1] += 1 49 | 50 | years = np.arange(2017, 2021) 51 | yearly_counts_l1c = [month_counts_l1c[i:i+12] for i in range(0, len(month_counts_l1c), 12)] 52 | yearly_counts_l2a = [month_counts_l2a[i:i+12] for i in range(0, len(month_counts_l2a), 12)] 53 | 54 | 55 | 56 | month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] * 4 57 | month_labels = [f"{years[i // 12]} {month_names[i]}" for i in range(12*4)] 58 | year_colors = ['b', 'g', 'r', 'c'] 59 | 60 | 61 | # print(np.sum(yearly_counts_l1c)) 62 | 63 | # Create a bar plot for each year 64 | for i, year_count in enumerate(yearly_counts_l1c): 65 | plt.bar(np.arange(12*i + 1, 12*i + 13), year_count, label=str(years[i]), color=year_colors[i], alpha=0.7) 66 | 67 | plt.xticks(np.arange(1, 12 * 4 + 1), month_labels, rotation=90, fontsize=6) 68 | plt.xlabel('Month') 69 | plt.ylabel('Count') 70 | plt.title('Monthly Counts by Year') 71 | plt.legend(title='Year') 72 | # increase the spacing between each bar plot 73 | plt.tight_layout() 74 | plt.savefig(os.path.join(args.store_path, 'yearly_counts_l1c.png')) 75 | plt.clf() 76 | 77 | for i, year_count in enumerate(yearly_counts_l2a): 78 | plt.bar(np.arange(12*i + 1, 12*i + 13), year_count, label=str(years[i]), color=year_colors[i], alpha=0.7) 79 | 80 | plt.xticks(np.arange(1, 12 * 4 + 1), month_labels, rotation=90, fontsize=6) 81 | plt.xlabel('Month') 82 | plt.ylabel('Count') 83 | plt.title('Monthly Counts by Year') 84 | plt.legend(title='Year') 85 | # increase the spacing between each bar plot 86 | 
plt.tight_layout() 87 | plt.savefig(os.path.join(args.store_path, 'yearly_counts_l2a.png')) 88 | 89 | 90 | 91 | 92 | def month_year(args): 93 | ''' 94 | Plot the number of tiles per month per year 95 | ''' 96 | import matplotlib.cm as cm 97 | tile_info = json.load(open('/projects/dereeco/data/global-lr/data_1M_v001/data_1M_v001_tile_info.json')) 98 | month = np.arange(1, 12*4 + 1) 99 | month_counts = np.zeros(12*4) 100 | 101 | for tile in tile_info: 102 | # only choosing either l1c or l2a 103 | # if tile_info[tile]['S2_type'] == 'l1c': 104 | m = int(tile_info[tile]['S2_DATE'].split('-')[1]) 105 | y = int(tile_info[tile]['S2_DATE'].split('-')[0]) 106 | month_counts[(y - 2017) * 12 + m - 1] += 1 107 | years = np.arange(2017, 2021) 108 | yearly_counts = [month_counts[i:i+12] for i in range(0, len(month_counts), 12)] 109 | 110 | month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] * 4 111 | month_labels = [f"{years[i // 12]} {month_names[i]}" for i in range(12*4)] 112 | # month_labels = [i if 'Jan' in i else '' for i in month_labels] 113 | # we name the months 1, 5, 9 for each year 114 | for id, i in enumerate(month_labels): 115 | if 'Jan' in i: 116 | month_labels[id] = '1' 117 | elif 'May' in i: 118 | month_labels[id] = '5' 119 | elif 'Sep' in i: 120 | month_labels[id] = '9' 121 | else: 122 | month_labels[id] = '' 123 | # year_colors = ['b', 'g', 'r', 'c'] 124 | year_colors = [cm.inferno(i/4) for i in range(4)] 125 | plt.rcParams.update({'figure.figsize': (14, 10)}) 126 | # Create a bar plot for each year 127 | for i, year_count in enumerate(yearly_counts): 128 | plt.bar(np.arange(12*i + 1, 12*i + 13), year_count, label=str(years[i]), color=year_colors[i], alpha=0.7) 129 | 130 | plt.xticks(np.arange(1, 12 * 4 + 1), month_labels, fontsize=30) 131 | plt.xlabel('Date in months', labelpad=8, fontsize=30) 132 | plt.ylabel('Number of samples', labelpad=8, fontsize=30) 133 | plt.legend(loc='upper center', ncols = 4, fontsize=24) 134 | plt.yticks(fontsize=30) 135 | # limit the ylimit to 30000 136 | plt.ylim(0, 30000) 137 | # increase the spacing between each bar plot 138 | # plt.rcParams.update({'font.size': 18}) 139 | 140 | # plt.tight_layout() 141 | plt.savefig(os.path.join(args.store_path, 'yearly.png'), dpi=300, format='png', bbox_inches='tight') 142 | plt.savefig(os.path.join(args.store_path, 'yearly.pdf'), dpi=300, format='pdf', bbox_inches='tight') 143 | plt.clf() 144 | 145 | def dynamic_world(args): 146 | ''' 147 | Plot the number of pixels per class in the dynamic world dataset 148 | ''' 149 | class_names = [ 150 | "No data", 151 | "Water", 152 | "Trees", 153 | "Grass", 154 | "Flooded vegetation", 155 | "Crops", 156 | "Shrub and scrub", 157 | "Built", 158 | "Bare", 159 | "Snow and ice" 160 | ] 161 | 162 | hdf5_file = h5py.File(args.data_path, 'r') 163 | meta = hdf5_file['metadata'] 164 | dw_count = {i:0 for i in range(0, 10)} 165 | 166 | num_tiles = len(meta) 167 | for i in range(num_tiles): 168 | tile_id = meta[i][0].decode('utf-8') 169 | img = hdf5_file['dynamic_world'][i] 170 | 171 | # obtain the number of pixels in each class 172 | for j in range(10): 173 | dw_count[j] += np.sum(img == j) 174 | 175 | if i % 1000 == 0: 176 | print(f"Processed {i} tiles") 177 | 178 | 179 | plt.bar(dw_count.keys(), dw_count.values()) 180 | plt.xticks(np.arange(0, 10), class_names, rotation=90, fontsize=8) 181 | plt.subplots_adjust(bottom=0.4) 182 | 183 | plt.xlabel('Class') 184 | plt.ylabel('Number of pixels') 185 | plt.title('Number of pixels per class') 186 | 
187 | plt.savefig(os.path.join(args.store_path, 'dw_counts.png')) 188 | 189 | 190 | def esa_worldcover(args): 191 | class_names = [ 192 | 'Tree cover', 193 | 'Shrubland', 194 | 'Grassland', 195 | 'Cropland', 196 | 'Built-up', 197 | 'Bare / sparse vegetation', 198 | 'Snow and ice', 199 | 'Permanent water bodies', 200 | 'Herbaceous wetland', 201 | 'Mangroves', 202 | 'Moss and lichen' 203 | ] 204 | class_values = [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100] 205 | hdf5_file = h5py.File(args.data_path, 'r') 206 | meta = hdf5_file['metadata'] 207 | esa_count = {i:0 for i in class_values} 208 | 209 | num_tiles = len(meta) 210 | for i in range(num_tiles): 211 | tile_id = meta[i][0].decode('utf-8') 212 | img = hdf5_file['esa_worldcover'][i] 213 | 214 | # obtain the number of pixels in each class 215 | for j in class_values: 216 | esa_count[j] += np.sum(img == j) 217 | 218 | 219 | 220 | if i % 1000 == 0: 221 | print(f"Processed {i} tiles") 222 | 223 | plt.bar(esa_count.keys(), esa_count.values(), width=7) 224 | plt.xticks(class_values, class_names, rotation=90, fontsize=8) 225 | plt.subplots_adjust(bottom=0.4) 226 | 227 | plt.xlabel('Class') 228 | plt.ylabel('Number of pixels') 229 | plt.title('Number of pixels per class') 230 | 231 | plt.savefig(os.path.join(args.store_path, 'esa_counts.png')) 232 | 233 | 234 | 235 | def custom(args): 236 | ''' 237 | Trying to plot the number of tiles per grid cell for the tiles that have the date as dec 2018 238 | ''' 239 | minx, miny, maxx, maxy = -179.9863841350967, -86.78204367236995, 180.05963072575278, 83.48337010728358 240 | grid_size = 1 241 | tile_info = json.load(open(args.tile_info_path)) 242 | 243 | x = np.arange(minx, maxx + grid_size, grid_size) 244 | y = np.arange(miny, maxy + grid_size, grid_size) 245 | 246 | grid_counts = np.zeros((len(x) - 1, len(y) - 1)) 247 | count = 0 248 | 249 | for tile in tile_info: 250 | m = int(tile_info[tile]['S2_DATE'].split('-')[1]) 251 | y = int(tile_info[tile]['S2_DATE'].split('-')[0]) 252 | 253 | if m == 12 and y == 2018: 254 | lon, lat = tile_info[tile]['lon'], tile_info[tile]['lat'] 255 | 256 | 257 | x_idx = int((lon - minx) / grid_size) 258 | y_idx = int((lat - miny) / grid_size) 259 | 260 | grid_counts[x_idx][y_idx] += 1 261 | count += 1 262 | 263 | 264 | print(count) 265 | 266 | fig, ax = plt.subplots() 267 | grid_counts = np.transpose(grid_counts) 268 | 269 | im = ax.imshow(grid_counts, extent=(minx, maxx, miny, maxy), origin='lower', cmap='inferno') 270 | im.set_clim(0, 50) # this is hardcoded for grid_size = 1 271 | # im.set_clim(-2, 15) # this is hardcoded for grid_size = 0.1 272 | 273 | plt.colorbar(im, fraction=0.020, pad=0.04) 274 | 275 | 276 | # plt.show() 277 | plt.savefig(os.path.join(args.store_path, f"grid_{grid_size}.png"), dpi=300) 278 | 279 | 280 | def dw_custom(args): 281 | ''' 282 | Plot the tiles per grid for a specific dw class 283 | ''' 284 | minx, miny, maxx, maxy = -179.9863841350967, -86.78204367236995, 180.05963072575278, 83.48337010728358 285 | grid_size = 1 286 | tile_info = json.load(open(args.tile_info_path)) 287 | 288 | 289 | x = np.arange(minx, maxx + grid_size, grid_size) 290 | y = np.arange(miny, maxy + grid_size, grid_size) 291 | 292 | grid_counts = np.zeros((len(x) - 1, len(y) - 1)) 293 | count = 0 294 | 295 | hdf5_file = h5py.File(args.data_path, 'r') 296 | meta = hdf5_file['metadata'] 297 | 298 | 299 | num_tiles = len(meta) 300 | for i in range(num_tiles): 301 | tile = meta[i][0].decode('utf-8') 302 | img = hdf5_file['dynamic_world'][i] 303 | if np.sum(img == 0) > 1000: 304 | 
lon, lat = tile_info[tile]['lon'], tile_info[tile]['lat'] 305 | 306 | x_idx = int((lon - minx) / grid_size) 307 | y_idx = int((lat - miny) / grid_size) 308 | 309 | grid_counts[x_idx][y_idx] += 1 310 | count += 1 311 | 312 | 313 | print(count) 314 | 315 | fig, ax = plt.subplots() 316 | grid_counts = np.transpose(grid_counts) 317 | 318 | im = ax.imshow(grid_counts, extent=(minx, maxx, miny, maxy), origin='lower', cmap='inferno') 319 | im.set_clim(0, 50) # this is hardcoded for grid_size = 1 320 | # im.set_clim(-2, 15) # this is hardcoded for grid_size = 0.1 321 | 322 | # plt.show() 323 | plt.colorbar(im, fraction=0.020, pad=0.04) 324 | plt.savefig(os.path.join(args.store_path, f"grid_{grid_size}.png"), dpi=300) 325 | 326 | 327 | def era_stats(args): 328 | ''' 329 | Plot the distribution of temperature for each tile. 330 | ''' 331 | 332 | hdf5_file = h5py.File(args.data_path, 'r') 333 | meta = hdf5_file['metadata'] 334 | num_tiles = len(meta) 335 | data_month = [] 336 | 337 | for i in range(num_tiles): 338 | tile = meta[i][0].decode('utf-8') 339 | data = hdf5_file['era5'][i][4:8] 340 | data_month.append(data[0]) 341 | 342 | 343 | data_month = np.array(data_month) - 273.15 344 | 345 | # plot the distribution of temperature for each tile 346 | 347 | fig, ax = plt.subplots() 348 | ax.hist(data_month, bins=100) 349 | 350 | plt.xlabel('Temperature (C)') 351 | plt.ylabel('Number of tiles') 352 | plt.title('Distribution of Monthly Temperature for Each Tile') 353 | 354 | plt.savefig(os.path.join(args.store_path, 'era5_temperature.png')) 355 | 356 | 357 | def aster_stats(args): 358 | ''' 359 | Plot the distribution of elevation for each tile. 360 | ''' 361 | hdf5_file = h5py.File(args.data_path, 'r') 362 | meta = hdf5_file['metadata'] 363 | num_tiles = len(meta) 364 | 365 | bins = np.arange(-170, 6500, 10) 366 | 367 | hist_counts = np.zeros(len(bins)) 368 | min_, max_ = 100000, -100000 369 | for i in range(num_tiles): 370 | img = hdf5_file['aster'][i] 371 | # Extract the elevation band 372 | data = img[0, :, :] 373 | 374 | data = data.flatten() 375 | # if np.min(data) < min_: 376 | # min_ = np.min(data) 377 | 378 | # if np.max(data) > max_: 379 | # max_ = np.max(data) 380 | ind = np.digitize(data, bins=bins) 381 | ind = ind - 1 382 | hist_counts[ind] += 1 383 | 384 | if i % 1000 == 0: 385 | print(f"Processed {i} tiles") 386 | 387 | 388 | hdf5_file.close() 389 | 390 | # print("Min and max elevation") 391 | # print(min_, max_) 392 | # Plot the histogram with bin labels 393 | fig, ax = plt.subplots() 394 | ax.bar(bins, hist_counts, width=100) 395 | plt.xlabel('Elevation (m)') 396 | plt.ylabel('Number of pixels') 397 | plt.title('Distribution of Elevation for All Tiles') 398 | 399 | plt.savefig(os.path.join(args.store_path, 'aster_elevation.png')) 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | if __name__ == '__main__': 408 | 409 | import argparse 410 | import os 411 | parser = argparse.ArgumentParser() 412 | parser.add_argument('--data_dir', type=str, required=True) 413 | parser.add_argument('--store_path', type=str, default='/home/qbk152/vishal/MMEarth-data/1M_v001_plots/') 414 | args = parser.parse_args() 415 | 416 | 417 | name = args.data_dir.split('/')[-1] if args.data_dir[-1] != '/' else args.data_dir.split('/')[-2] 418 | args.tile_info_path = os.path.join(args.data_dir, name + '_tile_info.json') 419 | args.data_path = os.path.join(args.data_dir, name + '.h5') 420 | args.store_path = os.path.join(args.store_path, name) 421 | if not os.path.exists(args.store_path): 422 | os.makedirs(args.store_path) 
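    # A hedged sketch (not called anywhere in this script) of a per-pixel elevation
    # histogram in pure NumPy, matching the binning used in aster_stats above.
    # np.add.at increments a bin once for every pixel that lands in it, whereas a
    # fancy-indexed `+=` on an index array updates each bin at most once per call.
    import numpy as np

    def elevation_histogram(elevation_2d, bins=np.arange(-170, 6500, 10)):
        counts = np.zeros(len(bins))
        idx = np.digitize(elevation_2d.flatten(), bins=bins) - 1
        idx = np.clip(idx, 0, len(counts) - 1)      # guard against out-of-range values
        np.add.at(counts, idx, 1)                   # accumulates duplicate bin hits correctly
        return counts
    # hypothetical usage: counts = elevation_histogram(hdf5_file['aster'][0][0, :, :])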
423 | 424 | print('storing plots in', args.store_path) 425 | 426 | 427 | 428 | # month_only(args) 429 | month_year(args) 430 | # s2_type(args) 431 | # dynamic_world(args) 432 | # # era_stats() 433 | # # aster_stats() 434 | # esa_worldcover(args) 435 | 436 | 437 | 438 | -------------------------------------------------------------------------------- /data_exp/density_maps.py: -------------------------------------------------------------------------------- 1 | import geopandas as gpd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from matplotlib.colors import LinearSegmentedColormap 5 | import json 6 | # world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")) 7 | 8 | 9 | def create_density(grid_size): 10 | 11 | gdf = gpd.read_file("/projects/dereeco/data/global-lr/geojson_files/tiles_1M_v001.geojson") 12 | json_ = json.load(open("/projects/dereeco/data/global-lr/data_1M_v001/data_1M_v001_tile_info.json")) 13 | # gdf = gpd.read_file("/home/qbk152/vishal/global-lr/tile_polygons/uni_biomes_only/tiles_100k_130.geojson") 14 | grid_size = grid_size 15 | min_x, min_y, max_x, max_y = gdf.total_bounds 16 | # print(min_x, min_y, max_x, max_y) 17 | # exit() 18 | 19 | # Create grid cells 20 | x = np.arange(min_x, max_x + grid_size, grid_size) 21 | y = np.arange(min_y, max_y + grid_size, grid_size) 22 | grid_counts = np.zeros((len(x) - 1, len(y) - 1)) 23 | 24 | 25 | gdf_sindex = gdf.sindex 26 | 27 | for index, row in gdf.iterrows(): 28 | # if index % 10000 == 0: 29 | tile_center = row['geometry'].centroid 30 | possible_matches_index = list(gdf_sindex.intersection(tile_center.bounds)) 31 | name = row['tile_id'] 32 | 33 | 34 | # we only want to count the number of samples for l2a tiles 35 | try: 36 | if json_[name]['S2_type'] == "l2a": 37 | continue 38 | except: 39 | # the name is not in the json file 40 | continue 41 | 42 | for i in possible_matches_index: 43 | # ensure the possible matches are l2a tiles 44 | try: 45 | if json_[gdf.loc[i, 'tile_id']]['S2_type'] == "l2a": 46 | continue 47 | except: 48 | continue 49 | if gdf.loc[i, 'geometry'].contains(tile_center): 50 | x_idx = int((tile_center.x - min_x) / grid_size) 51 | y_idx = int((tile_center.y - min_y) / grid_size) 52 | grid_counts[x_idx][y_idx] += 1 53 | 54 | # fig, ax = plt.subplots() 55 | # grid_counts = np.transpose(grid_counts) 56 | # world.boundary.plot( 57 | # ax=ax, 58 | # color="gray", 59 | # # edgecolor="black", 60 | # linewidth=0.4 61 | # ) 62 | 63 | 64 | # cmap = plt.cm.get_cmap('inferno') 65 | 66 | # ax.set_xticks([]) 67 | # ax.set_yticks([]) 68 | 69 | # im = ax.imshow(grid_counts, extent=(min_x, max_x, min_y, max_y), origin='lower', cmap='inferno') 70 | 71 | 72 | # im.set_clim(1, 1000) 73 | # cbar = plt.colorbar(im, fraction=0.020, pad=0.04) 74 | # cbar.ax.set_ylabel('Number of samples', rotation=90, labelpad=2) 75 | # y_tick_labels = [str(i) for i in range(0, 801, 200)] 76 | # y_tick_labels.append(">1k") 77 | # cbar.ax.set_yticklabels(y_tick_labels) 78 | # plt.rcParams.update({'font.size': 6}) 79 | 80 | # # plt.show() 81 | # plt.savefig(f"/home/qbk152/vishal/global-lr/data_exp/t-grid_{grid_size}_uni_biomes.png", dpi=300, format='png') 82 | # plt.savefig(f"/home/qbk152/vishal/global-lr/data_exp/t-grid_{grid_size}_uni_biomes.pdf", dpi=300, format='pdf') 83 | from matplotlib.colors import ListedColormap 84 | fig, ax = plt.subplots() 85 | plt.rcParams.update({'font.size': 9}) 86 | plt.rcParams.update({'figure.figsize': (8, 6)}) 87 | grid_counts = np.transpose(grid_counts) 88 | # world.boundary.plot( 89 | # 
ax=ax, 90 | # color="white", 91 | # linewidth=0.5 92 | # ) 93 | 94 | # Create custom colormap 95 | colors = plt.cm.inferno(np.linspace(0, 1, 256)) 96 | colors[0] = (1, 1, 1, 0) # Set color for 0 to white (or (1,1,1,0) for transparent) 97 | custom_cmap = ListedColormap(colors) 98 | 99 | ax.set_xticks([]) 100 | ax.set_yticks([]) 101 | 102 | im = ax.imshow(grid_counts, extent=(min_x, max_x, min_y, max_y), origin='lower', cmap=custom_cmap) 103 | 104 | 105 | # Adjust color limits 106 | 107 | # im.set_clim(0, 1000) # this is hardcoded for grid_size = 1 108 | im.set_clim(0, 600) 109 | 110 | cbar = plt.colorbar(im, fraction=0.020, pad=0.04) 111 | cbar.ax.set_ylabel('Number of samples', rotation=90, labelpad=2) 112 | y_tick_labels = ["1", "100", "200", "300", "400", "500", ">600"] 113 | y_tick_loc = [1, 100, 200, 300, 400, 500, 600] 114 | cbar.set_ticks(y_tick_loc) 115 | cbar.ax.set_yticklabels(y_tick_labels) 116 | 117 | plt.savefig(f"/home/qbk152/vishal/MMEarth-data/1M_v001_plots/data_1M_v001/grid_{grid_size}_uni_biomes_whitebg_L1C.png", dpi=300, format='png', bbox_inches='tight') 118 | plt.savefig(f"/home/qbk152/vishal/MMEarth-data/1M_v001_plots/data_1M_v001/grid_{grid_size}_uni_biomes_whitebg_L1C.pdf", dpi=300, format='pdf', bbox_inches='tight') 119 | # plt.show() 120 | 121 | 122 | 123 | def create_density_custom(grid_size): 124 | file = open("data/missing_tiles_1M.csv") 125 | lines = file.readlines() 126 | 127 | minx, miny, maxx, maxy = -179.9863841350967, -86.78204367236995, 180.05963072575278, 83.48337010728358 128 | grid_size = 1 129 | 130 | x = np.arange(minx, maxx + grid_size, grid_size) 131 | y = np.arange(miny, maxy + grid_size, grid_size) 132 | 133 | grid_counts = np.zeros((len(x) - 1, len(y) - 1)) 134 | count = 0 135 | 136 | for line in lines: 137 | tile_id, lat, lon = line.split(",") 138 | lat = float(lat) 139 | lon = float(lon) 140 | 141 | x_idx = int((lon - minx) / grid_size) 142 | y_idx = int((lat - miny) / grid_size) 143 | 144 | grid_counts[x_idx][y_idx] += 1 145 | count += 1 146 | 147 | 148 | print(count) 149 | 150 | fig, ax = plt.subplots() 151 | grid_counts = np.transpose(grid_counts) 152 | 153 | im = ax.imshow(grid_counts, extent=(minx, maxx, miny, maxy), origin='lower', cmap='inferno') 154 | im.set_clim(0, 50) # this is hardcoded for grid_size = 1 155 | # im.set_clim(-2, 15) # this is hardcoded for grid_size = 0.1 156 | 157 | plt.colorbar(im, fraction=0.020, pad=0.04) 158 | 159 | 160 | # plt.show() 161 | plt.savefig(f"data_exp/missing_tiles_1M_{grid_size}.png", dpi=300) 162 | 163 | 164 | if __name__ == "__main__": 165 | create_density(1.1) 166 | # create_density(0.1) 167 | # create_density(0.01) 168 | # create_density_custom(1) 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /data_exp/view_h5.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A script to view the contents of the h5 file. 
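A minimal, standalone sketch of the kind of inspection this script automates (the file
path is a placeholder; the dataset names mirror the keys used further below):

    import h5py

    with h5py.File('/path/to/data.h5', 'r') as f:           # placeholder path
        for key in f.keys():                                 # e.g. 'sentinel2', 'sentinel1', 'metadata'
            print(key, f[key].shape, f[key].dtype)
        tile_id = f['metadata'][0][0].decode('utf-8')        # tile ids are stored as bytes
        print('first tile id:', tile_id)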
3 | ''' 4 | 5 | 6 | import h5py 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import random 10 | import os 11 | import json 12 | from matplotlib.colors import ListedColormap, BoundaryNorm 13 | 14 | 15 | 16 | h5_path = '/projects/dereeco/data/global-lr/data_1M_130_new/data_1M_130_new.h5' 17 | tile_info_path = '/home/qbk152/vishal/global-lr/data/data_1M_130_new/data_1M_130_new_tile_info.json' 18 | save_dir = '/home/qbk152/vishal/global-lr/data/visualizations/130' 19 | splits_path = '/home/qbk152/vishal/global-lr/data/data_1M_130_new/data_1M_130_new_splits.json' 20 | 21 | # display_num = 20 # number of tiles to display 22 | # display_id = 'tsmbfsngfsf_29' 23 | display_id = 'mfwsnsasfmf_485' 24 | 25 | 26 | save_tif = False 27 | 28 | 29 | 30 | def save_img(path, img, cmap = None, norm = None): 31 | if cmap is None: 32 | plt.imshow(img) 33 | else: 34 | if norm is None: 35 | plt.imshow(img, cmap=cmap) 36 | else: 37 | plt.imshow(img, cmap=cmap, norm=norm) 38 | plt.axis('off') 39 | plt.savefig(path, bbox_inches='tight', pad_inches=0) 40 | plt.close() 41 | 42 | 43 | 44 | def view_h5(h5_path): 45 | 46 | hdf5_file = h5py.File(h5_path, 'r') 47 | print('h5 KEYS: ', hdf5_file.keys()) 48 | splits = json.load(open(splits_path, 'r')) 49 | meta = hdf5_file['metadata'] 50 | 51 | num_tiles = len(meta) 52 | 53 | if display_id is not None: 54 | for i in range(num_tiles): 55 | tile_id = meta[i][0].decode('utf-8') 56 | if tile_id == display_id: 57 | dir = os.path.join(save_dir, tile_id) 58 | os.makedirs(dir, exist_ok=True) 59 | for key in hdf5_file.keys(): 60 | # if len(hdf5_file[key].shape) != 4: 61 | # continue 62 | # print('Key: ', key) 63 | img = hdf5_file[key][i] 64 | write_img(img, key, dir) 65 | else: 66 | # choose a random tile 67 | for j in range(display_num): 68 | i = random.randint(0, num_tiles - 1) 69 | tile_id = meta[i][0].decode('utf-8') 70 | print('Tile ID: ', tile_id) 71 | 72 | train = splits['train'] 73 | val = splits['val'] 74 | test = splits['test'] 75 | 76 | if i in train: 77 | print('Train, idx: ', i) 78 | elif i in val: 79 | print('Val, idx: ', i) 80 | elif i in test: 81 | print('Test, idx: ', i) 82 | 83 | 84 | if not save_tif: 85 | dir = os.path.join(save_dir, tile_id) 86 | os.makedirs(dir, exist_ok=True) 87 | for key in hdf5_file.keys(): 88 | # we only want to visualize sentinel2 for now TESTING PURPOSES 89 | # if key != 'sentinel2': 90 | # continue 91 | if len(hdf5_file[key].shape) != 4: 92 | continue 93 | # print('Key: ', key) 94 | img = hdf5_file[key][i] 95 | write_img(img, key, dir) 96 | 97 | def write_img(img, key, dir): 98 | if key == 'sentinel2': 99 | img = img[[3, 2, 1], :, :]/10000 100 | clip_val = 0.2 101 | img = np.clip(img, 0, clip_val) 102 | img = img/clip_val 103 | 104 | img = img.transpose(1, 2, 0) 105 | 106 | # plt.imsave(os.path.join(dir, 'sentinel2.png'), img) 107 | save_img(os.path.join(dir, 's2.png'), img) 108 | 109 | elif key == 'sentinel1': 110 | bands_map = {'VV': 0, 'VH': 1, 'HH': 2, 'HV': 3} 111 | orbit_map = {'asc': 0, 'desc': 4} 112 | # write each band separately 113 | for band, band_idx in bands_map.items(): 114 | for orbit, orbit_idx in orbit_map.items(): 115 | img_ = img[orbit_idx + band_idx, :, :] 116 | # print(np.min(img_), np.max(img_)) 117 | img_ = (np.clip(img_, -30, 0) + 30) / 30 118 | # print(np.min(img_), np.max(img_)) 119 | # plt.imsave(os.path.join(dir, 's1_' + band + '_' + orbit + '.png'), img_) 120 | save_img(os.path.join(dir, 's1_' + band + '_' + orbit + '.png'), img_) 121 | 122 | elif key == 'aster': 123 | # write elevation and 
slope 124 | img_ = img[0, :, :] 125 | # plt.imsave(os.path.join(dir, 'aster_elevation.png'), img_) 126 | save_img(os.path.join(dir, 'aster_elevation.png'), img_) 127 | img_ = img[1, :, :] 128 | # plt.imsave(os.path.join(dir, 'aster_slope.png'), img_) 129 | save_img(os.path.join(dir, 'aster_slope.png'), img_) 130 | 131 | elif key == 'dynamic_world': 132 | # write the label band 133 | img_ = img[0, :, :] 134 | colors = ['#000000', '#419bdf', '#397d49', '#88b053', '#7a87c6', '#e49635', '#dfc35a', '#c4281b', '#a59b8f', '#b39fe1'] 135 | norm = BoundaryNorm([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], len(colors)) 136 | 137 | cmap = ListedColormap(colors) 138 | # plt.imsave(os.path.join(dir, 'dynamic_world.png'), img_, cmap=cmap) 139 | save_img(os.path.join(dir, 'dynamic_world.png'), img_, cmap=cmap, norm=norm) 140 | 141 | 142 | elif key == 'canopy_height_eth': 143 | # write the 2 bands 144 | img_ = img[0, :, :] 145 | # plt.imsave(os.path.join(dir, 'canopy_height_height.png'), img_) 146 | save_img(os.path.join(dir, 'canopy_height_height.png'), img_) 147 | img_ = img[1, :, :] 148 | # plt.imsave(os.path.join(dir, 'canopy_height_std.png'), img_) 149 | save_img(os.path.join(dir, 'canopy_height_std.png'), img_) 150 | 151 | elif key == 'esa_worldcover': 152 | img_ = img[0, :, :] 153 | colormap = [ 154 | '#006400', # Tree cover - 10 155 | '#ffbb22', # Shrubland - 20 156 | '#ffff4c', # Grassland - 30 157 | '#f096ff', # Cropland - 40 158 | '#fa0000', # Built-up - 50 159 | '#b4b4b4', # Bare / sparse vegetation - 60 160 | '#f0f0f0', # Snow and ice - 70 161 | '#0064c8', # Permanent water bodies - 80 162 | '#0096a0', # Herbaceous wetland - 90 163 | '#00cf75', # Mangroves - 95 164 | '#fae6a0' # Moss and lichen - 100 165 | ] 166 | 167 | bounds = [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100] 168 | norm = BoundaryNorm(bounds, len(colormap)) 169 | 170 | cmap = ListedColormap(colormap) 171 | # plt.imsave(os.path.join(dir, 'esa_worldcover.png'), img_, cmap=cmap, norm=norm) 172 | 173 | save_img(os.path.join(dir, 'esa_worldcover.png'), img_, cmap=cmap, norm=norm) 174 | 175 | else: 176 | print(key, img) 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | if __name__ == '__main__': 185 | os.makedirs(save_dir, exist_ok=True) 186 | view_h5(h5_path) -------------------------------------------------------------------------------- /ee_utils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalned/MMEarth-data/32fe297d76681cc9b1791239756f93ce027007b0/ee_utils/.DS_Store -------------------------------------------------------------------------------- /ee_utils/ee_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A general class to collect the data from GEE. Each function in the class will be a different dataset, and they share common variables like the start and end date, the projection etc. 
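A hedged sketch of how this class is typically driven (mirroring main_download.py further
below; the tile feature comes from the tiles GeoJSON and cfg from config/config_data.yaml):

    for tile in gj['features']:                   # gj = the loaded tiles GeoJSON
        s = ee_set(tile, cfg)                     # all downloading happens inside __init__
        if s.no_data:
            continue                              # no usable Sentinel-2 image for this tile
        tile_info[tile['properties']['tile_id']] = update_tile_info(tile, s, None)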
3 | ''' 4 | 5 | import io 6 | import config.ee_init 7 | import ee 8 | from matplotlib import pyplot as plt 9 | import numpy as np 10 | import requests 11 | import shutil 12 | import zipfile 13 | import os 14 | import logging 15 | from utils.utils import get_points_filter, get_ee_task_list, read_json 16 | import math 17 | import random 18 | import time 19 | from numpy.lib import recfunctions as rfn 20 | import h5py 21 | from retry import retry 22 | from multiprocessing import Pool, cpu_count 23 | from datetime import datetime, timedelta 24 | import hashlib 25 | 26 | BIOME_LABELS = read_json('./stats/biome_labels.json') 27 | ECOREGION_LABELS = read_json('./stats/eco_labels.json') 28 | 29 | 30 | class ee_set: 31 | def __init__(self, tile, cfg, tile_info = None): 32 | self.tile = tile 33 | self.crs = '' 34 | self.start_date = '2017-01-01' # the start date specifies the general time period for the data. The specific date is specified in the function. We consider a 2 year period 35 | self.end_date = '2020-12-31' # the end date specifies the general time period for the data. The specific date is specified in the function. We consider a 2 year period 36 | self.s2_date = '' 37 | self.s2_imageid = '' 38 | self.id = tile['properties']['tile_id'] 39 | self.polygon = ee.Geometry.Polygon(tile['geometry']['coordinates']) 40 | self.lon = self.polygon.centroid().coordinates().get(0).getInfo() 41 | self.lat = self.polygon.centroid().coordinates().get(1).getInfo() 42 | self.biome = BIOME_LABELS[tile['properties']['biome']] 43 | self.eco_region = ECOREGION_LABELS[tile['properties']['eco_region']] 44 | self.cfg = cfg # loading the config file 45 | self.export_folder = self.cfg.export_folder 46 | self.image_set = {} 47 | self.no_data = False 48 | self.return_dict = {} # dict to store the arr returned by export_pixels NOT USED 49 | self.img_bands = {} # a dictionary that stores the bands of each dataset acquired 50 | self.era5_data = {} 51 | self.proj = None 52 | self.s2_type = None 53 | coord_string = f"{self.lat}_{self.lon}" 54 | self.seed = int(hashlib.sha256(coord_string.encode('utf-8')).hexdigest(), 16) % 10**5 55 | random.seed(self.seed) 56 | 57 | 58 | 59 | 60 | if tile_info is not None: 61 | self.s2_date = tile_info['S2_DATE'] 62 | self.crs = tile_info['CRS'] 63 | # self.s2_imageid = tile_info['S2_IMAGEID'] 64 | if self.proj is None: 65 | self.proj = ee.Projection(self.crs).atScale(10) 66 | 67 | 68 | # start series of function calls to get the data 69 | for function_name in cfg.datasets: 70 | if hasattr(self, function_name) and callable(getattr(self, function_name)): 71 | if getattr(self, function_name)() is False: 72 | logging.error(f"Function {function_name} returned None") 73 | 74 | if function_name == 'sentinel2': 75 | logging.error(f"Skipping tile {self.id}") 76 | self.no_data = True 77 | break 78 | else: 79 | logging.error(f"Function {function_name} does not exist") 80 | 81 | 82 | 83 | 84 | # merging all the images into one image - comment these lines if you want to export the images seperately 85 | if not self.no_data: 86 | merged_image = self.image_set[self.cfg.datasets[0]] 87 | for data_name, image in self.image_set.items(): 88 | if data_name == self.cfg.datasets[0]: 89 | continue 90 | 91 | if isinstance(image, dict): 92 | for extra_info, img in image.items(): 93 | if img is None: 94 | continue 95 | merged_image = ee.Image.cat([merged_image, img]) 96 | elif image is None: 97 | continue 98 | else: 99 | merged_image = ee.Image.cat([merged_image, image]) 100 | 101 | self.image_set = {} 102 | if 
tile_info is not None: 103 | self.image_set['extra'] = merged_image 104 | else: 105 | self.image_set['merged'] = merged_image 106 | 107 | 108 | if not self.no_data: 109 | start = time.time() 110 | try: 111 | self.export_local_single() 112 | except Exception as e: 113 | logging.error(f"Error exporting to local directory: {e}") 114 | self.no_data = True 115 | # self.export_local_parallel() 116 | logging.debug(f"Time taken for exporting all: {time.time() - start}") 117 | 118 | 119 | 120 | 121 | ################################################################################################################################################################################################ 122 | # THE FOLLOWING SET OF FUNCTIONS ARE FOR GETTING THE DATA FROM GEE. WRITE A NEW FUNCTION FOR EACH DATASET 123 | # MAKE SURE THE NAME OF THE FUNCTION IS THE SAME AS THE NAME OF THE DATASET IN THE CONFIG FILE 124 | # FOR EACH FUNCTION YOU RETURN A DICTIONARY WITH THE NAME OF THE DATASET AS THE KEY AND THE IMAGE AS THE VALUE 125 | ################################################################################################################################################################################################ 126 | def sentinel2(self, cld_threshold = 10): 127 | ''' 128 | This function gets the sentinel2 data for the tile. The function searches for the least cloudy image in the time period and returns that image. To ensure 129 | that the image covers the entire tile, we use a points filter that only selects the images that have the bottom right and top left points of the tile. 130 | 131 | S2 is used as the base image, and hence we get the date and projection from this image. The bands are selected from the config file. 132 | 133 | ''' 134 | start = time.time() 135 | 136 | cfg = self.cfg.sentinel2 137 | data_name = cfg.name 138 | bands_l2a = list(cfg.BANDS[0]) 139 | bands_l1c = list(cfg.BANDS[1]) 140 | collection_l2a = cfg.collection[0] 141 | collection_l1c = cfg.collection[1] 142 | 143 | 144 | rnd_year = random.randint(2017, 2020) 145 | if rnd_year == 2018: 146 | # we only go up to november 2018 since l2a is global from dec 2018 147 | s_date = f"{rnd_year}-01-01" 148 | e_date = f"{rnd_year}-11-30" 149 | elif rnd_year == 2017 or rnd_year == 2020: 150 | s_date = f"{rnd_year}-01-01" 151 | e_date = f"{rnd_year}-12-31" 152 | elif rnd_year == 2019: 153 | # we also include dec 2018 154 | s_date = f"{rnd_year - 1}-12-01" 155 | e_date = f"{rnd_year}-12-31" 156 | 157 | # random.seed(self.seed) 158 | if random.randint(0, 1) == 0: 159 | S2 = ee.ImageCollection(collection_l2a)\ 160 | .filterBounds(self.polygon)\ 161 | .filterDate(f"{s_date}", f"{e_date}")\ 162 | .filterMetadata('CLOUDY_PIXEL_PERCENTAGE', 'less_than', cld_threshold) 163 | self.s2_type = 'l2a' 164 | 165 | if S2.size().getInfo() == 0: 166 | S2 = ee.ImageCollection(collection_l1c)\ 167 | .filterBounds(self.polygon)\ 168 | .filterDate(f"{s_date}", f"{e_date}")\ 169 | .filterMetadata('CLOUDY_PIXEL_PERCENTAGE', 'less_than', cld_threshold) 170 | self.s2_type = 'l1c' 171 | else: 172 | S2 = ee.ImageCollection(collection_l1c)\ 173 | .filterBounds(self.polygon)\ 174 | .filterDate(f"{s_date}", f"{e_date}")\ 175 | .filterMetadata('CLOUDY_PIXEL_PERCENTAGE', 'less_than', cld_threshold) 176 | self.s2_type = 'l1c' 177 | 178 | # points_filter = get_points_filter(self.polygon, buffer_size = -200) 179 | # filtered_images = S2.filter(points_filter) 180 | 181 | filtered_images = S2.filter(ee.Filter.contains('.geo', self.polygon.buffer(200))) 182 | 183 | 
num_filtered_images = filtered_images.size().getInfo() 184 | if num_filtered_images == 0: 185 | logging.error('\t No sentinel2 image found for both l1c and l2a') 186 | return False 187 | img_list = filtered_images.toList(filtered_images.size()) 188 | random_number = random.randint(0, num_filtered_images - 1) 189 | sampled_image_full = ee.Image(img_list.get(random_number)) 190 | # Select the desired bands and clip the image 191 | if self.s2_type == 'l2a': 192 | if "MSK_CLDPRB" in sampled_image_full.bandNames().getInfo(): 193 | sampled_image = sampled_image_full.select(bands_l2a).clip(self.polygon).float() 194 | else: 195 | new_bands = [band for band in bands_l2a if band != 'MSK_CLDPRB'] 196 | bands_l2a = new_bands 197 | sampled_image = sampled_image_full.select(new_bands).clip(self.polygon).float() 198 | else: 199 | sampled_image = sampled_image_full.select(bands_l1c).clip(self.polygon).float() 200 | 201 | 202 | try: 203 | self.s2_date = sampled_image.date().format('YYYY-MM-dd').getInfo() 204 | except ee.ee_exception.EEException: 205 | tmp = ee.Image(img_list.get(random_number)) 206 | logging.error(f"type: {self.s2_type}, num images in collection: {num_filtered_images}") 207 | logging.error(f"bands: {tmp.bandNames().getInfo()}") 208 | 209 | tmp = sampled_image.select('B4') 210 | self.proj = tmp.projection() 211 | self.crs = self.proj.getInfo()['crs'] 212 | 213 | logging.debug(f"\t ID: {self.id}\ 214 | \nBiome name: {self.tile['properties']['biome']}\ 215 | \nEco-region name: {self.tile['properties']['eco_region']}\ 216 | \nDate: {self.s2_date}\ 217 | \nProjection: {self.crs}\ 218 | \nLat: {self.lat} Lon: {self.lon}\ 219 | \nPolygon: {self.polygon.getInfo()['coordinates']}\ 220 | \nS2 type:{self.s2_type}"\ 221 | ) 222 | if self.s2_type == 'l2a': 223 | scl = sampled_image.select(['SCL', 'QA60']).reproject(self.proj) 224 | sampled_image = sampled_image.select([band for band in bands_l2a if band not in ['SCL', 'QA60']]).resample('bilinear').reproject(self.proj) 225 | sampled_image = sampled_image.addBands(scl) 226 | else: 227 | qa60 = sampled_image.select('QA60').reproject(self.proj) 228 | sampled_image = sampled_image.select([band for band in bands_l1c if band != 'QA60']).resample('bilinear').reproject(self.proj) 229 | sampled_image = sampled_image.addBands(qa60) 230 | self.image_set[data_name] = sampled_image 231 | self.img_bands[data_name] = sampled_image.bandNames().getInfo() 232 | logging.debug('\t Sentinel2 image loaded') 233 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 234 | 235 | 236 | 237 | 238 | def sentinel1(self): 239 | ''' 240 | This function gets the sentinel1 data for the tile. We already know the date and projection from the sentinel2 image, so we use that to get the sentinel1 image. 241 | If the image on that date is not available, we use the closest available image. We use the VV and VH bands from the image for both ascending and descending orbits. 
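        For reference, the ascending/descending images collected here are later packed into a
        fixed 8-channel array (see export_pixels below and view_h5.py); a small sketch of that
        assumed layout:

            import numpy as np

            S1_CHANNELS = {('asc', 'VV'): 0, ('asc', 'VH'): 1, ('asc', 'HH'): 2, ('asc', 'HV'): 3,
                           ('desc', 'VV'): 4, ('desc', 'VH'): 5, ('desc', 'HH'): 6, ('desc', 'HV'): 7}

            s1 = np.full((8, 128, 128), np.nan)   # polarisations/orbits that are missing stay NaN
            # s1[S1_CHANNELS[('asc', 'VV')]] = vv_ascending   # hypothetical 128x128 array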
242 | 
243 |         '''
244 |         cfg = self.cfg.sentinel1  # config entry for sentinel1 (assumed to be defined in config_data.yaml like the other datasets)
245 |         start = time.time()
246 |         data_name = cfg.name  # the name used as the image_set key and the export name
247 |         img = (ee.ImageCollection('COPERNICUS/S1_GRD')
248 |                 .filterDate(self.start_date, self.end_date) # gets images in the specified date range
249 |                 .filterBounds(self.polygon) # gets images that have some overlap with the tile
250 |                 .filter(ee.Filter.contains('.geo', self.polygon.buffer(200))) # gets images containing the tile plus some buffer
251 |                 .map(lambda image: image.clip(self.polygon)) # crops to tile
252 |                 .filterMetadata('instrumentMode', 'equals', 'IW') # selects for the interferometric wide swath mode
253 |                 .map(lambda image: image.set('date_difference', image.date().difference(self.s2_date, 'day').abs())) # calculate days off from S2 image
254 |                 .sort('date_difference')) # sort in ascending order by days off
255 | 
256 | 
257 |         # getting the ascending and descending images
258 |         img_asc = img.filterMetadata('orbitProperties_pass', 'equals', 'ASCENDING').first()
259 |         img_desc = img.filterMetadata('orbitProperties_pass', 'equals', 'DESCENDING').first()
260 | 
261 | 
262 | 
263 |         # selecting the bands, dropping the incidence angle band if present
264 |         try:
265 |             bands_asc = img_asc.bandNames().getInfo()
266 |             if 'angle' in bands_asc:
267 |                 bands_asc.remove('angle')
268 |         except ee.ee_exception.EEException:
269 |             logging.debug('\t No ascending image found')
270 |             img_asc = None
271 |         try:
272 |             bands_desc = img_desc.bandNames().getInfo()
273 |             if 'angle' in bands_desc:
274 |                 bands_desc.remove('angle')
275 |         except ee.ee_exception.EEException:
276 |             logging.debug('\t No descending image found')
277 |             img_desc = None
278 | 
279 |         # keep only the remaining bands (the angle band has already been removed above)
280 |         img_asc = img_asc.select(bands_asc).float() if img_asc is not None else None
281 |         img_desc = img_desc.select(bands_desc).float() if img_desc is not None else None
282 | 
283 |         # resampling the image
284 |         if img_asc is not None:
285 |             img_asc = img_asc.resample('bilinear').reproject(self.proj)
286 |         if img_desc is not None:
287 |             img_desc = img_desc.resample('bilinear').reproject(self.proj)
288 | 
289 |         self.image_set[data_name] = {}
290 |         self.image_set[data_name]['asc'] = img_asc
291 |         self.image_set[data_name]['desc'] = img_desc
292 | 
293 | 
294 |         self.img_bands[data_name + '_asc'] = bands_asc if img_asc is not None else None
295 |         self.img_bands[data_name + '_desc'] = bands_desc if img_desc is not None else None
296 | 
297 |         logging.debug('\t Sentinel1 image loaded')
298 |         logging.debug(f"Time taken for {data_name}: {time.time() - start}")
299 | 
300 | 
301 | 
302 |     def aster(self):
303 |         '''
304 |         This function gets the elevation data for the tile. The data consists of the elevation band; we also compute the slope from it and return both.
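        As a plain-NumPy illustration of the same idea (a sketch only, not what ee.Terrain.slope
        does internally), slope can be derived from an elevation grid with a 10 m pixel spacing,
        matching the export scale used in this repo:

            import numpy as np

            def slope_degrees(elevation, pixel_size=10.0):
                dz_dy, dz_dx = np.gradient(elevation, pixel_size)   # metres of rise per metre
                return np.degrees(np.arctan(np.hypot(dz_dx, dz_dy)))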
305 | ''' 306 | start = time.time() 307 | cfg = self.cfg.aster # getting the config for aster elevation data 308 | data_name = cfg.name # the name used to save the image in the image_set dictionary and the export name 309 | bands = list(cfg.BANDS) # the bands to be selected from the image 310 | 311 | elevation = ee.Image(cfg.collection).clip(self.polygon).select(bands).float() 312 | slope = ee.Terrain.slope(elevation) 313 | merge = ee.Image.cat([elevation, slope]) 314 | 315 | 316 | 317 | # self.image_set[data_name]['elevation'] = elevation 318 | # self.image_set[data_name]['slope'] = slope 319 | merge = merge.resample('bilinear').reproject(self.proj) 320 | self.image_set[data_name] = merge 321 | self.img_bands[data_name] = merge.bandNames().getInfo() 322 | 323 | logging.debug('\t elevation and slope image loaded') 324 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 325 | 326 | 327 | def era5(self): 328 | ''' 329 | This function gets the ERA5 data for the tile. The ERA5 is only computed until the mid of 2020, hence we compute the same stats but for the period of 2018 - 2019. As per the world 330 | temperature stats, the average temperature from 2018 - 2021 was roughly the same. We compute 3 sets of stats. 1 for the current month, 1 for the previous month, and 1 for the full year. 331 | ''' 332 | 333 | start = time.time() 334 | 335 | cfg = self.cfg.era5 # getting the config for era5 336 | data_name = cfg.name 337 | bands = list(cfg.BANDS) 338 | 339 | parts = self.s2_date.split('-') 340 | year = int(parts[0]) 341 | month = int(parts[1]) 342 | 343 | end_date = (datetime(year, month, 1) + timedelta(days=32)).replace(day=1) - timedelta(days=1) 344 | end_date = end_date.strftime('%Y-%m-%d') 345 | if month > 1: 346 | month = month - 1 347 | start_date = f"{year}-{month}-01" 348 | else: 349 | month = 12 350 | start_date = f"{year - 1}-{month}-01" 351 | 352 | # getting 2 months in one image collection 353 | ERA5_monthly = ee.ImageCollection(cfg.collection)\ 354 | .filterDate(start_date, end_date)\ 355 | .map(lambda image: image.clip(self.polygon))\ 356 | .select(bands)\ 357 | .toBands() 358 | 359 | 360 | # getting the year in one image collection - we get exactly one year of stats including the current month. 
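        # A hedged, purely illustrative sketch of the same window arithmetic using datetime only,
        # which keeps months zero-padded in the resulting ISO strings (the f-strings above can
        # yield e.g. "2019-3-01"); it is not wired into this function:
        #
        #   yr, mo = map(int, self.s2_date.split('-')[:2])
        #   anchor = datetime(yr, mo, 1)
        #   month_end   = (anchor + timedelta(days=32)).replace(day=1) - timedelta(days=1)
        #   month_start = (anchor - timedelta(days=1)).replace(day=1)      # first day of previous month
        #   year_start  = anchor.replace(year=anchor.year - 1)             # same month, one year back
        #   monthly_window = (month_start.strftime('%Y-%m-%d'), month_end.strftime('%Y-%m-%d'))
        #   yearly_window  = (year_start.strftime('%Y-%m-%d'), month_end.strftime('%Y-%m-%d'))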
361 | year, month, _ = map(int, self.s2_date.split('-')) 362 | 363 | # Calculate start_date and end_date for the year 364 | # we subtract 1 from the year to get the previous year 365 | start_date = f"{year - 1}-{month}-01" 366 | end_date = (datetime(year, month, 1) + timedelta(days=32)).replace(day=1) - timedelta(days=1) 367 | end_date = end_date.strftime('%Y-%m-%d') 368 | 369 | 370 | ERA5_yearly = ee.ImageCollection(cfg.collection)\ 371 | .filterDate(start_date, end_date)\ 372 | .map(lambda image: image.clip(self.polygon)) 373 | 374 | 375 | def compute_yearly(bandName, imageCollection): 376 | if 'min' in bandName: 377 | reducer = ee.Reducer.min() 378 | elif 'max' in bandName: 379 | reducer = ee.Reducer.max() 380 | elif 'total' in bandName: 381 | reducer = ee.Reducer.sum() 382 | else: 383 | reducer = ee.Reducer.mean() 384 | yearly = imageCollection.select(bandName).reduce(reducer) 385 | return yearly 386 | 387 | 388 | ERA5_yearly_image = ee.ImageCollection([compute_yearly(band, ERA5_yearly) for band in bands]).toBands().float() 389 | ERA5_combined = ee.Image.cat([ERA5_monthly, ERA5_yearly_image]) 390 | 391 | # if for some reason you wish to export them seperately - uncomment the following lines 392 | # self.image_set[data_name] = {} 393 | # self.image_set[data_name]['month1'] = ERA5_month1 394 | # self.image_set[data_name]['month2'] = ERA5_month2 395 | # self.image_set[data_name]['year'] = ERA5_yearly_image 396 | 397 | center_pixels = ERA5_combined.reduceRegion( 398 | reducer=ee.Reducer.mean(), 399 | geometry=self.polygon, 400 | scale=10 401 | ) 402 | center_pixels = center_pixels.getInfo() 403 | 404 | band_names = ERA5_combined.bandNames().getInfo() 405 | 406 | self.era5_data['month1'] = [center_pixels[band] for band in band_names[:4]] 407 | self.era5_data['month2'] = [center_pixels[band] for band in band_names[4:8]] 408 | self.era5_data['year'] = [center_pixels[band] for band in band_names[8:]] 409 | 410 | 411 | # ERA5_combined = ERA5_combined.reproject(self.proj) 412 | # self.image_set[data_name] = ERA5_combined 413 | 414 | self.img_bands[data_name] = band_names 415 | 416 | 417 | logging.debug('\t ERA5 image loaded') 418 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 419 | 420 | 421 | def dynamic_world(self): 422 | ''' 423 | This function gets the dynamic world data for the tile. The dynamic world data is a collection of images with the same name as the sentinel 2 image for that tile. It consist of 9 classes, we add one more to indicate missing 424 | information. The classes are as follows: 425 | 0: No data 426 | 1: Water 427 | 2: Trees 428 | 3: Grass 429 | 4: Flooded vegetation 430 | 5: Crops 431 | 6: Shrub and scrub 432 | 7: Built 433 | 8: Bare 434 | 9: Snow and ice 435 | 436 | We choose the label band since that contains which of these labels were chosen. 
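        The remapping below simply shifts the native Dynamic World ids (0-8) up by one so that 0
        can be reserved for missing pixels. As a rough NumPy sketch of the same idea (the NaN
        handling is an assumption for illustration, not taken from this code):

            import numpy as np

            def shift_dw_labels(label):
                label = np.asarray(label, dtype=np.float32)          # allow NaN for masked pixels
                shifted = np.where(np.isnan(label), 0, label + 1)    # 0..8 -> 1..9, missing -> 0
                return shifted.astype(np.uint8)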
437 | ''' 438 | start = time.time() 439 | cfg = self.cfg.dynamic_world 440 | data_name = cfg.name 441 | bands = list(cfg.BANDS) 442 | 443 | year = self.s2_date.split('-')[0] 444 | start_date = f"{year}-01-01" 445 | end_date = f"{year}-12-31" 446 | dw_ic = ee.ImageCollection(cfg.collection)\ 447 | .filterBounds(self.polygon)\ 448 | .filterDate(start_date, end_date)\ 449 | .select(bands) 450 | 451 | 452 | 453 | def reclasify(image): 454 | label = image.select('label') 455 | label2 = label\ 456 | .where(image.eq(0), 1)\ 457 | .where(image.eq(1), 2)\ 458 | .where(image.eq(2), 3)\ 459 | .where(image.eq(3), 4)\ 460 | .where(image.eq(4), 5)\ 461 | .where(image.eq(5), 6)\ 462 | .where(image.eq(6), 7)\ 463 | .where(image.eq(7), 8)\ 464 | .where(image.eq(8), 9)\ 465 | .where(image.eq(9), 10) 466 | 467 | # replacing the label band with the new label band 468 | image = image.addBands(label2.rename('label2')) 469 | image = image.select('label2') 470 | image = image.rename('label') 471 | return image 472 | 473 | dw_ic = dw_ic.map(reclasify) 474 | dw_image = dw_ic.mode().clip(self.polygon) 475 | 476 | bands = dw_image.bandNames().getInfo() 477 | 478 | 479 | if len(bands) == 0: 480 | logging.debug('\t No dynamic world image found') 481 | self.image_set[data_name] = None 482 | else: 483 | dw_image = dw_image.reproject(self.proj) 484 | self.image_set[data_name] = dw_image 485 | self.img_bands[data_name] = dw_image.bandNames().getInfo() 486 | logging.debug('\t Dynamic world image loaded') 487 | 488 | 489 | 490 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 491 | 492 | def canopy_height_eth(self): 493 | ''' 494 | This function gets the ETH canopy height and standard deviation from the year 2020. 495 | ''' 496 | start = time.time() 497 | cfg = self.cfg.canopy_height_eth # getting the config for canopy_height_eth 498 | data_name = cfg.name # the name used to save the image in the image_set dictionary and the export name 499 | collections = list(cfg.COLLECTIONS) # the collections with single bands that will be used 500 | 501 | height = ee.Image(collections[0]).clip(self.polygon).float() 502 | std = ee.Image(collections[1]).clip(self.polygon).float() 503 | merge = ee.Image.cat([height, std]) 504 | 505 | merge = merge.resample('bilinear').reproject(self.proj) 506 | merge = merge.rename(['height', 'std']) 507 | self.image_set[data_name] = merge 508 | self.img_bands[data_name] = merge.bandNames().getInfo() 509 | 510 | logging.debug('\t ETH canopy height and std loaded') 511 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 512 | 513 | def esa_worldcover(self): 514 | ''' 515 | This function gets the esa worldcover data for the tile. 
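        The class codes follow the ESA WorldCover legend (the same values and names used in
        data_exp.py); for quick reference:

            ESA_WORLDCOVER_CLASSES = {
                10: 'Tree cover', 20: 'Shrubland', 30: 'Grassland', 40: 'Cropland',
                50: 'Built-up', 60: 'Bare / sparse vegetation', 70: 'Snow and ice',
                80: 'Permanent water bodies', 90: 'Herbaceous wetland',
                95: 'Mangroves', 100: 'Moss and lichen',
            }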
516 | ''' 517 | 518 | start = time.time() 519 | cfg = self.cfg.esa_worldcover # getting the config for esa_worldcover 520 | data_name = cfg.name # the name used to save the image in the image_set dictionary and the export name 521 | bands = list(cfg.BANDS) # the bands to be selected from the image 522 | 523 | dataset = ee.ImageCollection(cfg.collection).first().clip(self.polygon).select(bands) 524 | 525 | dataset = dataset.reproject(self.proj) 526 | 527 | self.image_set[data_name] = dataset 528 | self.img_bands[data_name] = dataset.bandNames().getInfo() 529 | 530 | logging.debug('\t esa worldcover loaded') 531 | logging.debug(f"Time taken for {data_name}: {time.time() - start}") 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | ################################################################################################################################################################################################ 541 | # THE FOLLOWING SET OF FUNCTIONS ARE FOR EXPORTING THE ABOVE DATA TO GCS OR LOCAL DIRECTORY 542 | ################################################################################################################################################################################################ 543 | 544 | def export_local_parallel(self): 545 | ''' 546 | A function that exports the data to the local directory. To make it parallel, we create the number of processes equal to the 547 | number of datasets. Each process will export one dataset. 548 | ''' 549 | start = time.time() 550 | # Create a process pool with a limited number of processes 551 | num_processes = min(len(self.image_set), cpu_count()) 552 | with Pool(num_processes) as pool: 553 | args_list = [(data_name, image) for data_name, image in self.image_set.items()] 554 | pool.starmap(self.export_local, args_list) 555 | logging.debug(f"Time taken for exporting all: {time.time() - start}") 556 | logging.info(f"Exported all images for {self.id}") 557 | 558 | 559 | @retry(tries=10, delay=1, backoff=2) 560 | def export_local(self, data_name, image): 561 | # data_name, image = args_list 562 | os.makedirs(f"{self.export_folder}/{data_name}", exist_ok=True) 563 | if isinstance(image, dict): 564 | for extra_info, img in image.items(): 565 | if img is None: 566 | continue 567 | start = time.time() 568 | url = img.getDownloadUrl({ 569 | 'name': f"{data_name}_{extra_info}_{self.id}", 570 | 'scale': 10, 571 | 'crs': self.crs, 572 | 'region': self.polygon.getInfo()['coordinates'], 573 | 'format': 'GeoTIFF', 574 | 'bands': img.bandNames().getInfo() 575 | }) 576 | logging.debug(f"time taken for getting url: {time.time() - start}") 577 | 578 | r = requests.get(url, stream=True, verify=False) 579 | if r.status_code == 200: 580 | with open(f"{self.export_folder}/{data_name}/{self.id}_{extra_info}.tif", 'wb') as f: 581 | f.write(r.content) 582 | logging.debug(f"Downloaded {data_name} to local directory") 583 | else: 584 | logging.debug(f"Error downloading {data_name} to local directory") 585 | return 586 | if image is None: 587 | return 588 | start = time.time() 589 | url = image.getDownloadUrl({ 590 | 'name': f"{data_name}_{self.id}", 591 | 'scale': 10, 592 | 'crs': self.crs, 593 | 'region': self.polygon.getInfo()['coordinates'], 594 | 'format': 'GeoTIFF', 595 | 'bands': image.bandNames().getInfo() 596 | }) 597 | logging.debug(f"time taken for getting url: {time.time() - start}") 598 | 599 | 600 | r = requests.get(url, stream=True, verify=False) 601 | 602 | 603 | if r.status_code == 200: 604 | with 
open(f"{self.export_folder}/{data_name}/{self.id}.tif", 'wb') as f: 605 | f.write(r.content) 606 | logging.debug(f"Downloaded {data_name} to local directory") 607 | else: 608 | logging.debug(f"Error downloading {data_name} to local directory") 609 | return 610 | 611 | @retry(tries=10, delay=1, backoff=2) 612 | def export_local_single(self): 613 | for data_name, image in self.image_set.items(): 614 | os.makedirs(f"{self.export_folder}/{data_name}", exist_ok=True) 615 | if isinstance(image, dict): 616 | for extra_info, img in image.items(): 617 | if img is None: 618 | continue 619 | start = time.time() 620 | url = img.getDownloadUrl({ 621 | 'name': f"{data_name}_{extra_info}_{self.id}", 622 | 'scale': 10, 623 | 'crs': self.crs, 624 | 'region': self.polygon.getInfo()['coordinates'], 625 | 'format': 'GeoTIFF', 626 | 'bands': img.bandNames().getInfo() 627 | }) 628 | logging.debug(f"time taken for getting url: {time.time() - start}") 629 | start = time.time() 630 | r = requests.get(url, stream=True, verify=False) 631 | if r.status_code == 200: 632 | with open(f"{self.export_folder}/{data_name}/{self.id}_{extra_info}.tif", 'wb') as f: 633 | f.write(r.content) 634 | logging.debug(f"Downloaded {data_name} to local directory") 635 | logging.debug(f"time taken for downloading: {time.time() - start}") 636 | 637 | else: 638 | logging.debug(f"Error downloading {data_name} to local directory") 639 | return 640 | if image is None: 641 | return 642 | start = time.time() 643 | url = image.getDownloadUrl({ 644 | 'name': f"{data_name}_{self.id}", 645 | 'scale': 10, 646 | 'crs': self.crs, 647 | 'region': self.polygon.getInfo()['coordinates'], 648 | 'format': 'GeoTIFF', 649 | 'bands': image.bandNames().getInfo() 650 | }) 651 | logging.debug(f"time taken for getting url: {time.time() - start}") 652 | start = time.time() 653 | r = requests.get(url, stream=True, verify=False) 654 | 655 | 656 | if r.status_code == 200: 657 | with open(f"{self.export_folder}/{data_name}/{self.id}.tif", 'wb') as f: 658 | f.write(r.content) 659 | logging.debug(f"Downloaded {data_name} to local directory") 660 | logging.debug(f"time taken for downloading: {time.time() - start}") 661 | else: 662 | logging.debug(f"Error downloading {data_name} to local directory") 663 | return 664 | 665 | 666 | def export(self): 667 | ''' 668 | Export the images to GCS. 
Sometimes the dictionary has a sub dictionary, for example sentinel1 has ascending and descending orbits, hence we create a sub dictionary for each orbit 669 | ''' 670 | logging.debug('\t ---- Exporting images to GCS ----') 671 | for data_name, image in self.image_set.items(): 672 | if isinstance(image, dict): 673 | for extra_info, img in image.items(): 674 | if img is None: 675 | continue 676 | task = ee.batch.Export.image.toCloudStorage( 677 | image = img, 678 | description = f"{data_name}_{extra_info}_{self.id}", 679 | bucket = self.cfg.bucket, 680 | fileNamePrefix = self.export_folder + '/' + data_name + '/' + self.id + '_' + extra_info , 681 | region = self.polygon.getInfo()['coordinates'], 682 | scale = 10, 683 | crs = self.crs, 684 | maxPixels = 1e13 685 | ) 686 | task.start() 687 | logging.debug(f"\t Exporting {data_name} to GCS") 688 | # print(task.status()) 689 | continue 690 | task = ee.batch.Export.image.toCloudStorage( 691 | image = image, 692 | description = f"{data_name}_{self.id}", 693 | bucket = self.cfg.bucket, 694 | fileNamePrefix = self.export_folder + '/' + data_name + '/' + self.id, 695 | region = self.polygon.getInfo()['coordinates'], 696 | scale = 10, 697 | crs = self.crs, 698 | maxPixels = 1e13 699 | ) 700 | task.start() 701 | logging.debug(f"\t Exporting {data_name} to GCS") 702 | 703 | # print(task.status()) 704 | 705 | 706 | @retry(tries=10, delay=1, backoff=2) 707 | def download_and_process_image(self, image, crs): 708 | url = image.getDownloadUrl({ 709 | 'bands': image.bandNames().getInfo(), 710 | 'region': self.polygon.getInfo()['coordinates'], 711 | 'scale': 10, 712 | 'format': 'NPY'}) 713 | r = requests.get(url) 714 | np_geotiff = np.load(io.BytesIO(r.content)) 715 | 716 | # np_geotiff = np.load(io.BytesIO(geotiff)) 717 | # cropping the image to 128x128 718 | new_shape = (128, 128) 719 | old_shape = np_geotiff.shape 720 | start_x = (old_shape[0] - new_shape[0]) // 2 721 | start_y = (old_shape[1] - new_shape[1]) // 2 722 | np_geotiff = np_geotiff[start_x:start_x + new_shape[0], start_y:start_y + new_shape[1]] 723 | 724 | arr = rfn.structured_to_unstructured(np_geotiff[list(np_geotiff.dtype.names)]) 725 | arr = arr.transpose(2, 0, 1) 726 | return arr, np_geotiff.dtype.names 727 | 728 | def export_pixels(self): 729 | ''' 730 | This function exports to the local directory using the computePixels functions. With this code, I am trying to store the np arrays directly into an HDF5 file. 
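        A minimal, standalone h5py sketch of that last step (dataset name, file name and array
        shape below are placeholders, not the layout produced by utils/convert_to_h5.py):

            import h5py
            import numpy as np

            arr = np.zeros((12, 128, 128), dtype=np.float32)         # one tile worth of data
            with h5py.File('tiles.h5', 'a') as f:
                if 'sentinel2' not in f:                             # growable dataset, one row per tile
                    f.create_dataset('sentinel2', shape=(0,) + arr.shape,
                                     maxshape=(None,) + arr.shape,
                                     dtype=arr.dtype, chunks=(1,) + arr.shape)
                ds = f['sentinel2']
                ds.resize(ds.shape[0] + 1, axis=0)
                ds[-1] = arr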
731 | ''' 732 | 733 | # Initialize dictionaries and arrays 734 | self.return_dict = {} 735 | 736 | for data_name, image in self.image_set.items(): 737 | if isinstance(image, dict): 738 | if data_name == 'sentinel1': 739 | arr = np.full((8, 128, 128), np.nan) 740 | elif data_name == 'era5': 741 | arr = np.full((12, 128, 128), np.nan) 742 | c = 0 743 | 744 | for extra_info, img in image.items(): 745 | if img is None: 746 | continue 747 | arr_t, bands_downloaded = self.download_and_process_image(img, self.crs) 748 | if data_name == 'sentinel1': 749 | if extra_info == 'asc': 750 | if 'VV' in bands_downloaded: 751 | arr[0] = arr_t[bands_downloaded.index('VV')] 752 | if 'VH' in bands_downloaded: 753 | arr[1] = arr_t[bands_downloaded.index('VH')] 754 | if 'HH' in bands_downloaded: 755 | arr[2] = arr_t[bands_downloaded.index('HH')] 756 | if 'HV' in bands_downloaded: 757 | arr[3] = arr_t[bands_downloaded.index('HV')] 758 | elif extra_info == 'desc': 759 | if 'VV' in bands_downloaded: 760 | arr[4] = arr_t[bands_downloaded.index('VV')] 761 | if 'VH' in bands_downloaded: 762 | arr[5] = arr_t[bands_downloaded.index('VH')] 763 | if 'HH' in bands_downloaded: 764 | arr[6] = arr_t[bands_downloaded.index('HH')] 765 | if 'HV' in bands_downloaded: 766 | arr[7] = arr_t[bands_downloaded.index('HV')] 767 | 768 | if data_name == 'era5': 769 | arr[c:c+4] = arr_t 770 | c += 4 771 | self.return_dict[data_name] = arr 772 | else: 773 | arr, bands_downloaded = self.download_and_process_image(image, self.crs) 774 | self.return_dict[data_name] = arr 775 | 776 | 777 | 778 | 779 | 780 | 781 | # def export_pixels(self): 782 | # ''' 783 | # This function also exports to the local directory. It is good for small files, and holds band information. With this code, i am trying to store the np arrays directly into a hdf5 file. 
784 | 785 | # ''' 786 | 787 | 788 | # for data_name, image in self.image_set.items(): 789 | # if isinstance(image, dict): 790 | # if data_name == 'sentinel1': 791 | # arr = np.full((8, 128, 128), np.nan) 792 | # elif data_name == 'era5': 793 | # arr = np.full((12, 128, 128), np.nan) 794 | # c = 0 795 | 796 | # for extra_info, img in image.items(): 797 | # if img is None: 798 | # continue 799 | # img = img.resample('bicubic').reproject(crs=self.crs, scale=10) 800 | # proj = ee.Projection(self.crs).atScale(10).getInfo() 801 | 802 | # request = { 803 | # 'expression': image, 804 | # 'fileFormat': 'NPY', 805 | # 'grid': { 806 | # 'affineTransform': { 807 | # 'scaleX': 10, 808 | # 'shearX': 0, 809 | # 'shearY': 0, 810 | # 'scaleY': -10, 811 | # }, 812 | # 'crsCode': proj['crs'], 813 | # }, 814 | 815 | # } 816 | # REQUEST = dict(request) 817 | # geotiff = ee.data.computePixels(REQUEST) 818 | # np_geotiff = np.load(io.BytesIO(geotiff)) 819 | 820 | # # cropping the image to 128x128 821 | # new_shape = (128, 128) 822 | # old_shape = np_geotiff.shape 823 | # start_x = (old_shape[0] - new_shape[0]) // 2 824 | # start_y = (old_shape[1] - new_shape[1]) // 2 825 | # np_geotiff = np_geotiff[start_x:start_x + new_shape[0], start_y:start_y + new_shape[1]] 826 | 827 | # arr_t = np_geotiff.view(np.float32).reshape((-1,) + np_geotiff.shape) 828 | # if data_name == 'sentinel1': 829 | # bands = np_geotiff.dtype.names 830 | 831 | # if extra_info == 'asc': 832 | # # put the bands in the correct order [VV, VH, HH, HV], if they are present 833 | # if 'VV' in bands: 834 | # arr[0] = arr_t['VV'] 835 | # if 'VH' in bands: 836 | # arr[1] = arr_t['VH'] 837 | # if 'HH' in bands: 838 | # arr[2] = arr_t['HH'] 839 | # if 'HV' in bands: 840 | # arr[3] = arr_t['HV'] 841 | # elif extra_info == 'desc': 842 | # if 'VV' in bands: 843 | # arr[4] = arr_t['VV'] 844 | # if 'VH' in bands: 845 | # arr[5] = arr_t['VH'] 846 | # if 'HH' in bands: 847 | # arr[6] = arr_t['HH'] 848 | # if 'HV' in bands: 849 | # arr[7] = arr_t['HV'] 850 | # if data_name == 'era5': 851 | # arr[c:c+4] = arr_t 852 | # c += 4 853 | 854 | # self.return_dict[data_name] = arr 855 | 856 | 857 | # image = image.resample('bicubic').reproject(crs=self.crs, scale=10) 858 | # proj = ee.Projection(self.crs).atScale(10).getInfo() 859 | 860 | # request = { 861 | # 'expression': image, 862 | # 'fileFormat': 'NPY', 863 | # 'grid': { 864 | # 'affineTransform': { 865 | # 'scaleX': 10, 866 | # 'shearX': 0, 867 | # 'shearY': 0, 868 | # 'scaleY': -10, 869 | # }, 870 | # 'crsCode': proj['crs'], 871 | # }, 872 | 873 | # } 874 | 875 | # REQUEST = dict(request) 876 | # geotiff = ee.data.computePixels(REQUEST) 877 | # np_geotiff = np.load(io.BytesIO(geotiff)) 878 | 879 | # # cropping the image to 128x128 880 | # new_shape = (128, 128) 881 | # old_shape = np_geotiff.shape 882 | # start_x = (old_shape[0] - new_shape[0]) // 2 883 | # start_y = (old_shape[1] - new_shape[1]) // 2 884 | # np_geotiff = np_geotiff[start_x:start_x + new_shape[0], start_y:start_y + new_shape[1]] 885 | 886 | # arr = np_geotiff.view(np.float32).reshape((-1,) + np_geotiff.shape) 887 | # # display_array = rfn.structured_to_unstructured(np_geotiff[['B4', 'B3', 'B2']])/10000 888 | # # plt.imshow(display_array) 889 | # # plt.show() 890 | # self.return_dict[data_name] = arr 891 | 892 | 893 | # exit() 894 | # # writing the np to geoTIFF 895 | # from osgeo import gdal 896 | # import pyproj 897 | # from osgeo import osr 898 | 899 | # upp_left_coords = self.polygon.bounds().coordinates().get(0).getInfo()[3] 900 | 901 | # 
source_proj = pyproj.Proj(proj='latlong', datum='WGS84') 902 | # target_proj = pyproj.Proj(init=proj['crs']) 903 | 904 | # print(upp_left_coords) 905 | # print(proj['crs']) 906 | 907 | # upp_left_x, upp_left_y = pyproj.transform(source_proj, target_proj, upp_left_coords[0], upp_left_coords[1]) 908 | # tranform_var = (upp_left_x, 10, 0, upp_left_y, 0, -10) 909 | # driver = gdal.GetDriverByName('GTiff') 910 | # GDT_dtype = gdal.GDT_Float32 911 | # rows, cols = np_geotiff.shape[0], np_geotiff.shape[1] 912 | # band_num = len(np_geotiff.dtype.names) 913 | # outRaster = driver.Create('/Users/qbk152/Desktop/codes/global-LR/gdal-test.tif', cols, rows, band_num, GDT_dtype) 914 | 915 | # outRaster.SetGeoTransform(tranform_var) 916 | # for b in range(band_num): 917 | # outband = outRaster.GetRasterBand(b + 1) 918 | # outband.WriteArray(np_geotiff[np_geotiff.dtype.names[b]]) 919 | 920 | 921 | # outRasterSRS = osr.SpatialReference() 922 | # outRasterSRS.ImportFromEPSG(int(proj['crs'].split(':')[1])) 923 | # outRaster.SetProjection(outRasterSRS.ExportToWkt()) 924 | 925 | # outband.FlushCache() 926 | 927 | 928 | 929 | 930 | 931 | 932 | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 | 944 | 945 | 946 | 947 | 948 | 949 | -------------------------------------------------------------------------------- /main_download.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The main function to download all the data from GEE 3 | 4 | ''' 5 | 6 | 7 | import os 8 | import config.ee_init 9 | import ee 10 | import numpy as np 11 | import geojson 12 | import hydra 13 | from omegaconf import DictConfig, OmegaConf 14 | 15 | from ee_utils.ee_data import ee_set 16 | from utils.utils import read_geojson, update_tile_info 17 | import logging 18 | import h5py 19 | import time 20 | import warnings 21 | import json 22 | warnings.filterwarnings("ignore") 23 | 24 | 25 | 26 | 27 | @hydra.main(config_path='config', config_name='config_data') 28 | def main(cfg: DictConfig) -> None: 29 | print(OmegaConf.to_yaml(cfg)) 30 | 31 | 32 | # setting up the logger. Logs both to the console and to a file inside outputs/ date/ time. This is created by hydra 33 | numeric_level = getattr(logging, cfg.log.upper(), None) 34 | if not isinstance(numeric_level, int): 35 | raise ValueError('Invalid log level: %s' % cfg.log) 36 | logging.basicConfig( 37 | # filename='log.txt', # comment this line if you want to log to console only 38 | level=numeric_level, 39 | format='%(levelname)s : %(message)s', 40 | filemode='w' 41 | ) 42 | 43 | # reading the geojson file 44 | gj = read_geojson(cfg.tiles_path) 45 | datasets = cfg.datasets 46 | cfg.update_geojson = True 47 | 48 | # if sentinel2 is in the datasets, then we need to update the geojson file to include the date of the image, crs, and other details. 49 | # if it is not present, this means that we are downloading other datasets, and hence need to use the geojson data instead of updating 50 | # cfg.update_geojson = 'sentinel2' in datasets 51 | cfg.read_tile_info = not 'sentinel2' in datasets # if we are downloading things other than s2, we need to read the tile information from the geojson. 52 | 53 | tile_info_dict = {} 54 | if cfg.read_tile_info: 55 | logging.info('Reading tile information from geojson. Please note that any errors with the tiles.geojson, implies that you did not download Sentinel 2 yet. Please download Sentinel 2 first. 
') 56 | # tile_info = json.load(open(cfg.tile_info_path, 'r')) 57 | tile_info = json.load(open('/projects/dereeco/data/global-lr/data_1M_v001/data_1M_v001_tile_info.json', 'r')) 58 | else: 59 | tile_info = None 60 | 61 | 62 | i = cfg.start_from 63 | end = min(cfg.end_at, len(gj['features'])) 64 | 65 | start = time.time() 66 | 67 | while i < end: 68 | start_ = time.time() 69 | logging.info(f'####################### Processing tile [{i}/{len(gj["features"])}] #######################') 70 | tile = gj['features'][i] 71 | id = tile['properties']['tile_id'] 72 | if cfg.read_tile_info and id not in tile_info.keys(): 73 | # this is not in tile info, hence the s2 has not been downloaded yet. so we skip this tile 74 | logging.info(f"Tile {id} not in tile_info. Skipping") 75 | i += 1 76 | continue 77 | # creating the ee_set object, the function calls are inside the constructor, hence it will automatically download the data 78 | if cfg.read_tile_info: 79 | ee_set_ = ee_set(tile, cfg, tile_info=tile_info[id]) 80 | else: 81 | ee_set_ = ee_set(tile, cfg) 82 | logging.debug(f"Time taken for 1 tile: {time.time() - start_}") 83 | if cfg.update_geojson and not ee_set_.no_data: 84 | tile_info_dict[id] = update_tile_info(tile, ee_set_, tile_info[id] if tile_info is not None else None) 85 | os.makedirs(f"{cfg.tile_info_path}", exist_ok=True) 86 | with open(f"{cfg.tile_info_path}/tile_info_{cfg.start_from}_{cfg.end_at}.json", 'w') as f: 87 | geojson.dump(tile_info_dict, f) 88 | elif ee_set_.no_data: 89 | logging.info(f"no sentinel2 data for this tile. Skipping") 90 | gj['features'].pop(i) 91 | i -= 1 92 | 93 | 94 | 95 | i += 1 96 | # break # we only want to download one tile for now 97 | logging.info(f"TOTAL TIME TAKEN: {time.time() - start}") 98 | logging.info(f"AVG TIME TAKEN: {(time.time() - start)/(end - cfg.start_from)}") 99 | 100 | if __name__ == "__main__": 101 | main() 102 | 103 | 104 | -------------------------------------------------------------------------------- /post_download.py: -------------------------------------------------------------------------------- 1 | # a file to call other functions after the download is complete (post download) 2 | 3 | 4 | from utils.utils import merge_dicts 5 | from utils.normalization import compute_band_stats 6 | from utils.splits import create_splits 7 | import os 8 | import argparse 9 | 10 | def main(args): 11 | ''' 12 | A function to call other functions after the download is complete. 
13 | merges all the tile_info files into a single file (these are temporary files created by the slurm jobs) 14 | converts the downloaded data to h5 format 15 | computes the band stats 16 | creates the splits (train and valid only) 17 | ''' 18 | 19 | # print('Merging the tile_info files for all slurm jobs into a single file') 20 | # out_path = os.path.join(args.data_dir, args.data_dir.split('/')[-1] + '_tile_info.json') if args.data_dir[-1] != '/' else os.path.join(args.data_dir, args.data_dir.split('/')[-2] + '_tile_info.json') 21 | # in_path = os.path.join(args.data_dir, 'tile_info') 22 | # merge_dicts(in_path, out_path) 23 | 24 | # print('converting to h5') 25 | # os.system(f'python -u utils/convert_to_h5.py --mode create --data_dir {args.data_dir}') 26 | 27 | print('computing band stats') 28 | compute_band_stats(data_folder = args.data_dir) 29 | 30 | # print('computing splits') 31 | # create_splits(data_folder = args.data_dir) 32 | 33 | 34 | 35 | 36 | 37 | 38 | if __name__ == '__main__': 39 | parser = argparse.ArgumentParser() 40 | # provide the name of the output folder, by default the path of the output json is the name followed by _tile_info.json 41 | parser.add_argument('--data_dir', type=str, help='path to the output folder', required=True) 42 | args = parser.parse_args() 43 | main(args) 44 | -------------------------------------------------------------------------------- /redownload.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import glob 4 | import argparse 5 | 6 | 7 | # set the following values based on the slurm script. 8 | # num_of_tiles = 1300000 9 | # num_of_jobs = 40 10 | # num_tiles_per_job = 33000 11 | # tile_info_path = 'data/tile_info/*.json' 12 | 13 | def main(args): 14 | # num_tiles_per_job = num_of_tiles // num_of_jobs 15 | num_of_tiles = args.num_of_tiles 16 | num_of_jobs = args.num_of_jobs 17 | num_tiles_per_job = args.num_tiles_per_job 18 | tile_info_path = args.tile_info_path 19 | 20 | 21 | tile_info_files = glob.glob(tile_info_path) 22 | 23 | start = [] 24 | stop = [] 25 | total_files = 0 26 | 27 | for i in range(num_tiles_per_job, num_of_tiles+1, num_tiles_per_job): 28 | files = [] 29 | for f in tile_info_files: 30 | # get all the files with the last number equal to i. 
This gets all the files 31 | # processed in that job or in subsequent redownloads 32 | if (f.split('.')[0].split('_')[-1] == str(i)): 33 | files.append(f) 34 | 35 | # read each file and append the count 36 | count = 0 37 | for f in files: 38 | print(f) 39 | count += len(json.load(open(f, 'r')).keys()) 40 | print(f"Number of tiles processed: {count}") 41 | start.append(i-num_tiles_per_job + count) 42 | stop.append(i) 43 | total_files += num_tiles_per_job - count 44 | 45 | # print(i + count - 1, i - 1)cl 46 | 47 | 48 | 49 | # break 50 | for i in range(len(start)): 51 | print(f"Job {i} : {start[i]} to {stop[i]}") 52 | 53 | print(f"Total number of files to download: {total_files}") 54 | 55 | # write the start and stop to a file 56 | with open('start_stop_redownload.txt', 'w') as f: 57 | for i in range(len(start)): 58 | f.write(f"{start[i]} {stop[i]}\n") 59 | 60 | 61 | 62 | if __name__ == '__main__': 63 | parser = argparse.ArgumentParser() 64 | 65 | parser.add_argument('--num_of_tiles', type=int, help='total number of tiles already downloaded', default=1300000) 66 | parser.add_argument('--num_of_jobs', type=int, help='total number of parallel slurm jobs when downloading the full tiles', default=40) 67 | parser.add_argument('--num_tiles_per_job', type=int, help='total number of tiles processed per job', default=33000) 68 | parser.add_argument('--tile_info_path', type=str, help='path to the tile_info files', default='data/tile_info/*.json', required=True) 69 | 70 | args = parser.parse_args() 71 | 72 | main(args) 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anyio==4.0.0 2 | appnope==0.1.3 3 | argon2-cffi==23.1.0 4 | argon2-cffi-bindings==21.2.0 5 | arrow==1.2.3 6 | asttokens==2.4.0 7 | async-lru==2.0.4 8 | attrs==23.1.0 9 | Babel==2.12.1 10 | backcall==0.2.0 11 | beautifulsoup4==4.12.2 12 | bleach==6.0.0 13 | braceexpand==0.1.7 14 | cachetools==5.3.1 15 | certifi==2023.7.22 16 | cffi==1.15.1 17 | charset-normalizer==3.2.0 18 | click==8.1.7 19 | click-plugins==1.1.1 20 | cligj==0.7.2 21 | comm==0.1.4 22 | contourpy==1.1.0 23 | cycler==0.11.0 24 | debugpy==1.7.0 25 | decorator==5.1.1 26 | defusedxml==0.7.1 27 | earthengine-api==0.1.367 28 | exceptiongroup==1.1.3 29 | executing==1.2.0 30 | fastjsonschema==2.18.0 31 | Fiona==1.9.4.post1 32 | fonttools==4.42.1 33 | fqdn==1.5.1 34 | geojson==3.0.1 35 | geopandas==0.13.2 36 | google-api-core==2.11.1 37 | google-api-python-client==2.97.0 38 | google-auth==2.22.0 39 | google-auth-httplib2==0.1.0 40 | google-cloud-core==2.3.3 41 | google-cloud-storage==2.10.0 42 | google-crc32c==1.5.0 43 | google-resumable-media==2.5.0 44 | googleapis-common-protos==1.60.0 45 | httplib2==0.22.0 46 | hydra-core==1.3.2 47 | idna==3.4 48 | imagecodecs==2023.9.4 49 | importlib-metadata==6.8.0 50 | importlib-resources==6.0.1 51 | ipykernel==6.25.2 52 | ipython==8.15.0 53 | ipython-genutils==0.2.0 54 | ipywidgets==8.1.0 55 | isoduration==20.11.0 56 | jedi==0.19.0 57 | Jinja2==3.1.2 58 | json5==0.9.14 59 | jsonpointer==2.4 60 | jsonschema==4.19.0 61 | jsonschema-specifications==2023.7.1 62 | kiwisolver==1.4.5 63 | MarkupSafe==2.1.3 64 | matplotlib==3.7.2 65 | matplotlib-inline==0.1.6 66 | mistune==3.0.1 67 | nbclient==0.8.0 68 | nbconvert==7.8.0 69 | nbformat==5.9.2 70 | nest-asyncio==1.5.7 71 | notebook==7.0.3 72 | notebook_shim==0.2.3 73 | numpy==1.25.2 74 | omegaconf==2.3.0 75 | overrides==7.4.0 76 | 
packaging==23.1 77 | pandas==2.1.0 78 | pandocfilters==1.5.0 79 | parso==0.8.3 80 | pexpect==4.8.0 81 | pickleshare==0.7.5 82 | Pillow==10.0.0 83 | platformdirs==3.10.0 84 | prometheus-client==0.17.1 85 | prompt-toolkit==3.0.39 86 | protobuf==4.24.2 87 | psutil==5.9.5 88 | ptyprocess==0.7.0 89 | pure-eval==0.2.2 90 | pyasn1==0.5.0 91 | pyasn1-modules==0.3.0 92 | pycparser==2.21 93 | Pygments==2.16.1 94 | pyparsing==3.0.9 95 | pyproj==3.6.0 96 | python-dateutil==2.8.2 97 | python-json-logger==2.0.7 98 | pytz==2023.3 99 | PyYAML==6.0.1 100 | pyzmq==25.1.1 101 | qtconsole==5.4.4 102 | QtPy==2.4.0 103 | referencing==0.30.2 104 | requests==2.31.0 105 | rfc3339-validator==0.1.4 106 | rfc3986-validator==0.1.1 107 | rpds-py==0.10.2 108 | rsa==4.9 109 | Send2Trash==1.8.2 110 | shapely==2.0.1 111 | six==1.16.0 112 | sniffio==1.3.0 113 | soupsieve==2.5 114 | stack-data==0.6.2 115 | terminado==0.17.1 116 | tifffile==2023.8.30 117 | tinycss2==1.2.1 118 | tomli==2.0.1 119 | tornado==6.3.3 120 | tqdm==4.66.1 121 | traitlets==5.9.0 122 | typing_extensions==4.7.1 123 | tzdata==2023.3 124 | uri-template==1.3.0 125 | uritemplate==4.1.1 126 | urllib3==1.26.16 127 | wcwidth==0.2.6 128 | webcolors==1.13 129 | webdataset==0.2.48 130 | webencodings==0.5.1 131 | websocket-client==1.6.3 132 | widgetsnbextension==4.0.8 133 | zipp==3.16.2 134 | retry 135 | h5py 136 | 137 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_create_tiles.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=geojson-download 3 | 4 | #SBATCH --tasks=1 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --time=2-00:00:00 7 | # PATH TO SAVE SLURM LOGS 8 | #SBATCH --output=/home/qbk152/vishal/slurm_logs/tiles-download-%A_%a_%x.out 9 | # TOTAL MEMORY PER NODE 10 | #SBATCH --mem=4G 11 | #SBATCH --exclude=hendrixgpu01fl,hendrixgpu02fl,hendrixgpu07fl,hendrixgpu08fl,hendrixgpu03fl,hendrixgpu04fl 12 | echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 13 | 14 | 15 | 16 | python -u /home/qbk152/vishal/MMEarth-data/create_tiles_polygon.py\ 17 | tiles_geojson_path='/projects/dereeco/data/global-lr/geojson_files/tiles_1M_v001.geojson' \ 18 | num_of_images=1500000 \ 19 | tile_size=1300 \ 20 | uniform_type=0 \ 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_download_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=download 3 | 4 | #SBATCH --array=0-39 5 | #SBATCH --tasks=1 6 | #SBATCH --cpus-per-task=4 7 | #SBATCH --time=20-00:00:00 8 | # PATH TO SAVE SLURM LOGS 9 | #SBATCH --output=/home/qbk152/vishal/slurm_logs/slurm-%A_%a_%x.out 10 | # TOTAL MEMORY PER NODE 11 | #SBATCH --mem=16G 12 | #SBATCH --exclude=hendrixgpu01fl,hendrixgpu02fl,hendrixgpu07fl,hendrixgpu08fl,hendrixgpu03fl,hendrixgpu04fl 13 | 14 | echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 15 | 16 | 17 | task_per_job=37500 # this number is the total number of tiles divided by the number of jobs 18 | start_from=$((SLURM_ARRAY_TASK_ID * task_per_job)) 19 | end_at=$((start_from + task_per_job)) 20 | 21 | 22 | python /home/qbk152/vishal/MMEarth-data/main_download.py start_from=$start_from end_at=$end_at 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_download_seq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 
| #SBATCH --job-name=data-download 3 | 4 | #SBATCH --tasks=1 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --time=8-00:00:00 7 | # PATH TO SAVE SLURM LOGS 8 | #SBATCH --output=/home/qbk152/vishal/slurm_logs/slurm-%A_%a_%x.out 9 | # TOTAL MEMORY PER NODE 10 | #SBATCH --mem=4G 11 | #SBATCH --exclude=hendrixgpu01fl,hendrixgpu02fl,hendrixgpu07fl,hendrixgpu08fl,hendrixgpu03fl,hendrixgpu04fl 12 | 13 | echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 14 | 15 | 16 | 17 | python /home/qbk152/vishal/global-lr/main_download.py start_from=4518 end_at=7500 18 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_redownload_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=1-mil-download 3 | 4 | #SBATCH --array=0-39 5 | #SBATCH --tasks=1 6 | #SBATCH --cpus-per-task=4 7 | #SBATCH --time=8-00:00:00 8 | # PATH TO SAVE SLURM LOGS 9 | #SBATCH --output=/home/qbk152/vishal/slurm_logs/slurm-%A_%a_%x.out 10 | # TOTAL MEMORY PER NODE 11 | #SBATCH --mem=4G 12 | #SBATCH --exclude=hendrixgpu01fl,hendrixgpu02fl,hendrixgpu07fl,hendrixgpu08fl,hendrixgpu03fl,hendrixgpu04fl 13 | 14 | echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 15 | 16 | # Read start and stop values from the file for each task ID 17 | task_file="start_stop_redownload.txt" 18 | line_number=$((SLURM_ARRAY_TASK_ID + 1)) 19 | start_from=$(sed -n "${line_number}p" "$task_file" | awk '{print $1}') 20 | end_at=$(sed -n "${line_number}p" "$task_file" | awk '{print $2}') 21 | 22 | echo "Task ID: $SLURM_ARRAY_TASK_ID, Start from: $start_from, End at: $end_at" 23 | 24 | python /home/qbk152/vishal/global-lr/main_download.py start_from=$start_from end_at=$end_at 25 | -------------------------------------------------------------------------------- /slurm_scripts/slurm_temp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=chunk 3 | 4 | #SBATCH --tasks=1 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --time=2-00:00:00 7 | # PATH TO SAVE SLURM LOGS 8 | #SBATCH --output=/home/qbk152/vishal/slurm_logs/temp-%A_%a_%x.out 9 | # TOTAL MEMORY PER NODE 10 | #SBATCH --mem=32G 11 | #SBATCH --exclude=hendrixgpu01fl,hendrixgpu02fl,hendrixgpu07fl,hendrixgpu08fl,hendrixgpu03fl,hendrixgpu04fl 12 | 13 | echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" 14 | 15 | 16 | # use 16G of mem when running the below code 17 | # python -u convert_to_h5.py \ 18 | # --mode create \ 19 | # --data_dir /projects/dereeco/data/global-lr/data_300k_130/ \ 20 | # --tile_info /home/qbk152/vishal/global-lr/data/data_300k_130_tile_info.json \ 21 | # --output_file /projects/dereeco/data/global-lr/data_300k_130/data_300k_130.h5\ 22 | # --missing_tiles /home/qbk152/vishal/global-lr/data/missing_tiles_300k.csv 23 | 24 | # python -u utils/utils.py 25 | # python -u /home/qbk152/vishal/global-lr/normalization.py 26 | 27 | # python -u utils/convert_to_h5.py \ 28 | # --mode merge \ 29 | # --data_dir1 /projects/dereeco/data/global-lr/data_1M_130_new/ \ 30 | # --data_dir2 /projects/dereeco/data/global-lr/data_missing_130/ \ 31 | # --output_path /projects/dereeco/data/global-lr/data_1M_130_new/data_1M_130_new2.h5 \ 32 | 33 | python -u post_download.py \ 34 | --data_dir /projects/dereeco/data/global-lr/data_1M_v001/ 35 | 36 | # python -u data_exp/data_exp.py \ 37 | # --data_dir /projects/dereeco/data/global-lr/data_1M_130_new 38 | 39 | #python -u utils/chunking_h5.py \ 40 | # --h5_file_path 
/projects/dereeco/data/global-lr/data_1M_130_new/data_1M_130_new.h5 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /stats/biome_labels.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tundra": 0, 3 | "Tropical & Subtropical Moist Broadleaf Forests": 1, 4 | "Mediterranean Forests, Woodlands & Scrub": 2, 5 | "Deserts & Xeric Shrublands": 3, 6 | "Temperate Grasslands, Savannas & Shrublands": 4, 7 | "Boreal Forests/Taiga": 5, 8 | "Temperate Conifer Forests": 6, 9 | "Temperate Broadleaf & Mixed Forests": 7, 10 | "Montane Grasslands & Shrublands": 8, 11 | "Mangroves": 9, 12 | "Flooded Grasslands & Savannas": 10, 13 | "Tropical & Subtropical Grasslands, Savannas & Shrublands": 11, 14 | "Tropical & Subtropical Dry Broadleaf Forests": 12, 15 | "Tropical & Subtropical Coniferous Forests": 13 16 | } -------------------------------------------------------------------------------- /stats/biome_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "Tropical & Subtropical Moist Broadleaf Forests": 230, 3 | "Deserts & Xeric Shrublands": 102, 4 | "Temperate Broadleaf & Mixed Forests": 83, 5 | "Tropical & Subtropical Grasslands, Savannas & Shrublands": 58, 6 | "Tropical & Subtropical Dry Broadleaf Forests": 56, 7 | "Tundra": 51, 8 | "Temperate Grasslands, Savannas & Shrublands": 48, 9 | "Temperate Conifer Forests": 47, 10 | "Montane Grasslands & Shrublands": 46, 11 | "Mediterranean Forests, Woodlands & Scrub": 40, 12 | "Boreal Forests/Taiga": 26, 13 | "Flooded Grasslands & Savannas": 25, 14 | "Mangroves": 19, 15 | "Tropical & Subtropical Coniferous Forests": 15, 16 | "N/A": 1 17 | } -------------------------------------------------------------------------------- /stats/eco_labels.json: -------------------------------------------------------------------------------- 1 | { 2 | "Adelie Land tundra": 0, 3 | "Ahklun and Kilbuck Upland Tundra": 1, 4 | "Alaska-St. 
Elias Range tundra": 2, 5 | "Aleutian Islands tundra": 3, 6 | "Antipodes Subantarctic Islands tundra": 4, 7 | "Arctic coastal tundra": 5, 8 | "Russian Arctic desert": 6, 9 | "Arctic foothills tundra": 7, 10 | "Russian Bering tundra": 8, 11 | "Beringia lowland tundra": 9, 12 | "Beringia upland tundra": 10, 13 | "Brooks-British Range tundra": 11, 14 | "Canadian Low Arctic tundra": 12, 15 | "Central South Antarctic Peninsula tundra": 13, 16 | "Cherskii-Kolyma mountain tundra": 14, 17 | "Chukchi Peninsula tundra": 15, 18 | "Davis Highlands tundra": 16, 19 | "East Antarctic tundra": 17, 20 | "Ellsworth Land tundra": 18, 21 | "Ellsworth Mountains tundra": 19, 22 | "Enderby Land tundra": 20, 23 | "Canadian High Arctic tundra": 21, 24 | "Interior Yukon-Alaska alpine tundra": 22, 25 | "Kalaallit Nunaat High Arctic tundra": 23, 26 | "Kola Peninsula tundra": 24, 27 | "Canadian Middle Arctic Tundra": 25, 28 | "Northeast Antarctic Peninsula tundra": 26, 29 | "Northwest Antarctic Peninsula tundra": 27, 30 | "North Victoria Land tundra": 28, 31 | "Northeast Siberian coastal tundra": 29, 32 | "Northwest Russian-Novaya Zemlya tundra": 30, 33 | "Novosibirsk Islands Arctic desert": 31, 34 | "Ogilvie-MacKenzie alpine tundra": 32, 35 | "Pacific Coastal Mountain icefields and tundra": 33, 36 | "Prince Charles Mountains tundra": 34, 37 | "Scandinavian Montane Birch forest and grasslands": 35, 38 | "Scotia Sea Islands tundra": 36, 39 | "South Antarctic Peninsula tundra": 37, 40 | "South Orkney Islands tundra": 38, 41 | "South Victoria Land tundra": 39, 42 | "Southern Indian Ocean Islands tundra": 40, 43 | "Taimyr-Central Siberian tundra": 41, 44 | "Torngat Mountain tundra": 42, 45 | "Trans-Baikal Bald Mountain tundra": 43, 46 | "Transantarctic Mountains tundra": 44, 47 | "Wrangel Island Arctic desert": 45, 48 | "Yamal-Gydan tundra": 46, 49 | "Kamchatka tundra": 47, 50 | "Kalaallit Nunaat Arctic steppe": 48, 51 | "Dronning Maud Land tundra": 49, 52 | "Marie Byrd Land tundra": 50, 53 | "Admiralty Islands lowland rain forests": 51, 54 | "Albertine Rift montane forests": 52, 55 | "Alto Paran\u00e1 Atlantic forests": 53, 56 | "Andaman Islands rain forests": 54, 57 | "Araucaria moist forests": 55, 58 | "Atlantic Coast restingas": 56, 59 | "Congolian coastal forests": 57, 60 | "Bahia coastal forests": 58, 61 | "Bahia interior forests": 59, 62 | "Banda Sea Islands moist deciduous forests": 60, 63 | "Biak-Numfoor rain forests": 61, 64 | "Bolivian Yungas": 62, 65 | "Borneo lowland rain forests": 63, 66 | "Borneo peat swamp forests": 64, 67 | "Brahmaputra Valley semi-evergreen forests": 65, 68 | "Buru rain forests": 66, 69 | "Caatinga Enclaves moist forests": 67, 70 | "Cameroon Highlands forests": 68, 71 | "Caqueta moist forests": 69, 72 | "Cardamom Mountains rain forests": 70, 73 | "Carolines tropical moist forests": 71, 74 | "Catatumbo moist forests": 72, 75 | "Cauca Valley montane forests": 73, 76 | "Cayos Miskitos-San Andr\u00e9s and Providencia moist forests": 74, 77 | "Central American Atlantic moist forests": 75, 78 | "Central American montane forests": 76, 79 | "Central Congolian lowland forests": 77, 80 | "Central Polynesian tropical moist forests": 78, 81 | "Central Range Papuan montane rain forests": 79, 82 | "Chao Phraya freshwater swamp forests": 80, 83 | "Chao Phraya lowland moist deciduous forests": 81, 84 | "Chiapas montane forests": 82, 85 | "Chimalapas montane forests": 83, 86 | "Chin Hills-Arakan Yoma montane forests": 84, 87 | "Choc\u00f3-Dari\u00e9n moist forests": 85, 88 | "Christmas and Cocos Islands 
tropical forests": 86, 89 | "Cocos Island moist forests": 87, 90 | "Comoros forests": 88, 91 | "Cook Islands tropical moist forests": 89, 92 | "Cordillera La Costa montane forests": 90, 93 | "Cordillera Oriental montane forests": 91, 94 | "Costa Rican seasonal moist forests": 92, 95 | "Cross-Niger transition forests": 93, 96 | "Cross-Sanaga-Bioko coastal forests": 94, 97 | "Cuban moist forests": 95, 98 | "Eastern Congolian swamp forests": 96, 99 | "Eastern Cordillera Real montane forests": 97, 100 | "Eastern Guinean forests": 98, 101 | "East Deccan moist deciduous forests": 99, 102 | "Eastern Java-Bali montane rain forests": 100, 103 | "Eastern Java-Bali rain forests": 101, 104 | "Eastern Micronesia tropical moist forests": 102, 105 | "Eastern Panamanian montane forests": 103, 106 | "Ethiopian montane forests": 104, 107 | "Fernando de Noronha-Atol das Rocas moist forests": 105, 108 | "Fiji tropical moist forests": 106, 109 | "Granitic Seychelles forests": 107, 110 | "Greater Negros-Panay rain forests": 108, 111 | "Guianan freshwater swamp forests": 109, 112 | "Guianan Highlands moist forests": 110, 113 | "Guianan lowland moist forests": 111, 114 | "Guianan piedmont moist forests": 112, 115 | "Guinean montane forests": 113, 116 | "Guizhou Plateau broadleaf and mixed forests": 114, 117 | "Gurupa v\u00e1rzea": 115, 118 | "Hainan Island monsoon rain forests": 116, 119 | "Halmahera rain forests": 117, 120 | "Hawai'i tropical moist forests": 118, 121 | "Himalayan subtropical broadleaf forests": 119, 122 | "Hispaniolan moist forests": 120, 123 | "Huon Peninsula montane rain forests": 121, 124 | "Iquitos v\u00e1rzea": 122, 125 | "Irrawaddy freshwater swamp forests": 123, 126 | "Irrawaddy moist deciduous forests": 124, 127 | "Isthmian-Atlantic moist forests": 125, 128 | "Isthmian-Pacific moist forests": 126, 129 | "Jamaican moist forests": 127, 130 | "Japur\u00e1-Solim\u00f5es-Negro moist forests": 128, 131 | "Jian Nan subtropical evergreen forests": 129, 132 | "Juru\u00e1-Purus moist forests": 130, 133 | "Kayah-Karen montane rain forests": 131, 134 | "Kermadec Islands subtropical moist forests": 132, 135 | "Knysna-Amatole montane forests": 133, 136 | "Kwazulu Natal-Cape coastal forests": 134, 137 | "Leeward Islands moist forests": 135, 138 | "Lord Howe Island subtropical forests": 136, 139 | "Louisiade Archipelago rain forests": 137, 140 | "Lower Gangetic Plains moist deciduous forests": 138, 141 | "Luang Prabang montane rain forests": 139, 142 | "Luzon montane rain forests": 140, 143 | "Luzon rain forests": 141, 144 | "Madagascar humid forests": 142, 145 | "Madagascar subhumid forests": 143, 146 | "Madeira-Tapaj\u00f3s moist forests": 144, 147 | "Magdalena-Urab\u00e1 moist forests": 145, 148 | "Magdalena Valley montane forests": 146, 149 | "Malabar Coast moist forests": 147, 150 | "Maldives-Lakshadweep-Chagos Archipelago tropical moist forests": 148, 151 | "Maputaland coastal forests and woodlands": 149, 152 | "Maraj\u00f3 v\u00e1rzea": 150, 153 | "Mara\u00f1\u00f3n dry forests": 151, 154 | "Marquesas tropical moist forests": 152, 155 | "Mascarene forests": 153, 156 | "Mato Grosso tropical dry forests": 154, 157 | "Meghalaya subtropical forests": 155, 158 | "Mentawai Islands rain forests": 156, 159 | "Mindanao-Eastern Visayas rain forests": 157, 160 | "Mindanao montane rain forests": 158, 161 | "Mindoro rain forests": 159, 162 | "Mizoram-Manipur-Kachin rain forests": 160, 163 | "Monte Alegre v\u00e1rzea": 161, 164 | "Mount Cameroon and Bioko montane forests": 162, 165 | "Myanmar coastal rain 
forests": 163, 166 | "Nansei Islands subtropical evergreen forests": 164, 167 | "Napo moist forests": 165, 168 | "Negro-Branco moist forests": 166, 169 | "New Britain-New Ireland lowland rain forests": 167, 170 | "New Britain-New Ireland montane rain forests": 168, 171 | "New Caledonia rain forests": 169, 172 | "Nicobar Islands rain forests": 170, 173 | "Niger Delta swamp forests": 171, 174 | "Nigerian lowland forests": 172, 175 | "Norfolk Island subtropical forests": 173, 176 | "North Western Ghats moist deciduous forests": 174, 177 | "North Western Ghats montane rain forests": 175, 178 | "Northeast Brazil restingas": 176, 179 | "Northeast Congolian lowland forests": 177, 180 | "Northern Annamites rain forests": 178, 181 | "Northern Indochina subtropical forests": 179, 182 | "Northern Khorat Plateau moist deciduous forests": 180, 183 | "Northern New Guinea lowland rain and freshwater swamp forests": 181, 184 | "Northern New Guinea montane rain forests": 182, 185 | "Northern Swahili coastal forests": 183, 186 | "Northern Thailand-Laos moist deciduous forests": 184, 187 | "Northern Triangle subtropical forests": 185, 188 | "Northern Vietnam lowland rain forests": 186, 189 | "Northwest Andean montane forests": 187, 190 | "Northwest Congolian lowland forests": 188, 191 | "Oaxacan montane forests": 189, 192 | "Ogasawara subtropical moist forests": 190, 193 | "Orinoco Delta swamp forests": 191, 194 | "Orissa semi-evergreen forests": 192, 195 | "Palau tropical moist forests": 193, 196 | "Palawan rain forests": 194, 197 | "Pantanos de Centla": 195, 198 | "Pantepui forests & shrublands": 196, 199 | "Peninsular Malaysian montane rain forests": 197, 200 | "Peninsular Malaysian peat swamp forests": 198, 201 | "Peninsular Malaysian rain forests": 199, 202 | "Pernambuco coastal forests": 200, 203 | "Pernambuco interior forests": 201, 204 | "Peruvian Yungas": 202, 205 | "Pet\u00e9n-Veracruz moist forests": 203, 206 | "Puerto Rican moist forests": 204, 207 | "Purus-Madeira moist forests": 205, 208 | "Purus v\u00e1rzea": 206, 209 | "Queensland tropical rain forests": 207, 210 | "Rapa Nui and Sala y G\u00f3mez subtropical forests": 208, 211 | "Red River freshwater swamp forests": 209, 212 | "Rio Negro campinarana": 210, 213 | "Samoan tropical moist forests": 211, 214 | "Santa Marta montane forests": 212, 215 | "S\u00e3o Tom\u00e9, Pr\u00edncipe, and Annob\u00f3n forests": 213, 216 | "Seram rain forests": 214, 217 | "Serra do Mar coastal forests": 215, 218 | "Sierra de los Tuxtlas": 216, 219 | "Sierra Madre de Chiapas moist forests": 217, 220 | "Society Islands tropical moist forests": 218, 221 | "Solim\u00f5es-Japur\u00e1 moist forests": 219, 222 | "Solomon Islands rain forests": 220, 223 | "South China-Vietnam subtropical evergreen forests": 221, 224 | "South China Sea Islands": 222, 225 | "South Taiwan monsoon rain forests": 223, 226 | "South Western Ghats moist deciduous forests": 224, 227 | "South Western Ghats montane rain forests": 225, 228 | "Southeast Papuan rain forests": 226, 229 | "Southern Andean Yungas": 227, 230 | "Southern Annamites montane rain forests": 228, 231 | "Southern New Guinea freshwater swamp forests": 229, 232 | "Southern New Guinea lowland rain forests": 230, 233 | "Southern Swahili coastal forests and woodlands": 231, 234 | "Southwest Amazon moist forests": 232, 235 | "Southwest Borneo freshwater swamp forests": 233, 236 | "Sri Lanka lowland rain forests": 234, 237 | "Sri Lanka montane rain forests": 235, 238 | "Sulawesi montane rain forests": 236, 239 | "Sulu Archipelago rain 
forests": 237, 240 | "Sumatran freshwater swamp forests": 238, 241 | "Sumatran lowland rain forests": 239, 242 | "Sumatran montane rain forests": 240, 243 | "Sumatran peat swamp forests": 241, 244 | "Sundaland heath forests": 242, 245 | "Sundarbans freshwater swamp forests": 243, 246 | "Taiwan subtropical evergreen forests": 244, 247 | "Talamancan montane forests": 245, 248 | "Tapaj\u00f3s-Xingu moist forests": 246, 249 | "Tenasserim-South Thailand semi-evergreen rain forests": 247, 250 | "Tongan tropical moist forests": 248, 251 | "Tonle Sap-Mekong peat swamp forests": 249, 252 | "Tonle Sap freshwater swamp forests": 250, 253 | "Trinidad and Tobago moist forest": 251, 254 | "Trobriand Islands rain forests": 252, 255 | "Tuamotu tropical moist forests": 253, 256 | "Tubuai tropical moist forests": 254, 257 | "Uatum\u00e3-Trombetas moist forests": 255, 258 | "Ucayali moist forests": 256, 259 | "Upper Gangetic Plains moist deciduous forests": 257, 260 | "Vanuatu rain forests": 258, 261 | "Venezuelan Andes montane forests": 259, 262 | "Veracruz moist forests": 260, 263 | "Veracruz montane forests": 261, 264 | "Vogelkop-Aru lowland rain forests": 262, 265 | "Vogelkop montane rain forests": 263, 266 | "Western Congolian swamp forests": 264, 267 | "Western Ecuador moist forests": 265, 268 | "Western Guinean lowland forests": 266, 269 | "Western Java montane rain forests": 267, 270 | "Western Java rain forests": 268, 271 | "Western Polynesian tropical moist forests": 269, 272 | "Windward Islands moist forests": 270, 273 | "Xingu-Tocantins-Araguaia moist forests": 271, 274 | "Yapen rain forests": 272, 275 | "Yucat\u00e1n moist forests": 273, 276 | "Yunnan Plateau subtropical evergreen forests": 274, 277 | "Tocantins/Pindare moist forests": 275, 278 | "Trindade-Martin Vaz Islands tropical forests": 276, 279 | "Sulawesi lowland rain forests": 277, 280 | "East African montane forests": 278, 281 | "Eastern Arc forests": 279, 282 | "Borneo montane rain forests": 280, 283 | "Aegean and Western Turkey sclerophyllous and mixed forests": 281, 284 | "Albany thickets": 282, 285 | "Anatolian conifer and deciduous mixed forests": 283, 286 | "California coastal sage and chaparral": 284, 287 | "California interior chaparral and woodlands": 285, 288 | "California montane chaparral and woodlands": 286, 289 | "Canary Islands dry woodlands and forests": 287, 290 | "Chilean Matorral": 288, 291 | "Coolgardie woodlands": 289, 292 | "Corsican montane broadleaf and mixed forests": 290, 293 | "Crete Mediterranean forests": 291, 294 | "Cyprus Mediterranean forests": 292, 295 | "Eastern Mediterranean conifer-broadleaf forests": 293, 296 | "Esperance mallee": 294, 297 | "Eyre and York mallee": 295, 298 | "Flinders-Lofty montane woodlands": 296, 299 | "Fynbos shrubland": 297, 300 | "Hampton mallee and woodlands": 298, 301 | "Iberian conifer forests": 299, 302 | "Iberian sclerophyllous and semi-deciduous forests": 300, 303 | "Illyrian deciduous forests": 301, 304 | "Italian sclerophyllous and semi-deciduous forests": 302, 305 | "Jarrah-Karri forest and shrublands": 303, 306 | "Mediterranean Acacia-Argania dry woodlands and succulent thickets": 304, 307 | "Mediterranean dry woodlands and steppe": 305, 308 | "Mediterranean woodlands and forests": 306, 309 | "Murray-Darling woodlands and mallee": 307, 310 | "Naracoorte woodlands": 308, 311 | "Northeast Spain and Southern France Mediterranean forests": 309, 312 | "Northwest Iberian montane forests": 310, 313 | "Pindus Mountains mixed forests": 311, 314 | "Renosterveld shrubland": 
312, 315 | "Santa Lucia Montane Chaparral & Woodlands": 313, 316 | "South Apennine mixed montane forests": 314, 317 | "Southeast Iberian shrubs and woodlands": 315, 318 | "Southern Anatolian montane conifer and deciduous forests": 316, 319 | "Southwest Australia savanna": 317, 320 | "Southwest Australia woodlands": 318, 321 | "Southwest Iberian Mediterranean sclerophyllous and mixed forests": 319, 322 | "Tyrrhenian-Adriatic sclerophyllous and mixed forests": 320, 323 | "Afghan Mountains semi-desert": 321, 324 | "Alashan Plateau semi-desert": 322, 325 | "Aldabra Island xeric scrub": 323, 326 | "Arabian sand desert": 324, 327 | "Araya and Paria xeric scrub": 325, 328 | "Atacama desert": 326, 329 | "Saharan Atlantic coastal desert": 327, 330 | "Azerbaijan shrub desert and steppe": 328, 331 | "Badghyz and Karabil semi-desert": 329, 332 | "Baja California desert": 330, 333 | "Baluchistan xeric woodlands": 331, 334 | "Caribbean shrublands": 332, 335 | "Carnarvon xeric shrublands": 333, 336 | "Caspian lowland desert": 334, 337 | "Central Afghan Mountains xeric woodlands": 335, 338 | "Central Asian northern desert": 336, 339 | "Central Asian riparian woodlands": 337, 340 | "Central Asian southern desert": 338, 341 | "Central Mexican matorral": 339, 342 | "Central Persian desert basins": 340, 343 | "Central Ranges xeric scrub": 341, 344 | "Chihuahuan desert": 342, 345 | "Colorado Plateau shrublands": 343, 346 | "Cuban cactus scrub": 344, 347 | "Deccan thorn scrub forests": 345, 348 | "Djibouti xeric shrublands": 346, 349 | "East Arabian fog shrublands and sand desert": 347, 350 | "East Sahara Desert": 348, 351 | "East Saharan montane xeric woodlands": 349, 352 | "Eastern Gobi desert steppe": 350, 353 | "Eritrean coastal desert": 351, 354 | "Gal\u00e1pagos Islands xeric scrub": 352, 355 | "Gariep Karoo": 353, 356 | "Gibson desert": 354, 357 | "Godavari-Krishna mangroves": 355, 358 | "Gobi Lakes Valley desert steppe": 356, 359 | "Great Basin shrub steppe": 357, 360 | "Great Lakes Basin desert steppe": 358, 361 | "Great Sandy-Tanami desert": 359, 362 | "Great Victoria desert": 360, 363 | "Guajira-Barranquilla xeric scrub": 361, 364 | "Gulf of California xeric scrub": 362, 365 | "Hobyo grasslands and shrublands": 363, 366 | "Indus Valley desert": 364, 367 | "Junggar Basin semi-desert": 365, 368 | "Kalahari xeric savanna": 366, 369 | "Kaokoveld desert": 367, 370 | "Kazakh semi-desert": 368, 371 | "Kopet Dag semi-desert": 369, 372 | "La Costa xeric shrublands": 370, 373 | "Madagascar spiny thickets": 371, 374 | "Madagascar succulent woodlands": 372, 375 | "Malpelo Island xeric scrub": 373, 376 | "Meseta Central matorral": 374, 377 | "Mesopotamian shrub desert": 375, 378 | "Mojave desert": 376, 379 | "Motagua Valley thornscrub": 377, 380 | "Nama Karoo shrublands": 378, 381 | "Namaqualand-Richtersveld steppe": 379, 382 | "Namib Desert": 380, 383 | "Namibian savanna woodlands": 381, 384 | "North Arabian desert": 382, 385 | "North Arabian highland shrublands": 383, 386 | "North Saharan Xeric Steppe and Woodland": 384, 387 | "Somali montane xeric woodlands": 385, 388 | "Aravalli west thorn scrub forests": 386, 389 | "Nullarbor Plains xeric shrublands": 387, 390 | "Paraguan\u00e1 xeric scrub": 388, 391 | "Paropamisus xeric woodlands": 389, 392 | "Pilbara shrublands": 390, 393 | "Qaidam Basin semi-desert": 391, 394 | "Red Sea-Arabian Desert shrublands": 392, 395 | "Red Sea coastal desert": 393, 396 | "Registan-North Pakistan sandy desert": 394, 397 | "San Lucan xeric scrub": 395, 398 | "Sechura desert": 396, 
399 | "Simpson desert": 397, 400 | "Snake-Columbia shrub steppe": 398, 401 | "Socotra Island xeric shrublands": 399, 402 | "Sonoran desert": 400, 403 | "South Iran Nubo-Sindian desert and semi-desert": 401, 404 | "South Sahara desert": 402, 405 | "Southwest Arabian Escarpment shrublands and woodlands": 403, 406 | "Southwest Arabian highland xeric scrub": 404, 407 | "St. Peter and St. Paul Rocks": 405, 408 | "Succulent Karoo xeric shrublands": 406, 409 | "Taklimakan desert": 407, 410 | "Tamaulipan matorral": 408, 411 | "Tamaulipan mezquital": 409, 412 | "Tehuac\u00e1n Valley matorral": 410, 413 | "Thar desert": 411, 414 | "Tibesti-Jebel Uweinat montane xeric woodlands": 412, 415 | "Tirari-Sturt stony desert": 413, 416 | "West Sahara desert": 414, 417 | "West Saharan montane xeric woodlands": 415, 418 | "Western Australian Mulga shrublands": 416, 419 | "Wyoming Basin shrub steppe": 417, 420 | "Southwest Arabian coastal xeric shrublands": 418, 421 | "Arabian-Persian Gulf coastal plain desert": 419, 422 | "South Arabian plains and plateau desert": 420, 423 | "Arabian desert": 421, 424 | "Ile Europa and Bassas da India xeric scrub": 422, 425 | "Al-Hajar foothill xeric woodlands and shrublands": 423, 426 | "Al-Hajar montane woodlands and shrublands": 424, 427 | "Alai-Western Tian Shan steppe": 425, 428 | "Altai steppe and semi-desert": 426, 429 | "Amsterdam-Saint Paul Islands temperate grasslands": 427, 430 | "California Central Valley grasslands": 428, 431 | "Canadian Aspen forests and parklands": 429, 432 | "Canterbury-Otago tussock grasslands": 430, 433 | "Central Anatolian steppe": 431, 434 | "Central-Southern US mixed grasslands": 432, 435 | "Central US forest-grasslands transition": 433, 436 | "Central Tallgrass prairie": 434, 437 | "Cross-Timbers savanna-woodland": 435, 438 | "Daurian forest steppe": 436, 439 | "Eastern Anatolian montane steppe": 437, 440 | "Eastern Australia mulga shrublands": 438, 441 | "Edwards Plateau savanna": 439, 442 | "Emin Valley steppe": 440, 443 | "Espinal": 441, 444 | "Faroe Islands boreal grasslands": 442, 445 | "Flint Hills tallgrass prairie": 443, 446 | "Gissaro-Alai open woodlands": 444, 447 | "Humid Pampas": 445, 448 | "Kazakh forest steppe": 446, 449 | "Kazakh steppe": 447, 450 | "Kazakh upland steppe": 448, 451 | "Low Monte": 449, 452 | "Mid-Atlantic US coastal savannas": 450, 453 | "Mongolian-Manchurian grassland": 451, 454 | "Montana Valley and Foothill grasslands": 452, 455 | "Nebraska Sand Hills mixed grasslands": 453, 456 | "Northern Shortgrass prairie": 454, 457 | "Northern Tallgrass prairie": 455, 458 | "Palouse prairie": 456, 459 | "Patagonian steppe": 457, 460 | "Pontic steppe": 458, 461 | "Sayan Intermontane steppe": 459, 462 | "Selenge-Orkhon forest steppe": 460, 463 | "South Siberian forest steppe": 461, 464 | "Southeast Australia temperate savanna": 462, 465 | "Southeast US mixed woodlands and savannas": 463, 466 | "Southeast US conifer savannas": 464, 467 | "Syrian xeric grasslands and shrublands": 465, 468 | "Texas blackland prairies": 466, 469 | "Tian Shan foothill arid steppe": 467, 470 | "Tristan Da Cunha-Gough Islands shrub and grasslands": 468, 471 | "Western shortgrass prairie": 469, 472 | "Willamette Valley oak savanna": 470, 473 | "Alaska Peninsula montane taiga": 471, 474 | "Central Canadian Shield forests": 472, 475 | "Cook Inlet taiga": 473, 476 | "Copper Plateau taiga": 474, 477 | "East Siberian taiga": 475, 478 | "Eastern Canadian forests": 476, 479 | "Eastern Canadian Shield taiga": 477, 480 | "Iceland boreal birch forests 
and alpine tundra": 478, 481 | "Interior Alaska-Yukon lowland taiga": 479, 482 | "Kamchatka-Kurile meadows and sparse forests": 480, 483 | "Kamchatka taiga": 481, 484 | "Mid-Canada Boreal Plains forests": 482, 485 | "Midwest Canadian Shield forests": 483, 486 | "Muskwa-Slave Lake taiga": 484, 487 | "Northeast Siberian taiga": 485, 488 | "Northern Canadian Shield taiga": 486, 489 | "Okhotsk-Manchurian taiga": 487, 490 | "Sakhalin Island taiga": 488, 491 | "Scandinavian and Russian taiga": 489, 492 | "Southern Hudson Bay taiga": 490, 493 | "Trans-Baikal conifer forests": 491, 494 | "Urals montane forest and taiga": 492, 495 | "Watson Highlands taiga": 493, 496 | "West Siberian taiga": 494, 497 | "Northern Cordillera forests": 495, 498 | "Northwest Territories taiga": 496, 499 | "Alberta-British Columbia foothills forests": 497, 500 | "Alps conifer and mixed forests": 498, 501 | "Altai montane forest and forest steppe": 499, 502 | "Arizona Mountains forests": 500, 503 | "Atlantic coastal pine barrens": 501, 504 | "Blue Mountains forests": 502, 505 | "British Columbia coastal conifer forests": 503, 506 | "Caledon conifer forests": 504, 507 | "Carpathian montane forests": 505, 508 | "Central-Southern Cascades Forests": 506, 509 | "Central British Columbia Mountain forests": 507, 510 | "Central Pacific Northwest coastal forests": 508, 511 | "Colorado Rockies forests": 509, 512 | "Da Hinggan-Dzhagdy Mountains conifer forests": 510, 513 | "East Afghan montane conifer forests": 511, 514 | "Eastern Cascades forests": 512, 515 | "Eastern Himalayan subalpine conifer forests": 513, 516 | "Elburz Range forest steppe": 514, 517 | "Fraser Plateau and Basin conifer forests": 515, 518 | "Great Basin montane forests": 516, 519 | "Helanshan montane conifer forests": 517, 520 | "Hengduan Mountains subalpine conifer forests": 518, 521 | "Hokkaido montane conifer forests": 519, 522 | "Honshu alpine conifer forests": 520, 523 | "Khangai Mountains conifer forests": 521, 524 | "Klamath-Siskiyou forests": 522, 525 | "Mediterranean conifer and mixed forests": 523, 526 | "Northeast Himalayan subalpine conifer forests": 524, 527 | "Northern Anatolian conifer and deciduous forests": 525, 528 | "Northern California coastal forests": 526, 529 | "Nujiang Langcang Gorge alpine conifer and mixed forests": 527, 530 | "Okanogan dry forests": 528, 531 | "Piney Woods": 529, 532 | "Puget lowland forests": 530, 533 | "Qilian Mountains conifer forests": 531, 534 | "Qionglai-Minshan conifer forests": 532, 535 | "Sayan montane conifer forests": 533, 536 | "Scandinavian coastal conifer forests": 534, 537 | "Sierra Nevada forests": 535, 538 | "South Central Rockies forests": 536, 539 | "Tian Shan montane conifer forests": 537, 540 | "Wasatch and Uinta montane forests": 538, 541 | "Western Himalayan subalpine conifer forests": 539, 542 | "Queen Charlotte Islands conifer forests": 540, 543 | "Northern Pacific Alaskan coastal forests": 541, 544 | "North Cascades conifer forests": 542, 545 | "Northern Rockies conifer forests": 543, 546 | "Allegheny Highlands forests": 544, 547 | "Appalachian-Blue Ridge forests": 545, 548 | "Appalachian mixed mesophytic forests": 546, 549 | "Appalachian Piedmont forests": 547, 550 | "Appenine deciduous montane forests": 548, 551 | "European Atlantic mixed forests": 549, 552 | "Azores temperate mixed forests": 550, 553 | "Balkan mixed forests": 551, 554 | "Baltic mixed forests": 552, 555 | "Cantabrian mixed forests": 553, 556 | "Caspian Hyrcanian mixed forests": 554, 557 | "Caucasus mixed forests": 555, 558 
| "Celtic broadleaf forests": 556, 559 | "Central Anatolian steppe and woodlands": 557, 560 | "Central China Loess Plateau mixed forests": 558, 561 | "Central European mixed forests": 559, 562 | "Central Korean deciduous forests": 560, 563 | "Changbai Mountains mixed forests": 561, 564 | "Changjiang Plain evergreen forests": 562, 565 | "Chatham Island temperate forests": 563, 566 | "Crimean Submediterranean forest complex": 564, 567 | "Daba Mountains evergreen forests": 565, 568 | "Dinaric Mountains mixed forests": 566, 569 | "East Central Texas forests": 567, 570 | "East European forest steppe": 568, 571 | "Eastern Anatolian deciduous forests": 569, 572 | "Eastern Australian temperate forests": 570, 573 | "Eastern Canadian Forest-Boreal transition": 571, 574 | "Eastern Great Lakes lowland forests": 572, 575 | "Eastern Himalayan broadleaf forests": 573, 576 | "English Lowlands beech forests": 574, 577 | "Euxine-Colchic broadleaf forests": 575, 578 | "Fiordland temperate forests": 576, 579 | "Gulf of St. Lawrence lowland forests": 577, 580 | "Hokkaido deciduous forests": 578, 581 | "Huang He Plain mixed forests": 579, 582 | "Interior Plateau US Hardwood Forests": 580, 583 | "Juan Fern\u00e1ndez Islands temperate forests": 581, 584 | "Madeira evergreen forests": 582, 585 | "Magellanic subpolar forests": 583, 586 | "Manchurian mixed forests": 584, 587 | "Mississippi lowland forests": 585, 588 | "Nelson Coast temperate forests": 586, 589 | "New England-Acadian forests": 587, 590 | "Nihonkai evergreen forests": 588, 591 | "Nihonkai montane deciduous forests": 589, 592 | "New Zealand North Island temperate forests": 590, 593 | "Northeast China Plain deciduous forests": 591, 594 | "Northeast US Coastal forests": 592, 595 | "Northern Triangle temperate forests": 593, 596 | "Northland temperate kauri forests": 594, 597 | "Ozark Highlands mixed forests": 595, 598 | "Ozark Mountain forests": 596, 599 | "Pannonian mixed forests": 597, 600 | "Po Basin mixed forests": 598, 601 | "Pyrenees conifer and mixed forests": 599, 602 | "Qin Ling Mountains deciduous forests": 600, 603 | "Rakiura Island temperate forests": 601, 604 | "Richmond temperate forests": 602, 605 | "Rodope montane mixed forests": 603, 606 | "San F\u00e9lix-San Ambrosio Islands temperate forests": 604, 607 | "Sarmatic mixed forests": 605, 608 | "Sichuan Basin evergreen broadleaf forests": 606, 609 | "New Zealand South Island temperate forests": 607, 610 | "Southeast Australia temperate forests": 608, 611 | "Southern Great Lakes forests": 609, 612 | "Southern Korea evergreen forests": 610, 613 | "Taiheiyo evergreen forests": 611, 614 | "Taiheiyo montane deciduous forests": 612, 615 | "Tarim Basin deciduous forests and steppe": 613, 616 | "Tasmanian Central Highland forests": 614, 617 | "Tasmanian temperate forests": 615, 618 | "Tasmanian temperate rain forests": 616, 619 | "Upper Midwest US forest-savanna transition": 617, 620 | "Ussuri broadleaf and mixed forests": 618, 621 | "Valdivian temperate forests": 619, 622 | "Western European broadleaf forests": 620, 623 | "Western Great Lakes forests": 621, 624 | "Western Himalayan broadleaf forests": 622, 625 | "Western Siberian hemiboreal forests": 623, 626 | "Westland temperate forests": 624, 627 | "Zagros Mountains forest steppe": 625, 628 | "North Atlantic moist mixed forests": 626, 629 | "Altai alpine meadow and tundra": 627, 630 | "Angolan montane forest-grassland": 628, 631 | "Australian Alps montane grasslands": 629, 632 | "Central Andean dry puna": 630, 633 | "Central Andean puna": 631, 
634 | "Central Andean wet puna": 632, 635 | "Papuan Central Range sub-alpine grasslands": 633, 636 | "Central Tibetan Plateau alpine steppe": 634, 637 | "Cordillera Central p\u00e1ramo": 635, 638 | "Cordillera de Merida p\u00e1ramo": 636, 639 | "Eastern Himalayan alpine shrub and meadows": 637, 640 | "Ethiopian montane grasslands and woodlands": 638, 641 | "Ethiopian montane moorlands": 639, 642 | "Ghorat-Hazarajat alpine meadow": 640, 643 | "High Monte": 641, 644 | "Highveld grasslands": 642, 645 | "Hindu Kush alpine meadow": 643, 646 | "Jos Plateau forest-grassland": 644, 647 | "Karakoram-West Tibetan Plateau alpine steppe": 645, 648 | "Khangai Mountains alpine meadow": 646, 649 | "Kopet Dag woodlands and forest steppe": 647, 650 | "Kuh Rud and Eastern Iran montane woodlands": 648, 651 | "Madagascar ericoid thickets": 649, 652 | "Mediterranean High Atlas juniper steppe": 650, 653 | "Mulanje Montane forest-grassland": 651, 654 | "North Tibetan Plateau-Kunlun Mountains alpine desert": 652, 655 | "Northern Andean p\u00e1ramo": 653, 656 | "Northwestern Himalayan alpine shrub and meadows": 654, 657 | "Nyanga-Chimanimani Montane forest-grassland": 655, 658 | "Ordos Plateau steppe": 656, 659 | "Pamir alpine desert and tundra": 657, 660 | "Qilian Mountains subalpine meadows": 658, 661 | "Rwenzori-Virunga montane moorlands": 659, 662 | "Santa Marta p\u00e1ramo": 660, 663 | "Sayan alpine meadows and tundra": 661, 664 | "New Zealand South Island montane grasslands": 662, 665 | "Southeast Tibet shrublands and meadows": 663, 666 | "Southern Andean steppe": 664, 667 | "Southern Rift Montane forest-grassland": 665, 668 | "Sulaiman Range alpine meadows": 666, 669 | "Tian Shan montane steppe and meadows": 667, 670 | "Tibetan Plateau alpine shrublands and meadows": 668, 671 | "Western Himalayan alpine shrub and meadows": 669, 672 | "Yarlung Zanbo arid steppe": 670, 673 | "East African montane moorlands": 671, 674 | "Kinabalu montane alpine meadows": 672, 675 | "Amazon-Orinoco-Southern Caribbean mangroves": 673, 676 | "Bahamian-Antillean mangroves": 674, 677 | "Central African mangroves": 675, 678 | "East African mangroves": 676, 679 | "Guinean mangroves": 677, 680 | "Indochina mangroves": 678, 681 | "Indus River Delta-Arabian Sea mangroves": 679, 682 | "Madagascar mangroves": 680, 683 | "Mesoamerican Gulf-Caribbean mangroves": 681, 684 | "Myanmar Coast mangroves": 682, 685 | "New Guinea mangroves": 683, 686 | "Northern Mesoamerican Pacific mangroves": 684, 687 | "Red Sea mangroves": 685, 688 | "South American Pacific mangroves": 686, 689 | "Southern Africa mangroves": 687, 690 | "Southern Atlantic Brazilian mangroves": 688, 691 | "Southern Mesoamerican Pacific mangroves": 689, 692 | "Sunda Shelf mangroves": 690, 693 | "Sundarbans mangroves": 691, 694 | "Amur meadow steppe": 692, 695 | "Bohai Sea saline meadow": 693, 696 | "Cuban wetlands": 694, 697 | "East African halophytics": 695, 698 | "Enriquillo wetlands": 696, 699 | "Etosha Pan halophytics": 697, 700 | "Everglades flooded grasslands": 698, 701 | "Guayaquil flooded grasslands": 699, 702 | "Inner Niger Delta flooded savanna": 700, 703 | "Lake Chad flooded savanna": 701, 704 | "Makgadikgadi halophytics": 702, 705 | "Nenjiang River grassland": 703, 706 | "Nile Delta flooded savanna": 704, 707 | "Orinoco wetlands": 705, 708 | "Pantanal": 706, 709 | "Paran\u00e1 flooded savanna": 707, 710 | "Rann of Kutch seasonal salt marsh": 708, 711 | "Saharan halophytics": 709, 712 | "Southern Cone Mesopotamian savanna": 710, 713 | "Sudd flooded grasslands": 711, 714 
| "Suiphun-Khanka meadows and forest meadows": 712, 715 | "Tigris-Euphrates alluvial salt marsh": 713, 716 | "Yellow Sea saline meadow": 714, 717 | "Zambezian coastal flooded savanna": 715, 718 | "Zambezian flooded grasslands": 716, 719 | "Angolan mopane woodlands": 717, 720 | "Angolan scarp savanna and woodlands": 718, 721 | "Angolan wet miombo woodlands": 719, 722 | "Arnhem Land tropical savanna": 720, 723 | "Ascension scrub and grasslands": 721, 724 | "Belizian pine savannas": 722, 725 | "Beni savanna": 723, 726 | "Brigalow tropical savanna": 724, 727 | "Campos Rupestres montane savanna": 725, 728 | "Cape York Peninsula tropical savanna": 726, 729 | "Carpentaria tropical savanna": 727, 730 | "Central bushveld": 728, 731 | "Central Zambezian wet miombo woodlands": 729, 732 | "Cerrado": 730, 733 | "Clipperton Island shrub and grasslands": 731, 734 | "Drakensberg Escarpment savanna and thicket": 732, 735 | "Drakensberg grasslands": 733, 736 | "Dry Chaco": 734, 737 | "East Sudanian savanna": 735, 738 | "Einasleigh upland savanna": 736, 739 | "Guianan savanna": 737, 740 | "Guinean forest-savanna": 738, 741 | "Hawai'i tropical high shrublands": 739, 742 | "Hawai'i tropical low shrublands": 740, 743 | "Horn of Africa xeric bushlands": 741, 744 | "Humid Chaco": 742, 745 | "Itigi-Sumbu thicket": 743, 746 | "Kalahari Acacia woodlands": 744, 747 | "Kimberly tropical savanna": 745, 748 | "Limpopo lowveld": 746, 749 | "Llanos": 747, 750 | "Mandara Plateau woodlands": 748, 751 | "Masai xeric grasslands and shrublands": 749, 752 | "Miskito pine forests": 750, 753 | "Mitchell Grass Downs": 751, 754 | "Northern Acacia-Commiphora bushlands and thickets": 752, 755 | "Northern Congolian Forest-Savanna": 753, 756 | "Northwest Hawai'i scrub": 754, 757 | "Sahelian Acacia savanna": 755, 758 | "Serengeti volcanic grasslands": 756, 759 | "Somali Acacia-Commiphora bushlands and thickets": 757, 760 | "Southern Acacia-Commiphora bushlands and thickets": 758, 761 | "Southern Congolian forest-savanna": 759, 762 | "Terai-Duar savanna and grasslands": 760, 763 | "Trans Fly savanna and grasslands": 761, 764 | "Uruguayan savanna": 762, 765 | "Victoria Plains tropical savanna": 763, 766 | "West Sudanian savanna": 764, 767 | "Western Gulf coastal grasslands": 765, 768 | "Zambezian-Limpopo mixed woodlands": 766, 769 | "Zambezian Baikiaea woodlands": 767, 770 | "Dry miombo woodlands": 768, 771 | "Zambezian mopane woodlands": 769, 772 | "St. 
Helena scrub and woodlands": 770, 773 | "Victoria Basin forest-savanna": 771, 774 | "Western Congolian forest-savanna": 772, 775 | "Southwest Arabian montane woodlands and grasslands": 773, 776 | "South Arabian fog woodlands, shrublands, and dune": 774, 777 | "Apure-Villavicencio dry forests": 775, 778 | "Baj\u00edo dry forests": 776, 779 | "Balsas dry forests": 777, 780 | "Bolivian montane dry forests": 778, 781 | "Caatinga": 779, 782 | "Cape Verde Islands dry forests": 780, 783 | "Cauca Valley dry forests": 781, 784 | "Central American dry forests": 782, 785 | "Central Deccan Plateau dry deciduous forests": 783, 786 | "Central Indochina dry forests": 784, 787 | "Chhota-Nagpur dry deciduous forests": 785, 788 | "Chiapas Depression dry forests": 786, 789 | "Chiquitano dry forests": 787, 790 | "Cuban dry forests": 788, 791 | "East Deccan dry-evergreen forests": 789, 792 | "Ecuadorian dry forests": 790, 793 | "Fiji tropical dry forests": 791, 794 | "Hawai'i tropical dry forests": 792, 795 | "Hispaniolan dry forests": 793, 796 | "Irrawaddy dry forests": 794, 797 | "Islas Revillagigedo dry forests": 795, 798 | "Jalisco dry forests": 796, 799 | "Jamaican dry forests": 797, 800 | "Khathiar-Gir dry deciduous forests": 798, 801 | "Lara-Falc\u00f3n dry forests": 799, 802 | "Lesser Sundas deciduous forests": 800, 803 | "Madagascar dry deciduous forests": 801, 804 | "Magdalena Valley dry forests": 802, 805 | "Maracaibo dry forests": 803, 806 | "Maranh\u00e3o Baba\u00e7u forests": 804, 807 | "Marianas tropical dry forests": 805, 808 | "Narmada Valley dry deciduous forests": 806, 809 | "New Caledonia dry forests": 807, 810 | "North Deccan dry deciduous forests": 808, 811 | "Panamanian dry forests": 809, 812 | "Pat\u00eda valley dry forests": 810, 813 | "Puerto Rican dry forests": 811, 814 | "Sierra de la Laguna dry forests": 812, 815 | "Sinaloan dry forests": 813, 816 | "Sin\u00fa Valley dry forests": 814, 817 | "Sonoran-Sinaloan subtropical dry forest": 815, 818 | "South Deccan Plateau dry deciduous forests": 816, 819 | "Southeast Indochina dry evergreen forests": 817, 820 | "Southern Pacific dry forests": 818, 821 | "Southern Vietnam lowland dry forests": 819, 822 | "Sri Lanka dry-zone dry evergreen forests": 820, 823 | "Sumba deciduous forests": 821, 824 | "Timor and Wetar deciduous forests": 822, 825 | "Tumbes-Piura dry forests": 823, 826 | "Veracruz dry forests": 824, 827 | "Yap tropical dry forests": 825, 828 | "Yucat\u00e1n dry forests": 826, 829 | "Zambezian evergreen dry forests": 827, 830 | "Brazilian Atlantic dry forests": 828, 831 | "Trinidad and Tobago dry forest": 829, 832 | "Lesser Antillean dry forests": 830, 833 | "Bermuda subtropical conifer forests": 831, 834 | "Central American pine-oak forests": 832, 835 | "Cuban pine forests": 833, 836 | "Himalayan subtropical pine forests": 834, 837 | "Hispaniolan pine forests": 835, 838 | "Luzon tropical pine forests": 836, 839 | "Northeast India-Myanmar pine forests": 837, 840 | "Sierra de la Laguna pine-oak forests": 838, 841 | "Sierra Madre de Oaxaca pine-oak forests": 839, 842 | "Sierra Madre del Sur pine-oak forests": 840, 843 | "Sierra Madre Occidental pine-oak forests": 841, 844 | "Sierra Madre Oriental pine-oak forests": 842, 845 | "Sumatran tropical pine forests": 843, 846 | "Trans-Mexican Volcanic Belt pine-oak forests": 844, 847 | "Bahamian pineyards": 845 848 | } -------------------------------------------------------------------------------- /stats/realm_stats.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "realm1": { 3 | "Tundra": 36, 4 | "Tropical & Subtropical Moist Broadleaf Forests": 81, 5 | "Mediterranean Forests, Woodlands & Scrub": 5, 6 | "Deserts & Xeric Shrublands": 27, 7 | "Temperate Grasslands, Savannas & Shrublands": 23, 8 | "Boreal Forests/Taiga": 15, 9 | "Temperate Conifer Forests": 24, 10 | "Temperate Broadleaf & Mixed Forests": 21, 11 | "Montane Grasslands & Shrublands": 9, 12 | "Mangroves": 7, 13 | "Flooded Grasslands & Savannas": 8, 14 | "Tropical & Subtropical Grasslands, Savannas & Shrublands": 12, 15 | "Tropical & Subtropical Dry Broadleaf Forests": 33, 16 | "Tropical & Subtropical Coniferous Forests": 11 17 | }, 18 | "realm2": { 19 | "Tundra": 15, 20 | "Tropical & Subtropical Moist Broadleaf Forests": 60, 21 | "Mediterranean Forests, Woodlands & Scrub": 35, 22 | "Deserts & Xeric Shrublands": 70, 23 | "Temperate Grasslands, Savannas & Shrublands": 25, 24 | "Boreal Forests/Taiga": 11, 25 | "Temperate Conifer Forests": 21, 26 | "Temperate Broadleaf & Mixed Forests": 59, 27 | "Montane Grasslands & Shrublands": 36, 28 | "Mangroves": 7, 29 | "Flooded Grasslands & Savannas": 16, 30 | "Tropical & Subtropical Grasslands, Savannas & Shrublands": 42, 31 | "Tropical & Subtropical Dry Broadleaf Forests": 7 32 | } 33 | } -------------------------------------------------------------------------------- /stats/total_area_biome.json: -------------------------------------------------------------------------------- 1 | { 2 | "Deserts & Xeric Shrublands": 26323247.410199884, 3 | "Tropical & Subtropical Grasslands, Savannas & Shrublands": 21391107.38756527, 4 | "Tropical & Subtropical Moist Broadleaf Forests": 19580104.74197682, 5 | "Boreal Forests/Taiga": 15316577.636974096, 6 | "Temperate Broadleaf & Mixed Forests": 12559792.49171971, 7 | "Temperate Grasslands, Savannas & Shrublands": 10571603.918123659, 8 | "Tundra": 8526394.453534998, 9 | "Montane Grasslands & Shrublands": 4872379.374186906, 10 | "Tropical & Subtropical Dry Broadleaf Forests": 3877948.9100976833, 11 | "Temperate Conifer Forests": 3761948.527841549, 12 | "Mediterranean Forests, Woodlands & Scrub": 3296014.700088584, 13 | "Flooded Grasslands & Savannas": 1153800.8216760682, 14 | "Tropical & Subtropical Coniferous Forests": 683071.888304758, 15 | "Mangroves": 332287.848897852 16 | } -------------------------------------------------------------------------------- /tmp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vishalned/MMEarth-data/32fe297d76681cc9b1791239756f93ce027007b0/tmp.py -------------------------------------------------------------------------------- /utils/biome_data_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | File that has code to read the resolve ecoregions geojson file and extract the biome names and eco-regions. 3 | It also writes the biome names to a json file. 
4 | ''' 5 | 6 | import geojson 7 | from tqdm import tqdm 8 | import json 9 | 10 | def read_geojson(filename): 11 | with open(filename) as f: 12 | gj = geojson.load(f) 13 | return gj 14 | 15 | def get_biome_data(): 16 | path = "../datasets/RESOLVE_ecoregions.geojson" 17 | gj = read_geojson(path) 18 | 19 | data = gj['features'][0]['properties'] 20 | 21 | # getting all the biome names, and the list of eco_regions 22 | print('List of keys: ', data.keys()) 23 | 24 | biome_names = {} 25 | 26 | eco_count = 0 27 | for i in tqdm(range(len(gj['features']))): 28 | if gj['features'][i]['properties']['BIOME_NAME'] not in biome_names.keys(): 29 | biome_names[gj['features'][i]['properties']['BIOME_NAME']] = [] 30 | 31 | biome_names[gj['features'][i]['properties']['BIOME_NAME']].append([gj['features'][i]['properties']['ECO_NAME'], gj['features'][i]['properties']['REALM']]) 32 | eco_count += 1 33 | 34 | print('Total number of biomes', len(biome_names.keys())) 35 | print('Total number of eco-regions', eco_count) 36 | # writing the biome names to a json file 37 | import json 38 | 39 | with open('stats/biome_names.json', 'w') as fp: 40 | json.dump(biome_names, fp) 41 | 42 | 43 | 44 | if __name__ == '__main__': 45 | import os 46 | os.makedirs('stats', exist_ok=True) 47 | get_biome_data() 48 | -------------------------------------------------------------------------------- /utils/chunking_h5.py: -------------------------------------------------------------------------------- 1 | # take an existing h5 file, and create a new one with the same data but by using chunks. 2 | 3 | import h5py 4 | import numpy as np 5 | import os 6 | import argparse 7 | 8 | 9 | def create_h5_file_with_chunks(h5_file_path = '', chunk_size = 1): 10 | """ 11 | Create a new h5 file with the same data as the original file, but with the specified chunk size. 12 | :param h5_file_path: path to the original h5 file. 13 | :param chunk_size: chunk size to use. 
14 | :return: None 15 | """ 16 | # open the original file 17 | h5 = h5py.File(h5_file_path, 'r') 18 | keys = list(h5.keys()) 19 | # create a new file 20 | name = h5_file_path.split('/')[-1][:-3] 21 | new_h5_file_path = os.path.join(os.path.dirname(h5_file_path), name + '_chunked_gzip.h5') 22 | print(new_h5_file_path) 23 | if os.path.exists(new_h5_file_path): 24 | os.remove(new_h5_file_path) 25 | 26 | new_h5 = h5py.File(new_h5_file_path, 'w') 27 | # the dataset is too big to load into memory, so we will iterate over the keys 28 | meta = h5['metadata'] 29 | num_samples = meta.shape[0] 30 | for key in keys: 31 | print('creating dataset for key: ', key) 32 | shape = h5[key].shape 33 | tmp = new_h5.create_dataset(key, shape = h5[key].shape, dtype = h5[key].dtype, chunks = (chunk_size, *shape[1:]), compression = 'gzip') 34 | 35 | # iterate over the samples 36 | for i in range(num_samples): 37 | if i % 1000 == 0: 38 | print(key, i) 39 | # get the sample 40 | sample = h5[key][i] 41 | # write the sample to the new file 42 | tmp[i] = sample 43 | 44 | 45 | 46 | 47 | # close the files 48 | h5.close() 49 | new_h5.close() 50 | 51 | 52 | if __name__ == '__main__': 53 | 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('--h5_file_path', type=str, help='path to the h5 file', required=True) 56 | parser.add_argument('--chunk_size', type=int, help='chunk size to use', default=1) 57 | args = parser.parse_args() 58 | 59 | create_h5_file_with_chunks(h5_file_path = args.h5_file_path, chunk_size = args.chunk_size) -------------------------------------------------------------------------------- /utils/convert_to_h5.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A python script to do the pre-processing and convert the data to h5 format. It will then be ready to be used by the dataloader for training. 
3 | ''' 4 | 5 | 6 | import argparse 7 | import os 8 | import h5py 9 | import numpy as np 10 | import json 11 | import tifffile as tiff 12 | 13 | 14 | 15 | 16 | MODALITIES = { 17 | 'sentinel2': {'dtype': 'uint16', 'n_bands': 13, 'bands': ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8A', 'B8', 'B9', 'B10', 'B11', 'B12']}, 18 | 'sentinel2_cloudmask': {'dtype': 'uint16', 'n_bands': 1, 'bands': ['QA60']}, 19 | 'sentinel2_cloudprod': {'dtype': 'uint16', 'n_bands': 1, 'bands': ['MSK_CLDPRB']}, 20 | 'sentinel2_scl': {'dtype': 'uint8', 'n_bands': 1, 'bands': ['SCL']}, 21 | 'sentinel1': {'dtype': 'float32', 'n_bands': 8, 'bands': ['asc_VV', 'asc_VH', 'asc_HH', 'asc_HV', 'desc_VV', 'desc_VH', 'desc_HH', 'desc_HV']}, 22 | 'aster': {'dtype': 'int16', 'n_bands': 2, 'bands': ['elevation', 'slope']}, 23 | 'era5': {'dtype': 'float32', 'n_bands': 12, 'bands': ['prev_month_avg_temp', 'prev_month_min_temp', 'prev_month_max_temp', 'prev_month_total_precip', 'curr_month_avg_temp', 'curr_month_min_temp', 'curr_month_max_temp', 'curr_month_total_precip', 'year_avg_temp', 'year_min_temp', 'year_max_temp', 'year_total_precip']}, 24 | 'dynamic_world': {'dtype': 'uint8', 'n_bands': 1, 'bands': ['landcover']}, 25 | 'canopy_height_eth': {'dtype': 'int8', 'n_bands': 2, 'bands': ['height', 'std']}, 26 | 'lat': {'dtype': 'float32', 'n_bands': 2, 'bands': ['sin', 'cos']}, 27 | 'lon': {'dtype': 'float32', 'n_bands': 2, 'bands': ['sin', 'cos']}, 28 | 'biome': {'dtype': 'uint8', 'n_bands': 1}, 29 | 'eco_region': {'dtype': 'uint16', 'n_bands': 1}, 30 | 'month': {'dtype': 'float32', 'n_bands': 2, 'bands': ['sin', 'cos']}, 31 | 'esa_worldcover':{ 'dtype': 'uint8', 'n_bands': 1, 'bands': ['map']} 32 | } 33 | 34 | sentinel2_bands = [ 35 | 'B1', 36 | 'B2', 37 | 'B3', 38 | 'B4', 39 | 'B5', 40 | 'B6', 41 | 'B7', 42 | 'B8A', 43 | 'B8', 44 | 'B9', 45 | 'B10', 46 | 'B11', 47 | 'B12', 48 | ] 49 | 50 | # MODALITIES_IN_IMAGE = [ 51 | # 'sentinel2', 52 | # 'sentinel1_asc', 53 | # 'sentinel1_desc', 54 | # 'aster', 55 | # 'dynamic_world', 56 | # 'canopy_height_eth', 57 | # 'esa_worldcover' 58 | 59 | # ] 60 | 61 | variables = {} 62 | remove = [] 63 | 64 | 65 | 66 | def read_data(tile_id, tile_info, data_dir, img_size, exisiting_datasets=None): 67 | tile_info_bands = tile_info['BANDS'] 68 | type = tile_info['S2_type'] 69 | try: 70 | data = tiff.imread(os.path.join(data_dir, tile_id + '.tif')) 71 | except: 72 | return None 73 | return_data_dict = {} 74 | count = 0 # this represents the count of the bands, we use this since all the bands are stacked in the same order 75 | 76 | # creating a center crop of size img_size 77 | start_x = (data.shape[0] - img_size) // 2 78 | start_y = (data.shape[1] - img_size) // 2 79 | if len(data.shape) == 2: 80 | data = np.expand_dims(data, axis=2) 81 | data = data[start_x:start_x + img_size, start_y:start_y + img_size, :] 82 | for modality, modality_info in MODALITIES.items(): 83 | if exisiting_datasets is not None and modality not in exisiting_datasets: 84 | continue 85 | 86 | 87 | ### SENTINEL 2 ### 88 | if modality == 'sentinel2': 89 | placeholder = np.zeros((13, img_size, img_size), dtype='uint16') 90 | count += len(tile_info_bands['sentinel2']) - 3 if type == 'l2a' else len(tile_info_bands['sentinel2']) - 1 91 | for i, band in enumerate(sentinel2_bands): 92 | if band in tile_info_bands['sentinel2']: 93 | placeholder[i] = np.expand_dims(data[:, :, tile_info_bands['sentinel2'].index(band)], 2).transpose(2, 0, 1) 94 | return_data_dict['sentinel2'] = placeholder 95 | 96 | if modality == 'sentinel2_cloudmask': 97 
| if 'QA60' in tile_info_bands['sentinel2']: 98 | return_data_dict['sentinel2_cloudmask'] = np.expand_dims(data[:, :, tile_info_bands['sentinel2'].index('QA60')], 2).transpose(2, 0, 1) 99 | count += 1 100 | else: 101 | return_data_dict['sentinel2_cloudmask'] = np.ones((1, img_size, img_size), dtype='uint16') * 65535 102 | 103 | if modality == 'sentinel2_cloudprod': 104 | 105 | if 'MSK_CLDPRB' in tile_info_bands['sentinel2']: 106 | count += 1 107 | return_data_dict['sentinel2_cloudprod'] = np.expand_dims(data[:, :, tile_info_bands['sentinel2'].index('MSK_CLDPRB')], 2).transpose(2, 0, 1) 108 | else: 109 | return_data_dict['sentinel2_cloudprod'] = np.ones((1, img_size, img_size), dtype='uint16') * 65535 110 | 111 | 112 | if modality == 'sentinel2_scl': 113 | if 'SCL' in tile_info_bands['sentinel2']: 114 | count += 1 115 | return_data_dict['sentinel2_scl'] = np.expand_dims(data[:, :, tile_info_bands['sentinel2'].index('SCL')], 2).transpose(2, 0, 1) 116 | else: 117 | return_data_dict['sentinel2_scl'] = np.ones((1, img_size, img_size), dtype='uint8') * 255 118 | 119 | 120 | 121 | ### SENTINEL 1 ### 122 | if modality == 'sentinel1': 123 | bands_map = {'VV': 0, 'VH': 1, 'HH': 2, 'HV': 3} 124 | orbit_map = {'asc': 0, 'desc': 4} 125 | tmp = np.ones((8, img_size, img_size), dtype='float32') * float('-inf') 126 | for orbit in ['asc', 'desc']: 127 | if tile_info_bands[f'sentinel1_{orbit}'] is not None: 128 | bands_img = tile_info_bands[f'sentinel1_{orbit}'] 129 | count += len(bands_img) 130 | for i, band in enumerate(bands_img): 131 | tmp[orbit_map[orbit] + bands_map[band]] = data[:, :, count - len(bands_img) + i] 132 | 133 | return_data_dict['sentinel1'] = tmp 134 | 135 | ### ASTER ### 136 | if modality == 'aster': 137 | count += len(tile_info_bands['aster']) 138 | return_data_dict['aster'] = data[:, :, count - len(tile_info_bands['aster']):count].transpose(2, 0, 1) 139 | 140 | ### ERA5 ### 141 | if modality == 'era5': 142 | 143 | era_data = tile_info['era5'] 144 | 145 | return_data_dict['era5'] = np.stack([era_data['month1'] + era_data['month2'] + era_data['year']], axis=0).astype('float32') 146 | 147 | ### DYNAMIC WORLD ### 148 | if modality == 'dynamic_world': 149 | try: 150 | count += len(tile_info_bands['dynamic_world']) 151 | return_data_dict['dynamic_world'] = np.expand_dims(data[:, :, count - len(tile_info_bands['dynamic_world'])], axis = 0) 152 | except Exception as e: 153 | print('dynamic_world not found') 154 | return_data_dict['dynamic_world'] = np.zeros((1, img_size, img_size), dtype='uint8') 155 | 156 | 157 | ### CANOPY HEIGHT ### 158 | if modality == 'canopy_height_eth': 159 | count += len(tile_info_bands['canopy_height_eth']) 160 | tmp = data[:, :, count - len(tile_info_bands['canopy_height_eth']):count].transpose(2, 0, 1) 161 | if tmp.shape[0] == 0: 162 | return_data_dict['canopy_height_eth'] = np.ones((2, img_size, img_size), dtype='int8') * 255 163 | else: 164 | return_data_dict['canopy_height_eth'] = data[:, :, count - len(tile_info_bands['canopy_height_eth']):count].transpose(2, 0, 1) 165 | 166 | 167 | 168 | ### LATITUDE ### 169 | if modality == 'lat': 170 | return_data_dict['lat'] = np.stack([np.sin(np.deg2rad(tile_info['lat'])), np.cos(np.deg2rad(tile_info['lat']))], axis=0).astype('float32') 171 | 172 | 173 | ### LONGITUDE ### 174 | if modality == 'lon': 175 | return_data_dict['lon'] = np.stack([np.sin(np.deg2rad(tile_info['lon'])), np.cos(np.deg2rad(tile_info['lon']))], axis=0).astype('float32') 176 | 177 | ### BIOME ### 178 | if modality == 'biome': 179 | biome = 
tile_info['biome'] 180 | one_hot = np.zeros(14) 181 | one_hot[biome] = 1 182 | return_data_dict['biome'] = one_hot.astype('uint8') 183 | 184 | ### ECO-REGION ### 185 | if modality == 'eco_region': 186 | eco_region = tile_info['eco_region'] 187 | one_hot = np.zeros(846) 188 | one_hot[eco_region] = 1 189 | return_data_dict['eco_region'] = one_hot.astype('uint16') 190 | 191 | ### MONTH ### 192 | if modality == 'month': 193 | month = tile_info['S2_DATE'].split('-')[1] 194 | month = int(month) 195 | return_data_dict['month'] = np.stack([np.sin(2 * np.pi * month / 12), np.cos(2 * np.pi * month / 12)], axis=0).astype('float32') 196 | 197 | ## ESA WORLD COVER ### 198 | if modality == 'esa_worldcover': 199 | try: 200 | count += len(tile_info_bands['esa_worldcover']) 201 | return_data_dict['esa_worldcover'] = np.expand_dims(data[:, :, count - len(tile_info_bands['esa_worldcover'])], axis = 0) 202 | 203 | except Exception as e: 204 | print('esa_worldcover not found') 205 | print('exception: ',e) 206 | exit() 207 | return_data_dict['esa_worldcover'] = np.ones((1, img_size, img_size), dtype='uint8') * 255 208 | 209 | return return_data_dict 210 | 211 | 212 | def main(args): 213 | 214 | mode = args.mode 215 | print('Mode: ', mode) 216 | if mode == 'create': 217 | data_dir = args.data_dir 218 | data_dir = os.path.join(data_dir, 'merged') 219 | img_size = args.image_size 220 | 221 | # we first check if the output file already exists, if it does, we delete it 222 | if os.path.exists(args.output_file): 223 | os.remove(args.output_file) 224 | 225 | hdf5_file = h5py.File(args.output_file, 'a') 226 | 227 | num_tiles = len(os.listdir(data_dir)) 228 | 229 | tile_info = json.load(open(args.tile_info, 'r')) 230 | 231 | 232 | # we calculate the real number of tiles by verifying how many tiles 233 | # have number of bands equal to the sum of the bands in the tile_info.json file 234 | 235 | num_tiles = 0 236 | for i, tile_id in enumerate(tile_info): 237 | count_t = 0 238 | for b in ['sentinel2', 'sentinel1_asc', 'sentinel1_desc', 'aster', 'dynamic_world', 'canopy_height_eth', 'esa_worldcover']: 239 | if b in tile_info[tile_id]['BANDS']: 240 | count_t += len(tile_info[tile_id]['BANDS'][b]) if tile_info[tile_id]['BANDS'][b] is not None else 0 241 | 242 | try: 243 | data = tiff.imread(os.path.join(data_dir, tile_id + '.tif')) 244 | except: 245 | # sometimes the data is not downloaded, so we skip it 246 | continue 247 | if data.shape[-1] == count_t: 248 | num_tiles += 1 249 | else: 250 | # we store the lat lon in a csv file to check the tiles that have not been downloaded 251 | print('Tile shape mismatch: ', tile_id) 252 | lat, lon = tile_info[tile_id]['lat'], tile_info[tile_id]['lon'] 253 | # with open(args.missing_tiles, 'a') as f: 254 | # f.write(f'{tile_id},{lat},{lon}\n') 255 | 256 | # num_tiles = len(tile_info) 257 | print('Number of tiles: ', num_tiles) 258 | print('Number of entries in tile_info: ', len(tile_info)) 259 | print('Number of tiles skipped due to mismatch: ', len(tile_info) - num_tiles) 260 | # exit() 261 | 262 | 263 | 264 | # creating a dataset for each modality 265 | for modality, modality_info in MODALITIES.items(): 266 | if modality == 'lat' or modality == 'lon' or modality == 'month': 267 | variables[modality] = hdf5_file.create_dataset(modality, shape=(num_tiles, 2), dtype=modality_info['dtype'], compression='gzip', chunks=(1, 2)) 268 | elif modality == 'biome': 269 | variables[modality] = hdf5_file.create_dataset(modality, shape=(num_tiles, 14), dtype=modality_info['dtype'], compression='gzip', 
chunks=(1, 14)) 270 | elif modality == 'eco_region': 271 | variables[modality] = hdf5_file.create_dataset(modality, shape=(num_tiles, 846), dtype=modality_info['dtype'], compression='gzip', chunks=(1, 846)) 272 | elif modality == 'era5': 273 | variables[modality] = hdf5_file.create_dataset(modality, shape=(num_tiles, modality_info['n_bands']), dtype=modality_info['dtype'], compression='gzip', chunks=(1, modality_info['n_bands'])) 274 | else: 275 | variables[modality] = hdf5_file.create_dataset(modality, shape=(num_tiles, modality_info['n_bands'], img_size, img_size), dtype=modality_info['dtype'], compression='gzip', chunks=(1, modality_info['n_bands'], img_size, img_size)) 276 | 277 | 278 | # create a new meta data with tile_id and s2 type which is either l2a or l1c 279 | metadata_dt = np.dtype([('tile_id', 'S100'), ('S2_type', 'S10')]) # string of length 100 280 | ds_metadata = hdf5_file.create_dataset('metadata', shape=(num_tiles,), dtype=metadata_dt, compression='gzip', chunks=(1,)) 281 | 282 | # metadata_dt = np.dtype([('tile_id', 'S100')]) # string of length 100 283 | # ds_metadata = hdf5_file.create_dataset('metadata', shape=(num_tiles,), dtype=metadata_dt, compression='gzip', chunks=(1,)) 284 | 285 | j = 0 286 | count_s = 0 287 | for i, tile_id in enumerate(tile_info): 288 | 289 | print(f'Processing tile {i}/{num_tiles}, {tile_id}') 290 | count_t = 0 291 | 292 | for b in ['sentinel2', 'sentinel1_asc', 'sentinel1_desc', 'aster', 'dynamic_world', 'canopy_height_eth', 'esa_worldcover']: 293 | if b in tile_info[tile_id]['BANDS']: 294 | count_t += len(tile_info[tile_id]['BANDS'][b]) if tile_info[tile_id]['BANDS'][b] is not None else 0 295 | 296 | try: 297 | data = tiff.imread(os.path.join(data_dir, tile_id + '.tif')) 298 | except: 299 | print('Skipping tile: ', tile_id) 300 | continue 301 | 302 | if data.shape[-1] != count_t: 303 | print('mismatch') 304 | count_s += 1 305 | continue 306 | 307 | 308 | data_ = read_data(tile_id, tile_info[tile_id], data_dir, img_size) 309 | if data_ is None: 310 | print('Skipping tile: ', tile_id) 311 | continue 312 | for modality, _ in MODALITIES.items(): 313 | try: 314 | variables[modality][j] = data_[modality] 315 | except Exception as e: 316 | print('Error in modality: ', modality) 317 | print(e) 318 | exit() 319 | # breakpoint() 320 | # ds_metadata[j] = tile_id 321 | ds_metadata[j] = (tile_id, tile_info[tile_id]['S2_type']) 322 | j += 1 323 | # exit() ## testing 324 | hdf5_file.close() 325 | print('Done!') 326 | print('number of tiles: ', num_tiles) 327 | 328 | print('Number of tiles skipped due to mismatch: ', count_s) 329 | else: 330 | # we are now merging 2 h5 files 331 | if os.path.exists(args.output_file): 332 | print('Output file already exists. 
') 333 | 334 | img_size = args.image_size 335 | 336 | file1 = h5py.File(args.path1, 'r') 337 | file2 = h5py.File(args.path2, 'r') 338 | tile_info1 = json.load(open(args.path1_tile_info, 'r')) 339 | tile_info2 = json.load(open(args.path2_tile_info, 'r')) 340 | 341 | size = len(file1['metadata']) + len(file2['metadata']) 342 | 343 | hdf5_file = h5py.File(args.output_path, 'a') 344 | 345 | # creating a dataset for each modality 346 | for modality, modality_info in MODALITIES.items(): 347 | if modality == 'lat' or modality == 'lon' or modality == 'month': 348 | variables[modality] = hdf5_file.create_dataset(modality, shape=(size, 2), dtype=modality_info['dtype']) 349 | elif modality == 'biome': 350 | variables[modality] = hdf5_file.create_dataset(modality, shape=(size, 14), dtype=modality_info['dtype']) 351 | elif modality == 'eco_region': 352 | variables[modality] = hdf5_file.create_dataset(modality, shape=(size, 846), dtype=modality_info['dtype']) 353 | elif modality == 'era5': 354 | variables[modality] = hdf5_file.create_dataset(modality, shape=(size, modality_info['n_bands']), dtype=modality_info['dtype']) 355 | else: 356 | variables[modality] = hdf5_file.create_dataset(modality, shape=(size, modality_info['n_bands'], img_size, img_size), dtype=modality_info['dtype']) 357 | 358 | metadata_dt = np.dtype([('tile_id', 'S100')]) # string of length 100 359 | ds_metadata = hdf5_file.create_dataset('metadata', shape=(size,), dtype=metadata_dt) 360 | 361 | j = 0 362 | for i in range(len(file1['metadata'])): 363 | print(f'Processing tile {i}/{len(file1["metadata"])}') 364 | # we append the data from the first file, and then the data from the second file 365 | for modality, _ in MODALITIES.items(): 366 | variables[modality][j] = file1[modality][i] 367 | ds_metadata[j] = file1['metadata'][i] 368 | j += 1 369 | 370 | for i in range(len(file2['metadata'])): 371 | print(f'Processing tile {i}/{len(file2["metadata"])}') 372 | # we append the data from the first file, and then the data from the second file 373 | for modality, _ in MODALITIES.items(): 374 | variables[modality][j] = file2[modality][i] 375 | ds_metadata[j] = file2['metadata'][i] 376 | j += 1 377 | 378 | hdf5_file.close() 379 | print('Done!') 380 | 381 | # we also need to merge the tile_info files 382 | tile_info = {} 383 | for i in range(len(file1['metadata'])): 384 | tile = file1['metadata'][i][0].decode('utf-8') 385 | tile_info[tile] = tile_info1[tile] 386 | for i in range(len(file2['metadata'])): 387 | tile = file2['metadata'][i][0].decode('utf-8') 388 | tile_info[tile] = tile_info2[tile] 389 | with open(args.output_path.split('.')[0] + '_tile_info.json', 'w') as f: 390 | json.dump(tile_info, f) 391 | print('number of tiles: ', size) 392 | 393 | 394 | if __name__ == '__main__': 395 | parser = argparse.ArgumentParser(description='Convert the data to h5 format') 396 | parser.add_argument('--mode', type=str, required=True, help='append or create', choices= ['merge', 'create']) 397 | 398 | # args for create mode 399 | # required args 400 | parser.add_argument('--data_dir', type=str, default='', help='path to the data directory') 401 | # optional args 402 | parser.add_argument('--tile_info', type=str, default='', help='path to the tile info json file') 403 | parser.add_argument('--output_file', type=str, default='', help='path to the output h5 file') 404 | parser.add_argument('--missing_tiles', type=str, default='', help='path to the csv file containing the missing tiles') 405 | parser.add_argument('--image_size', type=int, default=128, 
help='size of the image') 406 | 407 | 408 | # args for merge mode 409 | # required args 410 | parser.add_argument('--data_dir1', type=str, default='', help='path to the first folder') 411 | parser.add_argument('--data_dir2', type=str, default='', help='path to the first folder') 412 | parser.add_argument('--output_path', type=str, help='path to the output h5 file') 413 | # optional args 414 | parser.add_argument('--path1', type=str, default='', help='path to the first h5 file') 415 | parser.add_argument('--path1_tile_info', type=str, default='', help='path to the tile info json file for the first h5 file') 416 | parser.add_argument('--path2', type=str, default='', help='path to the second h5 file') 417 | parser.add_argument('--path2_tile_info', type=str, default='', help='path to the tile info json file for the second h5 file') 418 | 419 | 420 | args = parser.parse_args() 421 | if args.mode == 'merge': 422 | assert args.output_path != '', 'Please provide the output path' 423 | assert args.data_dir1 != '', 'Please provide the path to the first folder' 424 | assert args.data_dir2 != '', 'Please provide the path to the second folder' 425 | 426 | name1 = args.data_dir1.split('/')[-1] if args.data_dir1[-1] != '/' else args.data_dir1.split('/')[-2] 427 | name2 = args.data_dir2.split('/')[-1] if args.data_dir2[-1] != '/' else args.data_dir2.split('/')[-2] 428 | if args.path1 == '': 429 | args.path1 = os.path.join(args.data_dir1, name1 + '.h5') 430 | if args.path1_tile_info == '': 431 | args.path1_tile_info = os.path.join(args.data_dir1, name1 + '_tile_info.json') 432 | if args.path2 == '': 433 | args.path2 = os.path.join(args.data_dir2, name2 + '.h5') 434 | if args.path2_tile_info == '': 435 | args.path2_tile_info = os.path.join(args.data_dir2, name2 + '_tile_info.json') 436 | 437 | if args.mode == 'create': 438 | assert args.data_dir != '', 'Please provide the path to the data directory' 439 | name = args.data_dir.split('/')[-1] if args.data_dir[-1] != '/' else args.data_dir.split('/')[-2] 440 | if args.tile_info == '': 441 | args.tile_info = os.path.join(args.data_dir, name + '_tile_info.json') 442 | if args.output_file == '': 443 | args.output_file = os.path.join(args.data_dir, name + '.h5') 444 | if args.missing_tiles == '': 445 | args.missing_tiles = os.path.join(args.data_dir, name + 'missing_tiles.csv') 446 | 447 | 448 | 449 | main(args) -------------------------------------------------------------------------------- /utils/normalization.py: -------------------------------------------------------------------------------- 1 | # this function reads the h5 file and computes the mean and std of each band 2 | # including the min and max, and saves it in a json file. 
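# A minimal illustrative sketch (not part of this file) of the accumulation scheme used below:
# per-channel sums S, sums of squares Q and valid-pixel counts n are kept while iterating over a
# subset of images, pixels equal to the no-data value are masked out, and the statistics follow as
# mean = S / n and std = sqrt(Q / n - mean**2). All names here are illustrative only.
import numpy as np

def running_band_stats(images, no_data):
    # images: sequence of (C, H, W) arrays; no_data: value to ignore (cf. NO_DATA_VAL below)
    C = images[0].shape[0]
    S = np.zeros(C, dtype=np.float64)
    Q = np.zeros(C, dtype=np.float64)
    n = np.zeros(C, dtype=np.float64)
    for img in images:
        img = np.float64(img)
        valid = img != no_data
        S += np.sum(img, axis=(1, 2), where=valid)
        Q += np.sum(img ** 2, axis=(1, 2), where=valid)
        n += valid.sum(axis=(1, 2))
    mean = S / n
    std = np.sqrt(Q / n - mean ** 2)
    return mean, std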
3 | 4 | 5 | import os 6 | import json 7 | import numpy as np 8 | import h5py 9 | from math import inf 10 | 11 | 12 | NO_DATA_VAL = { 13 | 'sentinel2': 0, 14 | 'sentinel2_cloudmask': 65535, 15 | 'sentinel2_cloudprod': 65535, 16 | 'sentinel2_scl': 255, 17 | 'sentinel1': float('-inf'), 18 | 'aster': float('-inf'), 19 | 'canopy_height_eth': 255, 20 | 'dynamic_world': 0, 21 | 'esa_worldcover': 255, 22 | 'lat': float('-inf'), 23 | 'lon': float('-inf'), 24 | 'month': float('-inf'), 25 | 'era5': float('inf') 26 | } 27 | 28 | # DATA_PATH = "/home/qbk152/vishal/global-lr/data/data_1M_130/data_1M_130.h5" 29 | # TILE_INFO = "/home/qbk152/vishal/global-lr/data/data_1M_130/data_1M_130_tile_info.json" 30 | # SUBSET_SIZE = 100000 ##### define subset size here, we only compute mean and std for a subset 31 | # STORE_PATH = "/home/qbk152/vishal/global-lr/data/data_1M_130/data_1M_130_band_stats.json" 32 | 33 | # DATA_PATH = "/projects/dereeco/data/global-lr/data_1M_130/data_1M_130.h5" 34 | # TILE_INFO = "/projects/dereeco/data/global-lr/data_1M_130/data_1M_130_tile_info.json" 35 | # SUBSET_SIZE = 100000 ##### define subset size here, we only compute mean and std for a subset 36 | # STORE_PATH = "/projects/dereeco/data/global-lr/data_1M_130/data_1M_130_band_stats.json" 37 | 38 | 39 | 40 | def compute_band_stats(data_folder = '', tile_info = '', store_path = ''): 41 | SUBSET_SIZE = 100000 ##### define subset size here, we only compute mean and std for a subset 42 | 43 | 44 | if data_folder == '': 45 | raise ValueError("Please provide the path to the data folder") 46 | 47 | name = data_folder.split('/')[-1] if data_folder[-1] != '/' else data_folder.split('/')[-2] 48 | data_path = os.path.join(data_folder, name + '.h5') 49 | tile_info = os.path.join(data_folder, name + '_tile_info.json') if tile_info == '' else tile_info 50 | store_path = os.path.join(data_folder, name + '_band_stats.json') if store_path == '' else store_path 51 | 52 | # since the number of images are large, we compute the rolling mean and std 53 | 54 | # read the tile info 55 | with open(tile_info, 'r') as f: 56 | tile_info = json.load(f) 57 | 58 | # read the h5 file 59 | f = h5py.File(data_path, 'r') 60 | 61 | meta = f['metadata'] 62 | bands = list(i for i in f.keys() if i != 'metadata') 63 | print(bands) 64 | 65 | 66 | print("number of images: ", len(meta)) 67 | 68 | return_dict = {} 69 | 70 | 71 | 72 | 73 | for band in bands: 74 | if band in ['lat', 'lon', 'month', 'era5']: 75 | print('computing stats for band: ', band) 76 | if band == 'era5': 77 | num_images = np.count_nonzero(~np.isnan(f[band]), axis=0) 78 | else: 79 | num_images = len(meta) 80 | data = f[band] 81 | mean = np.nansum(data, axis=0) / num_images 82 | std = np.sqrt(np.nansum((data - mean)**2, axis = 0) / num_images) 83 | min_val = np.nanmin(data, axis=0) 84 | max_val = np.nanmax(data, axis=0) 85 | return_dict[band] = { 86 | 'mean': list(mean.astype(float)), 87 | 'std': list(std.astype(float)), 88 | 'min': list(min_val.astype(float)), 89 | 'max': list(max_val.astype(float)) 90 | } 91 | print(return_dict[band]) 92 | continue 93 | 94 | 95 | if band not in ['sentinel2_cloudmask', 'sentinel2_cloudprod', 'sentinel2', 'sentinel1', 'aster', 'canopy_height_eth']: 96 | continue 97 | 98 | 99 | print('computing stats for band: ', band) 100 | num_images = len(meta) 101 | 102 | subset_size = min(SUBSET_SIZE, num_images) 103 | C = f[band].shape[1] 104 | channel_sums = np.zeros(C, dtype=np.float64) 105 | channel_sums_squared = np.zeros(C, dtype=np.float64) 106 | count_ = np.zeros(C, 
dtype=np.float64) 107 | min_val = np.ones(C, dtype=np.float64)*float('inf') 108 | max_val = np.ones(C, dtype=np.float64)*float('-inf') 109 | max_range = 1.7e308 110 | 111 | 112 | inf_values = 0 113 | # set numpy seed 114 | np.random.seed(0) 115 | indices = np.random.randint(0, num_images, size=subset_size) 116 | 117 | if 'sentinel2' in band: 118 | channel_sums_l2a = np.zeros(C, dtype=np.float64) 119 | channel_sums_squared_l2a = np.zeros(C, dtype=np.float64) 120 | count_l2a = np.zeros(C, dtype=np.float64) 121 | channel_sums_l1c = np.zeros(C, dtype=np.float64) 122 | channel_sums_squared_l1c = np.zeros(C, dtype=np.float64) 123 | count_l1c = np.zeros(C, dtype=np.float64) 124 | min_val_l2a = np.ones(C, dtype=np.float64)*float('inf') 125 | max_val_l2a = np.ones(C, dtype=np.float64)*float('-inf') 126 | min_val_l1c = np.ones(C, dtype=np.float64)*float('inf') 127 | max_val_l1c = np.ones(C, dtype=np.float64)*float('-inf') 128 | max_range = 1.7e308 129 | 130 | for idx, i in enumerate(indices): 131 | name = meta[i][0].decode('utf-8') 132 | if tile_info[name]['S2_type'] == "l2a": 133 | image = np.float64(f[band][i]) 134 | channel_sums_l2a += np.sum(image, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 135 | channel_sums_squared_l2a += np.sum(image**2, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 136 | count_l2a += np.sum(image != NO_DATA_VAL[band], axis=(1, 2)) 137 | tmp_img = np.where(image == NO_DATA_VAL[band], np.nan, image) 138 | min_val_l2a = np.nanmin([min_val_l2a, np.nanmin(tmp_img, axis=(1, 2))], axis=0) 139 | max_val_l2a = np.nanmax([max_val_l2a, np.nanmax(tmp_img, axis=(1, 2))], axis=0) 140 | 141 | 142 | 143 | elif tile_info[name]['S2_type'] == "l1c": 144 | image = np.float64(f[band][i]) 145 | channel_sums_l1c += np.sum(image, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 146 | channel_sums_squared_l1c += np.sum(image**2, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 147 | count_l1c += np.sum(image != NO_DATA_VAL[band], axis=(1, 2)) 148 | tmp_img = np.where(image == NO_DATA_VAL[band], np.nan, image) 149 | min_val_l1c = np.nanmin([min_val_l1c, np.nanmin(tmp_img, axis=(1, 2))], axis=0) 150 | max_val_l1c = np.nanmax([max_val_l1c, np.nanmax(tmp_img, axis=(1, 2))], axis=0) 151 | 152 | mean_l2a = channel_sums_l2a/count_l2a 153 | std_l2a = np.sqrt((channel_sums_squared_l2a/count_l2a) - mean_l2a**2) 154 | mean_l1c = channel_sums_l1c/count_l1c 155 | std_l1c = np.sqrt((channel_sums_squared_l1c/count_l1c) - mean_l1c**2) 156 | return_dict[band + "_l2a"] = { 157 | 'mean': list(mean_l2a.astype(float)), 158 | 'std': list(std_l2a.astype(float)), 159 | 'min': list(min_val_l2a.astype(float)), 160 | 'max': list(max_val_l2a.astype(float)) 161 | } 162 | return_dict[band + "_l1c"] = { 163 | 'mean': list(mean_l1c.astype(float)), 164 | 'std': list(std_l1c.astype(float)), 165 | 'min': list(min_val_l1c.astype(float)), 166 | 'max': list(max_val_l1c.astype(float)) 167 | } 168 | print(return_dict[band + "_l2a"]) 169 | print(return_dict[band + "_l1c"]) 170 | 171 | else: 172 | 173 | for i in indices: 174 | 175 | image = np.float64(f[band][i]) 176 | tmp = np.sum(image, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 177 | 178 | channel_sums += tmp 179 | # if band == 'sentinel1': 180 | # channel_sums_squared += np.sum(image**2, axis=(1, 2), where=(image != NO_DATA_VAL[band] or image != float('-inf'))) 181 | # else: 182 | channel_sums_squared += np.sum(image**2, axis=(1, 2), where=(image != NO_DATA_VAL[band])) 183 | if np.any(channel_sums_squared > max_range): 184 | print("channel_sums_squared: ", 
channel_sums_squared) 185 | print("index", i) 186 | raise OverflowError("channel_sums_squared is greater than max_range") 187 | 188 | # computing min and max 189 | # replace all no data with nan 190 | tmp_img = np.where(image == NO_DATA_VAL[band], np.nan, image) 191 | min_val = np.nanmin([min_val, np.nanmin(tmp_img, axis=(1, 2))], axis=0) 192 | max_val = np.nanmax([max_val, np.nanmax(tmp_img, axis=(1, 2))], axis=0) 193 | count_ += np.sum(image != NO_DATA_VAL[band], axis=(1, 2)) 194 | 195 | 196 | mean = channel_sums/count_ 197 | std = np.sqrt((channel_sums_squared/count_) - mean**2) 198 | 199 | return_dict[band] = { 200 | 'mean': list(mean.astype(float)), 201 | 'std': list(std.astype(float)), 202 | 'min': list(min_val.astype(float)), 203 | 'max': list(max_val.astype(float)) 204 | } 205 | 206 | print(return_dict[band]) 207 | # exit() 208 | 209 | with open(store_path, 'w') as f: 210 | json.dump(return_dict, f) 211 | 212 | 213 | 214 | if __name__ == "__main__": 215 | # DATA_PATH/TILE_INFO/STORE_PATH above are commented out, so read the data folder from the command line 216 | import sys 217 | compute_band_stats(data_folder = sys.argv[1]) -------------------------------------------------------------------------------- /utils/splits.py: -------------------------------------------------------------------------------- 1 | # a file that has functions to create the train, val, and test splits from a h5 file. 2 | # we only create a new file, with indices of the train, val, and test splits. 3 | 4 | 5 | import h5py 6 | import numpy as np 7 | import os 8 | import json 9 | 10 | # data_path = "/home/qbk152/vishal/global-lr/data/data_100k_130.h5" 11 | # tile_info = "/home/qbk152/vishal/global-lr/data/data_100k_130_tile_info.json" 12 | # store_path = "/home/qbk152/vishal/global-lr/data/data_100k_130_splits.json" 13 | 14 | 15 | def create_splits(data_folder = '', tile_info = '', store_path = '', train_split=1.0, val_split=0.0, test_split=0): 16 | 17 | if data_folder == '': 18 | raise ValueError("Please provide the path to the data folder") 19 | name = data_folder.split('/')[-1] if data_folder[-1] != '/' else data_folder.split('/')[-2] 20 | data_path = os.path.join(data_folder, name + '.h5') 21 | tile_info = os.path.join(data_folder, name + '_tile_info.json') if tile_info == '' else tile_info 22 | store_path = os.path.join(data_folder, name + '_splits.json') if store_path == '' else store_path 23 | 24 | 25 | # read the tile info 26 | with open(tile_info, 'r') as f: 27 | tile_info = json.load(f) 28 | 29 | # read the h5 file 30 | f = h5py.File(data_path, 'r') 31 | 32 | meta = f['metadata'] 33 | bands = list(i for i in f.keys() if i != 'metadata') 34 | print(bands) 35 | 36 | 37 | print("number of images: ", len(meta)) 38 | 39 | # create the splits 40 | num_images = len(meta) 41 | num_train = int(train_split * num_images) 42 | num_val = int(val_split * num_images) 43 | num_test = num_images - num_train - num_val 44 | 45 | # create the indices 46 | indices = np.arange(num_images) 47 | np.random.shuffle(indices) 48 | 49 | train_indices = indices[:num_train] 50 | val_indices = indices[num_train:num_train + num_val] 51 | test_indices = indices[num_train + num_val:] 52 | 53 | 54 | # create the splits 55 | splits = { 56 | 'train': train_indices.tolist(), 57 | 'val': val_indices.tolist(), 58 | 'test': test_indices.tolist() 59 | } 60 | 61 | # store the splits in a json file 62 | with open(store_path, 'w') as f: 63 | json.dump(splits, f) 64 | 65 | 66 | # # create the splits 67 | # train_split = {} 68 | # val_split = {} 69 | # test_split = {} 70 | 71 | # for band in bands: 72 | # print('creating splits for band: ', band) 73 | # data =
f[band] 74 | # train_split[band] = data[train_indices] 75 | # val_split[band] = data[val_indices] 76 | # test_split[band] = data[test_indices] 77 | 78 | # # create the metadata splits 79 | # train_meta = meta[train_indices] 80 | # val_meta = meta[val_indices] 81 | # test_meta = meta[test_indices] 82 | 83 | # # create the output file 84 | # f_out = h5py.File(store_path, 'w') 85 | 86 | # # write the metadata 87 | # train_meta_out = f_out.create_dataset('train_metadata', data=train_meta) 88 | # val_meta_out = f_out.create_dataset('val_metadata', data=val_meta) 89 | # test_meta_out = f_out.create_dataset('test_metadata', data=test_meta) 90 | 91 | # # write the data 92 | # for band in bands: 93 | # print('writing band: ', band) 94 | # train_out = f_out.create_dataset('train_' + band, data=train_split[band]) 95 | # val_out = f_out.create_dataset('val_' + band, data=val_split[band]) 96 | # test_out = f_out.create_dataset('test_' + band, data=test_split[band]) 97 | 98 | # # close the files 99 | # f.close() 100 | # f_out.close() 101 | 102 | 103 | if __name__ == '__main__': 104 | create_splits(data_path, tile_info, store_path) -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import geojson 3 | import logging 4 | import ee 5 | import os 6 | import glob 7 | import config.ee_init 8 | import json 9 | 10 | 11 | def read_geojson(path): 12 | ''' 13 | Reads the geojson file 14 | ''' 15 | with open(path) as f: 16 | gj = geojson.load(f) 17 | return gj 18 | 19 | def merge_dicts(in_path, out_path = 'data/data_100k_130_tile_info.json'): 20 | ''' 21 | Merges the dictionaries from the tile_info json files. 22 | ''' 23 | 24 | # reading all the tile json files 25 | tile_info_dict = {} 26 | for tile_name in glob.glob(f'{in_path}/tile_info_*'): 27 | tmp = read_geojson(tile_name) 28 | tile_info_dict = tmp | tile_info_dict 29 | 30 | # writing the merged dictionary to a file 31 | print(len(tile_info_dict)) 32 | with open(out_path, 'w') as f: 33 | geojson.dump(tile_info_dict, f) 34 | 35 | # remove the individual tile_info files 36 | # for tile_name in glob.glob('data/tile_info/tile_info_*'): 37 | # os.remove(tile_name) 38 | 39 | def create_missing_tiles_geojson(): 40 | ''' 41 | Creates the missing tiles geojson file. 42 | ''' 43 | 44 | missing_tiles_csv = 'data/missing_tiles_1M.csv' 45 | tile_geojson = '/home/qbk152/vishal/global-lr/tile_polygons/uni_biomes_only/tiles_1M_130.geojson' 46 | 47 | geojson = { 48 | 'type': 'FeatureCollection', 49 | 'features': [] 50 | } 51 | 52 | # reading the tile geojson file 53 | gj = read_geojson(tile_geojson) 54 | features = gj['features'] 55 | 56 | # reading the missing tiles csv file 57 | for line in open(missing_tiles_csv): 58 | line = line.strip() 59 | line = line.split(',') 60 | tile_name = line[0] 61 | print('Processing tile: ', tile_name) 62 | 63 | # finding the tile in the tile geojson file 64 | for feature in features: 65 | if feature['properties']['tile_id'] == tile_name: 66 | geojson['features'].append(feature) 67 | break 68 | 69 | # writing the geojson file 70 | print('Total missing tiles: ', len(geojson['features'])) 71 | with open('data/missing_tiles_1M.geojson', 'w') as f: 72 | json.dump(geojson, f) 73 | 74 | 75 | 76 | 77 | 78 | 79 | def update_tile_info(tile, ee_set_, tile_info = None): 80 | ''' 81 | Updates the tile information in the geojson file. 
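# A minimal illustrative sketch (not part of utils.py, the values below are made up) of the
# dict-union merge this function performs on the band information: `existing_bands | new_bands`
# keeps all keys from both dicts, and on a key collision the right-hand operand (the newly
# downloaded bands) wins.
existing_bands = {'sentinel2': ['B2', 'B3', 'B4'], 'aster': ['elevation', 'slope']}
new_bands = {'sentinel1_asc': ['VV', 'VH'], 'aster': ['elevation']}
merged = existing_bands | new_bands     # Python 3.9+ dict union, as used in update_tile_info and merge_dicts
print(merged['aster'])                  # ['elevation'] -- the right-hand side overrides on collision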
82 | ''' 83 | 84 | if tile_info is not None: 85 | # this implies that there already exists a tile_info file, and we need to read the information from that file and update it with the new information which for now is only the bands 86 | id = ee_set_.id 87 | existing_bands = tile_info['BANDS'] 88 | new_bands = ee_set_.img_bands 89 | bands = existing_bands | new_bands 90 | tile_info['BANDS'] = bands 91 | 92 | # # HARDCODED: adding the era5 data to the tile_info 93 | # if len(ee_set_.era5_data) > 0: 94 | # tile_info['era5'] = ee_set_.era5_data 95 | return tile_info 96 | else: 97 | return_dict = {} 98 | return_dict['S2_DATE'] = ee_set_.s2_date 99 | # return_dict['S2_IMAGEID'] = ee_set_.s2_imageid 100 | return_dict['S2_type'] = ee_set_.s2_type 101 | return_dict['CRS'] = ee_set_.crs 102 | return_dict['lat'] = ee_set_.lat 103 | return_dict['lon'] = ee_set_.lon 104 | return_dict['biome'] = ee_set_.biome 105 | return_dict['eco_region'] = ee_set_.eco_region 106 | return_dict['NO_DATA'] = ee_set_.no_data 107 | return_dict['BANDS'] = ee_set_.img_bands 108 | if len(ee_set_.era5_data) > 0: 109 | return_dict['era5'] = ee_set_.era5_data 110 | return return_dict 111 | 112 | 113 | 114 | def get_points_filter(roi, buffer_size=0): 115 | pnt_roi = roi.buffer(buffer_size, ee.ErrorMargin(1)).bounds() 116 | coord_list = ee.List(pnt_roi.coordinates().get(0)) 117 | b_left = ee.Geometry.Point(coord_list.get(0)) 118 | b_right = ee.Geometry.Point(coord_list.get(1)) 119 | t_right = ee.Geometry.Point(coord_list.get(2)) 120 | t_left = ee.Geometry.Point(coord_list.get(3)) 121 | 122 | points_filter = ee.Filter.And( 123 | ee.Filter.geometry(b_right), 124 | ee.Filter.geometry(t_left) 125 | ) 126 | 127 | return points_filter 128 | 129 | 130 | def get_ee_task_list(): 131 | ''' 132 | Gets the list of all the tasks in the EE project. 133 | ''' 134 | tasks = [] 135 | task_list = ee.data.getTaskList() 136 | for task in task_list: 137 | if task['state'] in ['RUNNING', 'READY']: 138 | tasks.append(task['id']) 139 | return tasks 140 | 141 | 142 | def read_txt(path): 143 | ''' 144 | Reads the txt file and returns a list of the lines in the file. 145 | ''' 146 | string_list = [] 147 | with open(path, 'r') as f: 148 | for line in f: 149 | string_list.append(line.strip()) 150 | return string_list 151 | 152 | def write_json(path, data): 153 | ''' 154 | Writes the data to the json file. 155 | ''' 156 | with open(path, 'w') as f: 157 | json.dump(data, f) 158 | 159 | def read_json(path): 160 | ''' 161 | Reads the json file and returns the data. 162 | ''' 163 | with open(path, 'r') as f: 164 | data = json.load(f) 165 | return data 166 | 167 | 168 | if __name__ == '__main__': 169 | merge_dicts('/home/qbk152/vishal/global-lr/data/data_300k_130_tile_info.json') 170 | # create_missing_tiles_geojson() --------------------------------------------------------------------------------
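A minimal sketch (not part of the repository; the file name below is hypothetical) of how the non-image modalities written by utils/convert_to_h5.py can be decoded when reading the resulting h5 file: month, lat and lon are stored as sin/cos pairs, while biome and eco_region are stored as one-hot vectors.

import h5py
import numpy as np

with h5py.File('data_1M_128.h5', 'r') as f:                      # hypothetical file name
    sin_m, cos_m = f['month'][0]                                  # cyclic encoding sin(2*pi*m/12), cos(2*pi*m/12)
    month = int(round(np.arctan2(sin_m, cos_m) * 12 / (2 * np.pi))) % 12
    month = 12 if month == 0 else month                           # month 12 wraps around to angle 0
    biome = int(np.argmax(f['biome'][0]))                         # one-hot over 14 biome classes
    eco_region = int(np.argmax(f['eco_region'][0]))               # one-hot over 846 eco-regions
    lat = np.degrees(np.arctan2(*f['lat'][0]))                    # lat/lon stored as (sin, cos) of the angle
    print(month, biome, eco_region, lat)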