├── .gitignore ├── LICENSE ├── README.md ├── dataset ├── France_RPG │ ├── RPG2DF.py │ └── exploreRPG_labels.py ├── README.md ├── __init__.py ├── labelled_dense │ ├── SS │ │ ├── __init__.py │ │ ├── extract_images_for_parcel_labels.py │ │ ├── extract_parcel_labels_raster.py │ │ └── make_image_timeseries_for_parcel_labels.py │ ├── __init__.py │ ├── extract_images_for_labels.py │ ├── extract_images_for_parcel_labels.py │ ├── extract_labels_raster.py │ ├── extract_parcel_ground_truths.py │ ├── find_parcel_dimensions.py │ ├── make_image_timeseries_for_labels.py │ ├── make_image_timeseries_for_parcel_labels.py │ ├── make_labelled_dataset.sh │ ├── make_labelled_parcel_dataset.sh │ └── split_ground_truths_by_location.ipynb └── unlabelled │ ├── __init__.py │ ├── extract_images.py │ ├── make_image_timeseries.py │ └── make_unlabelled_dataset.sh ├── diagram.png ├── download ├── README.md ├── __init__.py ├── download.sh ├── find_S2_products_for_tile.ipynb ├── find_S2_tiles_for_aoi.ipynb ├── get_downloaded_products_info.ipynb └── sentinelsat_download_tileid.py ├── requirements.txt └── utils ├── __init__.py ├── data_utils.py ├── date_utils.py ├── geospatial_data_utils.py ├── multiprocessing_utils.py └── sentinel_products_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | **/*.png 3 | __pycache__ 4 | **/__pycache__ 5 | *.pyc 6 | **/*.pyc 7 | misc 8 | **/pw.csv 9 | pw.csv 10 | **/.ipynb_checkpoints 11 | run_many.sh 12 | 13 | # exclude superseded 14 | dataset/labelled_dense/SS 15 | download/SS 16 | 17 | # exclude files not tested/documented 18 | dataset/labelled_dense/extract_images_for_parcel_labels.py 19 | dataset/labelled_dense/extract_parcel_ground_truths.py 20 | dataset/labelled_dense/find_parcel_dimensions.py 21 | dataset/labelled_dense/make_image_timeseries_for_parcel_labels.py 22 | dataset/labelled_dense/make_labelled_parcel_dataset.sh 23 | 24 | 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepSatData: Building large scale datasets of satellite images for training machine learning models 2 | ![plot](./diagram.png) 3 | DeepSatData is a toolkit for making datasets from satellite imagery suitable for training machine learning models. 4 | The process is split into two distinct parts: 5 | - identifying and downloading relevant Sentinel products for an area and time period of interest. Read more in [download](./download) 6 | - processing downloaded products into datasets. Read more in [dataset](./dataset). 7 | 8 | Further details on the methodology used can be found in our papers 9 | ["DeepSatData: Building large scale datasets of satellite images for training machine learning models"](arxiv url) and 10 | ["Context-self contrastive pretraining for crop type semantic segmentation"](https://arxiv.org/abs/2104.04310). 
11 | 12 | ## Dependencies 13 | Install dependencies using pip 14 | ``` 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | or creating a conda environment 19 | ``` 20 | conda create --name --file requirements.txt 21 | ``` 22 | 23 | ## Citation 24 | If you use DeepSatData in your research consider citing the following BibTeX entries: 25 | ``` 26 | @misc{tarasiou2021deepsatdata, 27 | title={DeepSatData: Building large scale datasets of satellite images for training machine learning models}, 28 | author={Michail Tarasiou and Stefanos Zafeiriou}, 29 | year={2021}, 30 | eprint={2104.13824}, 31 | archivePrefix={arXiv}, 32 | primaryClass={cs.CV} 33 | } 34 | 35 | @misc{tarasiou2021contextself, 36 | title={Context-self contrastive pretraining for crop type semantic segmentation}, 37 | author={Michail Tarasiou and Riza Alp Guler and Stefanos Zafeiriou}, 38 | year={2021}, 39 | eprint={2104.04310}, 40 | archivePrefix={arXiv}, 41 | primaryClass={cs.CV} 42 | } 43 | ``` 44 | 45 | 46 | ## License 47 | This project is under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for details. 48 | 49 | 51 | -------------------------------------------------------------------------------- /dataset/France_RPG/RPG2DF.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import shapefile 3 | from shapely import geometry 4 | import pandas as pd 5 | import os 6 | 7 | 8 | def main(): 9 | args = parser.parse_args() 10 | 11 | rpg_file = os.path.join(args.rpg_dir, 'PARCELLES_GRAPHIQUES') 12 | 13 | sf = shapefile.Reader(rpg_file) 14 | year = args.rpg_dir.split("-")[-1] 15 | # print(year) 16 | 17 | data = [] 18 | for i in range(len(sf)): 19 | # if i == 100: 20 | # break 21 | if i % 1e6 == 0: 22 | print('processing record %d of %d' % (i, len(sf))) 23 | s = sf.shape(i) 24 | rec = sf.record(i) 25 | parcel = geometry.Polygon(s.points) 26 | data.append([parcel, rec[2]]) 27 | 28 | data = pd.DataFrame(data, columns=['geometry', 'CODE_CULTU']) 29 | 30 | print("num parcels in data file: %d" % data.shape[0]) 31 | 32 | codecultu = data['CODE_CULTU'].drop_duplicates().tolist() 33 | codecultu = {code: i + 1 for i, code in enumerate(codecultu)} 34 | 35 | data['ground_truth'] = data['CODE_CULTU'].map(codecultu) 36 | del data['CODE_CULTU'] 37 | data['crs'] = args.epsg 38 | data['year'] = year 39 | data = data[['ground_truth', 'crs', 'year', 'geometry']] 40 | 41 | savedir = os.path.join(os.path.dirname(rpg_file), 'DF') 42 | if not os.path.exists(savedir): 43 | os.makedirs(savedir) 44 | 45 | data.to_csv(os.path.join(savedir, os.path.basename(rpg_file) + '_DF.csv'), index=False) 46 | 47 | pd.DataFrame([[k, v] for k, v in codecultu.items()], columns=['CODE_CULTU', 'ground_truth']) \ 48 | .to_csv(os.path.join(savedir, os.path.basename(rpg_file) + '_DF_codes.csv'), index=False) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser(description='Extract polygons and ground truths from RPG data') 53 | parser.add_argument('--rpg-dir', type=str, help='Path to RPG directory') 54 | parser.add_argument('--epsg', default='2154', type=str, help='EPSG coordinate system for RPG data') 55 | 56 | main() 57 | -------------------------------------------------------------------------------- /dataset/France_RPG/exploreRPG_labels.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaeltrs/DeepSatData/c2774597dc57af46f777ba2212fe026305d2fd1e/dataset/France_RPG/exploreRPG_labels.py 
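The canonical .csv written by RPG2DF.py stores each parcel's geometry as a WKT-style string (the serialized shapely polygon), so it can be read back with pandas and shapely. A minimal sketch, not part of the repository; the input path is hypothetical:
```python
# Minimal sketch (not part of the repository): read a canonical .csv produced by
# RPG2DF.py and parse the 'geometry' column back into shapely polygons.
import pandas as pd
from shapely import wkt

df = pd.read_csv("PARCELLES_GRAPHIQUES_DF.csv")   # hypothetical path to the RPG2DF.py output
df["geometry"] = df["geometry"].apply(wkt.loads)  # "POLYGON ((x y, ...))" strings -> shapely geometries

print(df[["ground_truth", "crs", "year"]].head())
print(df["geometry"].iloc[0].area)                # parcel area in CRS units (m^2 for EPSG:2154)
```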
-------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Process Sentinel products 2 | The goal is to make a timeseries of Sentinel images spanning one year, given a directory of 3 | downloaded Sentinel products. The code assumes that all (unzipped) Sentinel products saved in the same directory correspond to a 4 | single tile, for example: 5 | ``` 6 | products_dir 7 | └───T28PCB 8 | │ └───S2A_MSIL1C_20180702T113321_N0206_R080_T28PCB_20180702T151612.SAFE 9 | │ └───S2A_MSIL1C_20180831T113321_N0206_R080_T28PCB_20180831T153248.SAFE 10 | │ └───... 11 | └───T28PDA 12 | │ └───S2B_MSIL1C_20180316T112109_N0206_R037_T28PDA_20180316T132558.SAFE 13 | │ └───S2B_MSIL1C_20180624T112109_N0206_R037_T28PDA_20180624T132810.SAFE 14 | │ └───... 15 | ``` 16 | 17 | Because a Sentinel tile is too large to fit into GPU memory, we split each tile into smaller, manageable pieces 18 | of size (HxW) and stack pieces corresponding to the same location at different timestamps to create a timeseries object. 19 | The final output of the process is a .pickle file with the following contents: 20 | - a numpy array of size (TxH_ixW_i) named after each Sentinel band i. We do not rescale bands to a common resolution, 21 | so each band has a different size. T is the number of available dates 22 | - a numpy array named "doy" of size T which corresponds to the "day of the year" for each available date 23 | - a numpy array named "year" of size 1 corresponding to the year of observations 24 | 25 | If ground truth data are available we also include the following: 26 | - a numpy array named "labels" of size (HxW) corresponding to ground truth labels 27 | - a numpy array named "ids" of size (HxW) corresponding to parcel identities 28 | 29 | ## Including ground truth data 30 | ### Make canonical .csv 31 | If available, ground truth data are used in the form of a canonical .csv file containing the following columns: 32 | - id: (int) object id corresponding to polygon area (optional, if not included a unique integer will be assigned) 33 | - ground_truth: (int) class corresponding to polygon area 34 | - crs: (int) geographic coordinate reference system 35 | - year: (int) the year the ground truth is valid for the given geometry 36 | - geometry: (str) shapely polygon or multipolygon 37 | 38 | For example: 39 | ``` 40 | ground_truth,crs,year,geometry 41 | 1,32628,2019,"POLYGON ((325059.9695234112 1579552.827570891, 325082.9883194482 1579557.590080416, ...))" 42 | 2,32628,2019,"POLYGON ((325108.9175379751 1579675.065315364, 325119.871309883 1579667.392383354, .))" 43 | ``` 44 | 45 | Specifically for 46 | [RPG](https://www.data.gouv.fr/en/datasets/registre-parcellaire-graphique-rpg-contours-des-parcelles-et-ilots-culturaux-et-leur-groupe-de-cultures-majoritaire/) 47 | crop type data for France, the following can be used to transform .shp files to canonical .csv: 48 | ```shell 49 | python dataset/France_RPG/RPG2DF.py --rpg-dir 50 | ``` 51 | 52 | ### Generate data 53 | We distinguish between two different use cases: 54 | 1. we overlay a grid with cell size equal to the desired sample_size on the AOI. 55 | For each grid square we make a raster of all ground truths and satellite images (timeseries) that fall into that square.
56 | The end result is a set of samples of size (sample_size X sample_size), each containing potentially multiple fields and 57 | not necessarily whole fields, as some will be cut at the image boundaries. 58 | 2. for each object in the canonical .csv we create a raster ground truth image in which the object is centered and all other pixels not 59 | falling inside the polygon region are assigned the background class. We also generate satellite image timeseries as before. 60 | This results in a single object per sample at the center of the image. 61 | 62 | #### Use case 1 63 | For use case 1 run the following bash script to generate data corresponding to spatial locations for which ground 64 | truths in the form of parcel polygons are available. 65 | ```shell 66 | sh dataset/labelled_dense/make_labelled_dataset.sh ground_truths_file=<1:ground_truths_file> products_dir=<2:products_dir> labels_dir=<3:labels_dir> windows_dir=<4:windows_dir> timeseries_dir=<5:timeseries_dir> 67 | res=<6:res> sample_size=<7:sample_size> num_processes=<8:num_processes> bands=<9:bands (optional)> 68 | ``` 69 | where: 70 | - ground_truths_file: file path for canonical .csv file as defined above 71 | - products_dir: directory path for downloaded Sentinel products 72 | - labels_dir: directory to save rasterized ground truths 73 | - windows_dir: directory to save extracted image windows 74 | - timeseries_dir: directory to save final timeseries objects 75 | - res: highest resolution of satellite image bands, 10 (m) for Sentinel-2 76 | - sample_size: number of pixels of final image windows (for the highest resolution image band) and ground truths 77 | - num_processes: number of processes to run in parallel 78 | - bands: (list) which satellite image bands to use, e.g. 'B02,B03,B04,...'. If not specified all bands are used (optional) 79 | 80 | #### Use case 2 81 | 82 | For use case 2 we first need to decide on the spatial dimensions of the samples. The following command finds the maximum N-S and E-W 83 | distance for each parcel, as well as the maximum of the two distances, and saves a cumulative histogram of these 84 | dimensions. 85 | ```shell 86 | python dataset/labelled_dense/find_parcel_dimensions.py ground_truths_file=<1:ground_truths_file> products_dir=<2:products_dir> save_dir=<3:save_dir> 87 | ``` 88 | where: 89 | - ground_truths_file: file path for canonical .csv file as defined above 90 | - products_dir: directory path for downloaded Sentinel products 91 | - save_dir: directory to save output 92 | 93 | This information will help guide the decision on the sample size: we want most parcels to fit in the sample, but not make 94 | it larger than needed, as this would waste computational resources. Of course other considerations may come into play. 
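As a rough illustration of the kind of analysis find_parcel_dimensions.py performs, the sketch below estimates per-parcel extents and a percentile-based sample size directly from the canonical .csv. It is a sketch only (not the script itself), assumes a projected CRS with metre units, and uses a hypothetical input path:
```python
# Sketch only: estimate per-parcel N-S / E-W extents (metres) from the canonical .csv
# and translate a high percentile into a sample_size in pixels. Path is hypothetical.
import numpy as np
import pandas as pd
from shapely import wkt

gt = pd.read_csv("ground_truths.csv")                             # hypothetical canonical .csv
bounds = np.stack([wkt.loads(g).bounds for g in gt["geometry"]])  # (minx, miny, maxx, maxy) per parcel
ew = bounds[:, 2] - bounds[:, 0]                                  # E-W extent
ns = bounds[:, 3] - bounds[:, 1]                                  # N-S extent
max_dim = np.maximum(ew, ns)                                      # larger of the two per parcel

res = 10                                                          # highest Sentinel-2 resolution (m)
p95 = np.percentile(max_dim, 95)
print("95%% of parcels fit within %.0f m -> sample_size >= %d pixels" % (p95, int(np.ceil(p95 / res))))
```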
95 | 96 | After deciding on the sample size we run the following command to generate use case 2 data: 97 | 98 | ```shell 99 | sh dataset/labelled_dense/make_labelled_parcel_dataset.sh ground_truths_file=<1:ground_truths_file> products_dir=<2:products_dir> labels_dir=<3:labels_dir> windows_dir=<4:windows_dir> timeseries_dir=<5:timeseries_dir> 100 | res=<6:res> sample_size=<7:sample_size> Npoly=<8:Npoly> num_processes=<9:num_processes> bands=<10:bands (optional)> 101 | ``` 102 | ```shell 103 | sh dataset/labelled_dense/make_labelled_parcel_dataset.sh ground_truths_file='/media/michaeltrs/sdb/HD2/Data/Satellite_Imagery/RPG/T31FM_18/example_parcels_in_AOI.csv' products_dir='/media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/PSETAE_repl/2018/cloud_0_30' labels_dir='/media/michaeltrs/sdb/HD2/Data/Satellite_Imagery/RPG/T31FM_18_example/LABELS' windows_dir='/media/michaeltrs/sdb/HD2/Data/Satellite_Imagery/RPG/T31FM_18_example/IMAGES' timeseries_dir='/media/michaeltrs/sdb/HD2/Data/Satellite_Imagery/RPG/T31FM_18_example/TIMESERIES' res=10 sample_size=100 Npoly=50 num_processes=8 104 | ``` 105 | 106 | ## Without ground truth data 107 | In this case we only need to provide the directory where Sentinel products have been downloaded. Optionally we can provide an 108 | anchor point which will be used when constructing the grid for splitting the AOI into smaller pieces. If provided, the 109 | anchor will be placed at a vertex of the constructed grid. 110 | 111 | ### Generate data 112 | Run the following bash script to generate data for all spatial locations covered by the 113 | downloaded Sentinel products. 114 | ```shell 115 | sh dataset/unlabelled/make_unlabelled_dataset.sh products_dir=<1:products_dir> windows_dir=<2:windows_dir> timeseries_dir=<3:timeseries_dir> res=<4:res> 116 | sample_size=<5:sample_size> num_processes=<6:num_processes> anchor=<7:anchor (optional)> bands=<8:bands (optional)> 117 | ``` 118 | where: 119 | - products_dir: (str) directory path for downloaded Sentinel products 120 | - windows_dir: (str) directory to save extracted image windows 121 | - timeseries_dir: (str) directory to save final timeseries objects 122 | - res: (int) highest resolution of satellite image bands, 10 (m) for Sentinel-2 123 | - sample_size: (int) number of pixels of final image windows (for the highest resolution image band) and ground truths 124 | - num_processes: (int) number of processes to run in parallel 125 | - anchor: (list) (N,W,CRS) coordinates of an anchor point and CRS to use as a corner for extracting windows (optional) 126 | - bands: (list) which satellite image bands to use, e.g. 'B02,B03,B04,...'. 
If not specified all bands are used (optional) 127 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaeltrs/DeepSatData/c2774597dc57af46f777ba2212fe026305d2fd1e/dataset/__init__.py -------------------------------------------------------------------------------- /dataset/labelled_dense/SS/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaeltrs/DeepSatData/c2774597dc57af46f777ba2212fe026305d2fd1e/dataset/labelled_dense/SS/__init__.py -------------------------------------------------------------------------------- /dataset/labelled_dense/SS/extract_images_for_parcel_labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | Given a set of S2 tiles and a labelled_dense lable map, extract crops of images matching the location of labels 3 | """ 4 | import argparse 5 | import pandas as pd 6 | import rasterio 7 | import numpy as np 8 | import os 9 | from glob import glob 10 | import pickle 11 | if __name__ == "__main__" and __package__ is None: 12 | from sys import path 13 | from os.path import dirname as dir 14 | path.insert(0, dir(dir(path[0]))) 15 | __package__ = "examples" 16 | from utils.data_utils import find_number 17 | from utils.geospatial_data_utils import GeoTransform 18 | from utils.multiprocessing_utils import run_pool 19 | from utils.sentinel_products_utils import get_S2prod_info 20 | 21 | 22 | mult = {'B01': 1/6.,'B02': 1., 'B03': 1., 'B04': 1., 'B05': 1./2., 'B06': 1./2., 'B07': 1./2., 'B08': 1., 'B8A': 1./2, 23 | 'B09': 1./6., 'B10': 1./6., 'B11': 1./2., 'B12': 1./2.} 24 | # jp2s = ["%s.jp2" % i for i in mult.keys()] 25 | 26 | 27 | def extract_images(imdirs): 28 | 29 | jp2s = ["%s.jp2" % i for i in bands] 30 | 31 | saved_files_info = [] 32 | 33 | for ii, imdir in enumerate(imdirs): 34 | # ii, imdir = 1, imdirs[1] 35 | 36 | print("unfolding product %d of %d" % (ii, len(imdirs))) 37 | 38 | imname = "%s_%s" % (imdir.split("/")[-2].split("_")[-3], imdir.split("/")[-4].split("_")[-5]) 39 | 40 | # read product 41 | data = {} 42 | for jp2 in jp2s: 43 | with rasterio.open("%s/%s_%s" % (imdir, imname, jp2)) as f: 44 | data[jp2[:-4]] = f.read(1) 45 | 46 | # if str(f.crs).split(':')[1] != CRSl: 47 | # geotransform_prod2label = GeoTransform(str(f.crs).split(':')[1], CRSl, loc2loc=CRSl != '4326') 48 | # geotransform_label2prod = GeoTransform(CRSl, str(f.crs).split(':')[1], loc2loc=CRSl != '4326') 49 | # Wp, Np = geotransform_prod2label(np.array(f.transform)[2], np.array(f.transform)[5]) 50 | # else: 51 | # Wp, Np = np.array(f.transform)[2], np.array(f.transform)[5] 52 | geotransform_label2prod = GeoTransform(CRSl, str(f.crs).split(':')[1], loc2loc=CRSl != '4326') 53 | Wp, Np = np.array(f.transform)[2], np.array(f.transform)[5] 54 | 55 | prod_savedir = os.path.join(savedir, imdir.split("/")[-4].split(".")[0]) 56 | if not os.path.exists(prod_savedir): 57 | os.makedirs(prod_savedir) 58 | 59 | # saved_gt_info[saved_gt_info['Ntl']==saved_gt_info['Ntl'].max()] 60 | for i in range(saved_gt_info.shape[0]): 61 | # i = 3600 62 | # i = 4500 63 | # i = 4000 64 | 65 | Nl = saved_gt_info.iloc[i]['Ntl'] 66 | Wl = saved_gt_info.iloc[i]['Wtl'] 67 | Wlp, Nlp = geotransform_label2prod(Wl, Nl) 68 | 69 | # ip = int((Np - Nl) / res) # + 2 70 | # jp = int((Wl - Wp) / res) # + 2 71 | ip = int((Np - Nlp) / res) # + 2 72 | 
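# ip (above) and jp (below) are the row/column offsets, in res-metre pixels, of the label window's top-left corner within the Sentinel product raster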
jp = int((Wlp - Wp) / res) # + 2 73 | 74 | # # sample outside Sentinel product 75 | # if (ip < 0) or (jp < 0): 76 | # saved_files_info.append( 77 | # [None, Nl, Wl, Np, Wp, ip, jp, sample_size, sample_size, date, imdir, 78 | # "sample outside Sentinel product"]) 79 | # continue 80 | 81 | date = imdir.split("/")[-4].split(".")[0].split("_")[2][:8] 82 | 83 | sample = {} 84 | for jp2 in jp2s: 85 | xpmin = int(np.round(mult[jp2[:-4]] * ip)) 86 | ypmin = int(np.round(mult[jp2[:-4]] * jp)) 87 | sample[jp2[:-4]] = data[jp2[:-4]][xpmin: xpmin + int(mult[jp2[:-4]] * sample_size), 88 | ypmin: ypmin + int(mult[jp2[:-4]] * sample_size)] 89 | 90 | # this parcel falls in black region for this product 91 | if sample[jp2[:-4]].sum() == 0: 92 | saved_files_info.append( 93 | ["", Nlp, Wlp, Nl, Wl, Np, Wp, ip, jp, sample_size, sample_size, date, imdir, "no image"]) 94 | continue 95 | 96 | # import matplotlib.pyplot as plt 97 | # 98 | # 99 | # with open(saved_gt_info.iloc[i]['filepath'], 'rb') as handle: 100 | # labels = pickle.load(handle) # , protocol=pickle.HIGHEST_PROTOCOL)plt.figure() 101 | # 102 | # print(ip, jp) 103 | # 104 | # 105 | # plt.figure() 106 | # plt.imshow(sample['B03']) 107 | # plt.imshow(labels['ratios'], alpha=0.7) 108 | # 109 | # # plt.figure() 110 | # # plt.imshow(labels['ratios']) 111 | # ij = np.array([[3534, 10068], [3582, 10746], [9828, 3456]]) 112 | # l = np.array([[63, 65], [43, 58], [47, 66]]) 113 | # im = np.array([[63, 70], [45, 65], [55, 68.5]]) 114 | # im - l 115 | 116 | sample_save_path = "%s/N%d_W%d_D%s_CRS%s.pickle" % (prod_savedir, int(Nl), int(Wl), date, CRSl) 117 | with open(sample_save_path, 'wb') as handle: 118 | pickle.dump(sample, handle, protocol=pickle.HIGHEST_PROTOCOL) 119 | 120 | saved_files_info.append( 121 | [sample_save_path, Nlp, Wlp, Nl, Wl, Np, Wp, ip, jp, sample_size, sample_size, date, imdir, "ok"]) 122 | 123 | df = pd.DataFrame(data=saved_files_info, 124 | columns=['sample_path', 'Nlp', 'Wlp', 'Nl', 'Wl', 'Np', 'Wp', 'ip', 'jp', 125 | 'height', 'width', 'Date', 'S2_prod_imdir', "comment"]) 126 | return df 127 | 128 | 129 | def main(): 130 | # ground truths 131 | gtfiles = os.listdir(ground_truths_dir) 132 | # years = [find_number(s, "Y") for s in gtfiles] 133 | # files = {year: {} for year in set(years)} 134 | # for i, file in enumerate(gtfiles): 135 | # if not file.startswith('INVALID'): 136 | # files[years[i]][file.split("_")[0]] = file 137 | # print("found ground truths for years %s" % ", ".join(list(files.keys()))) 138 | # global labels 139 | # global Nl 140 | # global Wl 141 | global CRSl 142 | global saved_gt_info 143 | 144 | # global num_rows 145 | # global num_cols 146 | 147 | # sentinel products 148 | imdirs = glob("%s/*.SAFE/GRANULE/**/IMG_DATA" % products_dir) 149 | prod_df = get_S2prod_info(imdirs) 150 | prod_df['Year'] = prod_df['Time'].apply(lambda s: s[:4]) 151 | 152 | out = [] 153 | for gtfile in gtfiles: 154 | # gtfile = gtfiles[0] 155 | 156 | # # ground truths 157 | # labels = np.loadtxt(os.path.join(ground_truths_dir, files[year]['LABELS']), dtype=np.float32) 158 | # Nl = int(find_number(files[year]['LABELS'], "N")) 159 | # Wl = int(find_number(files[year]['LABELS'], "W")) 160 | # 161 | # num_rows, num_cols = [d / sample_size for d in labels.shape] 162 | # assert (np.ceil(num_rows) == num_rows) and (np.ceil(num_cols) == num_cols), \ 163 | # "sample size should be fitting exactly in labels, this suggests an error in extract_labels_raster script" 164 | saved_gt_info = pd.read_csv(os.path.join(ground_truths_dir, gtfile, 
'saved_data_info.csv')) 165 | 166 | year = find_number(gtfile, "Y") 167 | CRSl = find_number(gtfile, "CRS") 168 | 169 | # sentinel products 170 | products = prod_df[prod_df['Year'] == year] 171 | imdirs = products['path'].tolist() 172 | 173 | df_year = run_pool(imdirs, extract_images, num_processes) 174 | # df = extract_images([imdirs[0]]) 175 | out.append(pd.concat(df_year)) 176 | 177 | df = pd.concat(out).reset_index(drop=True) 178 | df['crs'] = CRSl 179 | df.to_csv(os.path.join(savedir, "extracted_windows_data_info.csv"), index=False) 180 | 181 | 182 | if __name__ == "__main__": 183 | 184 | # parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 185 | # parser.add_argument('--ground_truths_dir', help='directory containing ground truth parcels raster') 186 | # parser.add_argument('--products_dir', help='directory containing downloaded sentinel products') 187 | # parser.add_argument('--savedir', help='save directory to extract sentinel products windows') 188 | # parser.add_argument('--bands', default=None, help='which satellite image bands to use') 189 | # parser.add_argument('--res', default=10, help='pixel size in meters') 190 | # parser.add_argument('--sample_size', default=24, help='spatial resolution of dataset samples') 191 | # parser.add_argument('--num_processes', default=4, help='number of parallel processes') 192 | # # --------------------------------------------------------------------------------------------- 193 | # 194 | # args = parser.parse_args() 195 | # 196 | # ground_truths_dir = args.ground_truths_dir 197 | # 198 | # products_dir = args.products_dir 199 | # 200 | # savedir = args.savedir 201 | # print("savedir: ", savedir) 202 | # if not os.path.exists(savedir): 203 | # os.makedirs(savedir) 204 | # 205 | # bands = args.bands 206 | # if bands == 'None': 207 | # bands = list(mult.keys()) 208 | # else: 209 | # bands = bands.split(',') 210 | # 211 | # res = int(args.res) 212 | # 213 | # sample_size = int(args.sample_size) 214 | # 215 | # num_processes = int(args.num_processes) 216 | 217 | ground_truths_dir = '/media/michaeltrs/sdb/HD2/Data/Satellite_Imagery/RPG/T31FM_18/LABELS4' 218 | products_dir = '/media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/PSETAE_repl/2018/cloud_0_30' 219 | savedir = '/media/michaeltrs/sdb/HD2/Data/Satellite_Imagery/RPG/T31FM_18/IMAGES' 220 | bands = 'None' 221 | if bands == 'None': 222 | bands = list(mult.keys()) 223 | else: 224 | bands = bands.split(',') 225 | res = 10 226 | sample_size = 100 227 | num_processes = 4 228 | 229 | 230 | # main() 231 | -------------------------------------------------------------------------------- /dataset/labelled_dense/SS/extract_parcel_labels_raster.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import numpy as np 4 | from shapely import geometry 5 | import os 6 | from glob import glob 7 | from multiprocessing import Pool 8 | if __name__ == "__main__" and __package__ is None: 9 | from sys import path 10 | from os.path import dirname as dir 11 | path.insert(0, dir(dir(path[0]))) 12 | __package__ = "examples" 13 | from utils.geospatial_data_utils import GeoTransform, get_points_from_str_poly 14 | from utils.multiprocessing_utils import split_df 15 | from utils.sentinel_products_utils import get_S2prod_info 16 | import matplotlib.pyplot as plt 17 | import pickle 18 | from copy import deepcopy 19 | 20 | 21 | def is_valid(parcel_poly, pxmin, pymax): 22 | """ 23 | checks if 
parcel_poly polygon has valid shape 24 | """ 25 | isvalid = True 26 | i = 0 27 | j = 0 28 | pix_points = [[pxmin + loc[0] * res, pymax - loc[1] * res] for loc in 29 | [[j, i], [j + 1, i], [j + 1, i + 1], [j, i + 1], [j, i]]] 30 | try: 31 | parcel_poly.intersection(geometry.Polygon(pix_points)).area 32 | except: 33 | isvalid = False 34 | return isvalid 35 | 36 | 37 | def plot_poly(points, c=None, newfig=False): 38 | if type(points) in [list, tuple]: 39 | points = np.array(points) 40 | if c is None: 41 | c = "r" 42 | if newfig: 43 | plt.figure() 44 | for i in range(points.shape[0] - 1): 45 | plt.plot(points[i:i + 2, 0], points[i:i + 2, 1], c=c) 46 | 47 | 48 | def str_line_eq(points, h=1e-1): 49 | assert points.shape == (2, 2), 'Two points must be used to derive straight line equation' 50 | x1, y1 = points[0] 51 | x2, y2 = points[1] 52 | denom = x2 - x1 53 | if denom == 0: 54 | denom = h 55 | a = (y2 - y1) / denom # (x2 - x1) 56 | b = (y1 * x2 - x1 * y2) / denom # (x2 - x1) 57 | return a, b 58 | 59 | 60 | def extract_parcel_labels_raster(inputs): 61 | 62 | # rank = 0 63 | rank, geodata, W, N, Wp, Np, year, crs = inputs 64 | # rank, geodata, W, N, Wp, Np, year, crs = inputs[0] 65 | 66 | # # arrays to save 67 | # AOI_labels = np.zeros((int(np.round(dy / res)), int(np.round(dx / res))), dtype=np.float32) # + max_label + 1 68 | # AOI_ids = np.zeros((int(np.round(dy / res)), int(np.round(dx / res))), dtype=np.float32) 69 | # AOI_masks = AOI_ids.copy() 70 | # # additional/helper arrays 71 | # AOI_ratios = AOI_ids.copy() 72 | year_savedir = os.path.join(savedir, 'Y%s_N%s_W%s_R%d_CRS%s' % (year, N, W, res, crs)) 73 | if not os.path.exists(year_savedir): 74 | os.makedirs(year_savedir) 75 | 76 | saved_data_info = [] 77 | # invalid_shapes = [] 78 | for ii in range(geodata.shape[0]): 79 | # ii = 3600 # 4500 80 | print("process %d, parcel %d of %d" % (rank, ii+1, geodata.shape[0])) 81 | parcel_poly = geodata['geometry'][ii] 82 | label = geodata['ground_truth'][ii] 83 | id = geodata['id'][ii] 84 | 85 | points = get_points_from_str_poly(parcel_poly) 86 | anchor = np.array(geometry.Polygon(points).centroid) 87 | # anchor = points.mean(axis=0) 88 | N0 = anchor[1] + sample_size * res / 2. 89 | W0 = anchor[0] - sample_size * res / 2. 90 | 91 | # correct for non integer offset wrt product Nmax, Wmax (top-left) coordinates 92 | dN = (Np - N0) % 60 93 | dW = (W0 - Wp) % 60 94 | N0 += dN 95 | W0 -= dW 96 | # anchor[1] = N0 - sample_size * res / 2. 97 | # anchor[0] = W0 + sample_size * res / 2. 98 | anchor = np.array([W0 + sample_size * res / 2., N0 - sample_size * res / 2.]) 99 | # anchor = points.mean(axis=0) #- sample_size * res / 2 100 | 101 | # pr = points - anchor 102 | pr = (points - anchor + sample_size * res / 2) # !!! 
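# pr: parcel vertices shifted to window-local coordinates (metres), origin at the window's south-west corner (x increases eastward, y northward)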
103 | parcel_poly = geometry.Polygon(pr) 104 | 105 | pxmin, pymin = pr.min(axis=0) 106 | pxmax, pymax = pr.max(axis=0) 107 | 108 | # DONT DO VERY SMALL ONES 109 | # if ((pxmax - pxmin) < 20) or ((pymax - pymin) < 20): 110 | 111 | if not is_valid(parcel_poly, pxmin, pymax): 112 | try: 113 | int_area = sum( 114 | [geometry.Polygon(np.array(pol.coords[:])).area for pol in parcel_poly.buffer(0).interiors]) 115 | ext_area = geometry.Polygon(np.array(parcel_poly.buffer(0).exterior.coords[:])).area 116 | if int_area / ext_area < 0.05: # threshold for discarding a parcel polygon 117 | print("included, number of interior areas %d, intarea: %.0f, extarea: %.0f, ratio: %.4f" % 118 | (len(parcel_poly.buffer(0).interiors), int_area, ext_area, int_area / ext_area)) 119 | parcel_poly = geometry.Polygon(np.array(parcel_poly.buffer(0).exterior.coords[:])) 120 | pr = np.stack([np.array(i) for i in parcel_poly.exterior.coords.xy]).T 121 | pxmin, pymin = pr.min(axis=0) 122 | pxmax, pymax = pr.max(axis=0) 123 | else: 124 | print("excluded, number of interior areas %d, intarea: %.0f, extarea: %.0f, ratio: %.4f" % 125 | (len(parcel_poly.buffer(0).interiors), int_area, ext_area, int_area / ext_area)) 126 | values = geodata.iloc[ii].to_list() 127 | for v in [N0, W0, None]: 128 | values.append(v) 129 | saved_data_info.append(values) 130 | continue 131 | except: 132 | continue 133 | 134 | # labels = np.zeros((sample_size, sample_size), dtype=np.float32) 135 | # ids = labels.copy() 136 | ratios = np.zeros((sample_size, sample_size), dtype=np.float32) 137 | alpha = ratios.copy() 138 | # global_alpha = ratios.copy() 139 | global_beta = ratios.copy() 140 | # local_alpha = ratios.copy() 141 | local_beta = ratios.copy() 142 | 143 | row0 = int(np.floor((1 - pymax / (sample_size * res)) * sample_size)) 144 | row1 = int(np.ceil((1 - pymin / (sample_size * res)) * sample_size)) 145 | col0 = int(np.floor(pxmin / (sample_size * res) * sample_size)) 146 | col1 = int(np.ceil(pxmax / (sample_size * res) * sample_size)) # + 1 147 | # row0 = int((1 - pr[:, 1].max() / dy) * AOI_labels.shape[0]) 148 | # row1 = int((1 - pr[:, 1].min() / dy) * AOI_labels.shape[0]) 149 | # col0 = int(pr[:, 0].min() / dx * AOI_labels.shape[1]) 150 | # col1 = int(pr[:, 0].max() / dx * AOI_labels.shape[1]) + 1 151 | 152 | # H, W = sample_size, sample_size 153 | Height, Width = row1 - row0, col1 - col0 154 | 155 | # if (Height < 5) or (Width) 156 | # bl = False 157 | 158 | for i in range(Height): 159 | # if bl: 160 | # break 161 | for j in range(Width): 162 | # i, j = 0, 3 163 | if (row0 + i) * (col0 + j) < 0: 164 | continue 165 | 166 | try: 167 | 168 | pix_points = [[pxmin + loc[0] * res, pymax - loc[1] * res] for loc in 169 | [[j, i], [j + 1, i], [j + 1, i + 1], [j, i + 1], [j, i]]] 170 | 171 | pix_poly = geometry.Polygon(pix_points) 172 | 173 | value = parcel_poly.intersection(pix_poly).area / res ** 2 174 | if (0 < value) and (value < 1): # parcel cuts through pixel 175 | # print(i, j) 176 | # bl = True 177 | global_points = np.array(parcel_poly.boundary.intersection(pix_poly.boundary)) 178 | if global_points.shape[0] > 2: # !!! 
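# str_line_eq expects exactly two points, so keep only the first two parcel-boundary/pixel-boundary intersection points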
179 | global_points = global_points[:2] 180 | global_params = str_line_eq(global_points) 181 | alpha[row0 + i + 1, col0 + j + 1] = global_params[0] 182 | # global_alpha[row0 + i + 1, col0 + j + 1] = global_params[0] 183 | global_beta[row0 + i + 1, col0 + j + 1] = global_params[1] / (sample_size * res) 184 | local_points = (global_points - np.array([pxmin + j * res, pymax - i * res])) / res 185 | local_params = str_line_eq(local_points) 186 | # local_alpha[row0 + i + 1, col0 + j + 1] = local_params[0] 187 | local_beta[row0 + i + 1, col0 + j + 1] = local_params[1] 188 | 189 | # break 190 | 191 | if value == 0: # no intersection 192 | continue 193 | 194 | # labels[row0 + i + 1, col0 + j + 1] = label 195 | ratios[row0 + i + 1, col0 + j + 1] = value 196 | # ratios[col0 + i + 1, row0 + j + 1] = value 197 | 198 | # ids[row0 + i + 1, col0 + j + 1] = id 199 | 200 | except: 201 | continue 202 | # replace global, local alpha with alpha 203 | sample = {'N': N0, 'W': W0, 'boundary': pr / res, 'label': label, 'id': id, 'ratios': ratios, 204 | 'alpha': alpha, 'global_beta': global_beta, 'local_beta': local_beta} 205 | impath = os.path.join(year_savedir, 'N%d_E%d_ground_truths.pickle' % (N0, W0)) 206 | with open(impath, 'wb') as handle: 207 | pickle.dump(sample, handle, protocol=pickle.HIGHEST_PROTOCOL) 208 | 209 | values = geodata.iloc[ii].to_list() 210 | for v in [N0, W0, impath]: 211 | values.append(v) 212 | saved_data_info.append(values) 213 | 214 | return saved_data_info 215 | 216 | 217 | # lines = [] 218 | # boundary = parcel_poly.boundary 219 | # if boundary.type == 'MultiLineString': 220 | # for line in boundary: 221 | # lines.append(line) 222 | # else: 223 | # lines.append(boundary) 224 | # 225 | # ######################################################################## 226 | # 227 | # 228 | # 229 | # points = pix_poly.boundary.intersection(parcel_poly.boundary) # multipoint 230 | # points = np.array(points) 231 | # 232 | # 233 | # print(points) 234 | # plt.figure() 235 | # plt.imshow(ratios) 236 | # plt.scatter((anchor[0]-W0) / res, (N0 - anchor[1])/res) 237 | 238 | 239 | # plt.figure() 240 | # plt.hist(global_alpha[global_alpha != 0], 20) 241 | # 242 | # plt.figure() 243 | # plt.hist(alpha[alpha != 0], 20) 244 | # 245 | # plt.figure() 246 | # plt.hist(np.tanh(global_beta[global_beta != 0])) 247 | # 248 | # plt.figure() 249 | # plt.hist(np.tanh(local_beta[local_beta != 0])) 250 | # 251 | # plt.figure() 252 | # plt.hist(np.tanh(global_alpha[global_alpha != 0]), 20) 253 | # 254 | # plt.figure() 255 | # plt.hist(np.tanh(global_beta[global_beta > 0] / 1000.)) 256 | # 257 | # plt.figure() 258 | # plt.imshow(alpha) # , ::-1]) 259 | # plt.title('alpha') 260 | # plt.colorbar() 261 | # pr1 = pr / res # - np.array([2.5, 8]) 262 | # pr1[:, 1] = 100 - pr1[:, 1] 263 | # pr1 = pr1 264 | # plot_poly(pr1, newfig=False) 265 | # 266 | # plt.figure() 267 | # plt.imshow(global_beta) # , ::-1]) 268 | # plt.title('global_beta') 269 | # plt.colorbar() 270 | # plot_poly(pr1, newfig=False) 271 | # 272 | # 273 | # def dot(x1, x2): 274 | # return x1.dot(x2) 275 | # 276 | # 277 | # def norm_dot(x1, x2): 278 | # x1 = x1 / np.linalg.norm(x1) 279 | # x2 = x2 / np.linalg.norm(x2) 280 | # return x1.dot(x2) 281 | # 282 | # 283 | # l1 = np.array([-100, 300]) 284 | # l2 = np.array([100, -300]) 285 | # l3 = np.array([20, -5]) 286 | # 287 | # dot(l1, l2) 288 | # dot(l2, l3) 289 | # 290 | # norm_dot(l1, l2) 291 | # norm_dot(l2, l3) 292 | 293 | 294 | 295 | 296 | # plot_poly(pix_points, newfig=False) 297 | 298 | # x = np.linspace(0, 
100, 100) 299 | # y = -0.407134 * x + (100 - 103.457/10) # 103.457 300 | # 301 | # plt.plot(x, y) 302 | 303 | # # plt.figure() 304 | # plt.plot(*geometry.Polygon(pr1).exterior.xy) 305 | # # for i in range(pr1.shape[0] - 1): 306 | # # plt.plot(pr1[i:i + 2, 0], pr1[i:i + 2, 1], c='r') 307 | # plt.scatter(1, 1) 308 | 309 | # return AOI_labels, AOI_ids, AOI_masks, AOI_ratios, pd.DataFrame(invalid_shapes) 310 | 311 | 312 | def main(): 313 | # ground truth data 314 | gt_df = pd.read_csv(ground_truths_file) 315 | if 'id' not in gt_df: 316 | print('Column "id" not included. Assigning values from 1 to file size') 317 | gt_df['id'] = range(1, gt_df.shape[0]+1) 318 | # gt_df['id'] = range(1, gt_df.shape[0]+1) 319 | assert (gt_df['crs'] == gt_df['crs'].iloc[0]).all(), \ 320 | "Polygons corresponding to multiple CRS were found in %s" % ground_truths_file 321 | crs = gt_df['crs'].iloc[0] 322 | yearly_grouped_gt = gt_df.groupby('year') 323 | years = list(yearly_grouped_gt.groups.keys()) 324 | print("found ground truth data for years %s" % ", ".join([str(i) for i in years])) 325 | if 0 in gt_df['ground_truth'].drop_duplicates(): 326 | gt_df['ground_truth'] += 1 327 | 328 | # sentinel products 329 | imdirs = glob("%s/*.SAFE/GRANULE/**/IMG_DATA" % products_dir) 330 | prod_df = get_S2prod_info(imdirs) 331 | assert (prod_df['West']==prod_df['West'].iloc[0]).all() and (prod_df['North']==prod_df['North'].iloc[0]).all(),\ 332 | "Sentinel products corresponding to multiple tiles were found in %s" % products_dir 333 | geotr = GeoTransform(intr=prod_df['crs'].iloc[0].split(':')[1], outtr=gt_df['crs'].iloc[0], loc2loc=gt_df['crs'].iloc[0] != '4326') 334 | prod_WN = prod_df[['West', 'North']].iloc[0].tolist() 335 | prod_WN = geotr(prod_WN[0], prod_WN[1]) # in ground truth data coordinate system 336 | d = (10 * prod_df[['height', 'width']].iloc[0].values).tolist() 337 | 338 | # find all ground truth data that fall inside sentinel product 339 | prod_poly = geometry.Polygon([[prod_WN[0] + loc[0] * d[0], prod_WN[1] - loc[1] * d[1]] for loc in 340 | [[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]]]) 341 | print(prod_poly) 342 | def f(x): 343 | try: 344 | x = get_points_from_str_poly(x) 345 | W = x[:, 0].min() 346 | E = x[:, 0].max() 347 | S = x[:, 1].min() 348 | N = x[:, 1].max() 349 | x = geometry.Polygon(x) 350 | inratio = prod_poly.intersection(x).area / x.area 351 | return np.array([N, S, W, E, inratio]) 352 | except: 353 | return np.array([0, 0, 0, 0, 0]) 354 | 355 | gt_df[['N', 'S', 'W', 'E', 'inratio']] = np.stack(gt_df['geometry'].apply(f).values) 356 | gt_df = gt_df[gt_df['inratio'] == 1.0] 357 | print("found %d polygons inside sentinel tile" % gt_df.shape[0]) 358 | 359 | N = int(np.ceil(gt_df['N'].max())) # N-maxy 360 | # S = int(np.floor(gt_df['S'].min())) # S-miny 361 | # E = int(np.ceil(gt_df['E'].max())) # E-maxx 362 | W = int(np.floor(gt_df['W'].min())) # W-minx 363 | 364 | # # increase AOI dimensions to match integer multiple of sample size 365 | # if np.ceil((maxy - miny) / (sample_size * res)) != (maxy - miny) / (sample_size * res): 366 | # dy = (np.ceil((maxy - miny) / (sample_size * res)) - (maxy - miny) / (sample_size * res)) * (sample_size * res) 367 | # miny = miny - dy 368 | # if np.ceil((maxx - minx) / (sample_size * res)) != (maxx - minx) / (sample_size * res): 369 | # dx = (np.ceil((maxx - minx) / (sample_size * res)) - (maxx - minx) / (sample_size * res)) * (sample_size * res) 370 | # maxx = maxx + dx 371 | # dx = maxx - minx 372 | # dy = maxy - miny 373 | # anchor = minx, miny # WS 374 | 375 | pool = 
Pool(num_processes) 376 | 377 | for year in years: 378 | # year = years[0] 379 | 380 | geodata = gt_df[gt_df['year'] == year].reset_index(drop=True) 381 | 382 | inputs = [[i, df_, W, N, prod_WN[0], prod_WN[1], year, crs] for i, df_ in enumerate(split_df(geodata, num_processes))] 383 | 384 | outputs = pool.map(extract_parcel_labels_raster, inputs) 385 | 386 | saved_data_info = pd.concat(pd.DataFrame(out) for out in outputs) 387 | save_name = os.path.join(savedir, 'Y%s_N%s_W%s_R%d_CRS%s' % (year, N, W, res, crs), 'saved_data_info.csv') 388 | saved_data_info.columns = ['id', 'ground_truth', 'crs', 'year', 'geometry', 'Np', 'Sp', 'Wp', 'Ep', 389 | 'inratio', 'Dy', 'Dx', 'D', 'Ntl', 'Wtl', 'filepath'] 390 | saved_data_info.to_csv(save_name, index=False) 391 | 392 | # d = pd.read_csv(save_name) 393 | # d['filepath'] = d['filepath'].apply(lambda s: os.path.join('/'.join(s.split('/')[:-1]), 394 | # 'Y2018_N6650384_W799943_R10_CRS2154', 395 | # s.split('/')[-1])) 396 | # AOI_labels = np.stack([out_[0] for out_ in outputs]) 397 | # AOI_ids = np.stack([out_[1] for out_ in outputs]) 398 | # AOI_masks = np.stack([out_[2] for out_ in outputs]) 399 | # AOI_ratios = np.stack([out_[3] for out_ in outputs]) 400 | # invalid_shapes = pd.concat([out_[4] for out_ in outputs]) 401 | # 402 | # labels = AOI_labels.max(axis=0) 403 | # masks = AOI_masks.max(axis=0) 404 | # ids = AOI_ids.sum(axis=0) 405 | # 406 | # locs = np.stack(np.where((AOI_labels > 0).sum(axis=0) > 1)).T 407 | # 408 | # for i, loc in enumerate(locs): 409 | # 410 | # if i % 1000 == 0: 411 | # print("correcting inter process overlaps, step %d of %d" % (i, locs.shape[0])) 412 | # 413 | # if any(AOI_ratios[:, loc[0], loc[1]] == 1.0): 414 | # masks[loc[0], loc[1]] = 2 415 | # else: 416 | # masks[loc[0], loc[1]] = 1 417 | # 418 | # idx = np.argmax(AOI_ratios[:, loc[0], loc[1]]) 419 | # labels[loc[0], loc[1]] = AOI_labels[idx, loc[0], loc[1]] 420 | # ids[loc[0], loc[1]] = AOI_ids[idx, loc[0], loc[1]] 421 | # 422 | # np.savetxt("%s/LABELS_Y%s_N%s_W%s_R%d_CRS%s.csv" % 423 | # (savedir, str(year), str(maxy), str(minx), res, str(crs)), labels) 424 | # np.savetxt("%s/IDS_Y%s_N%s_W%s_R%d_CRS%s.csv" % 425 | # (savedir, str(year), str(maxy), str(int(maxx)), res, str(crs)), ids) 426 | # np.savetxt("%s/MASKS_Y%s_N%s_W%s_R%d_CRS%s.csv" % 427 | # (savedir, str(year), str(maxy), str(int(maxx)), res, str(crs)), masks) 428 | # if invalid_shapes.shape[0] != 0: 429 | # invalid_shapes.to_csv( 430 | # "%s/INVALID_Y%s_N%s_W%s_R%d_CRS%s.csv" % 431 | # (savedir, str(year), str(maxy), str(int(maxx)), res, str(crs)), index=False) 432 | 433 | 434 | if __name__ == "__main__": 435 | 436 | # parser = argparse.ArgumentParser(description='Make raster from shapely polygons') 437 | # parser.add_argument('--ground_truths_file', help='filename containing ground truth parcel polygons') 438 | # parser.add_argument('--products_dir', help='directory containing sentinel products') 439 | # parser.add_argument('--savedir', help='save directory to extract ground truths in raster mode') 440 | # parser.add_argument('--res', default=10, help='pixel size in meters') 441 | # parser.add_argument('--sample_size', default=24, help='spatial resolution of dataset samples') 442 | # parser.add_argument('--num_processes', default=4, help='number of parallel processes') 443 | # 444 | # args = parser.parse_args() 445 | # 446 | # ground_truths_file = args.ground_truths_file 447 | # 448 | # products_dir = args.products_dir 449 | # 450 | # savedir = args.savedir 451 | # print("savedir: ", savedir) 452 | # if 
not os.path.exists(savedir): 453 | # os.makedirs(savedir) 454 | # 455 | # res = int(args.res) 456 | # 457 | # sample_size = int(args.sample_size) 458 | # 459 | # num_processes = int(args.num_processes) 460 | # 461 | # 462 | # main() 463 | 464 | ground_truths_file = '/media/michaeltrs/sdb/HD2/Data/Satellite_Imagery/RPG/T31FM_18/gt_df_parcels_in_AOI.csv' 465 | products_dir = '/media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/PSETAE_repl/2018/cloud_0_30' 466 | savedir = '/media/michaeltrs/sdb/HD2/Data/Satellite_Imagery/RPG/T31FM_18/LABELS4' 467 | res = 10 468 | sample_size = 100 # 64 469 | num_processes = 4 470 | 471 | # if not os.path.exists(savedir): 472 | # os.makedirs(savedir) 473 | # 474 | # main() 475 | -------------------------------------------------------------------------------- /dataset/labelled_dense/SS/make_image_timeseries_for_parcel_labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | For a set of extracted image crops and a labelled_dense label map, make a timeseries of all positions matched with labels 3 | """ 4 | import argparse 5 | import pandas as pd 6 | import numpy as np 7 | import os 8 | import shutil 9 | import pickle 10 | if __name__ == "__main__" and __package__ is None: 11 | from sys import path 12 | from os.path import dirname as dir 13 | path.insert(0, dir(dir(path[0]))) 14 | __package__ = "examples" 15 | from utils.data_utils import find_number 16 | from utils.date_utils import get_doy 17 | from utils.multiprocessing_utils import run_pool 18 | 19 | 20 | mult = {'B01': 1/6., 'B02': 1., 'B03': 1., 'B04': 1., 'B05': 1./2., 'B06': 1./2., 'B07': 1./2., 'B08': 1., 'B8A': 1./2, 21 | 'B09': 1./6., 'B10': 1./6., 'B11': 1./2., 'B12': 1./2.} 22 | 23 | 24 | def match_labels_images(yearlocs): 25 | 26 | refband = bands[1] 27 | 28 | saved_files_info = [] 29 | for jj, yearloc in enumerate(yearlocs): 30 | 31 | if jj % 1000 == 0: 32 | print("%d of %d" % (jj, len(yearlocs))) 33 | # yearloc = yearlocs[20000] 34 | try: 35 | 36 | idx = yearloc_groups[yearloc] 37 | data = iminfo.iloc[idx, :].sort_values(by='DOY').copy() 38 | data = data.drop_duplicates(subset=['DOY'], keep='first') # some products downloaded twice 39 | 40 | Y = data['Year'].iloc[0] 41 | N = data['Nl'].iloc[0] 42 | W = data['Wl'].iloc[0] 43 | # il = data['il'].iloc[0] 44 | # jl = data['jl'].iloc[0] 45 | 46 | assert all(data['Year'] == Y) 47 | assert all(data['Nl'] == N) 48 | assert all(data['Wl'] == W) 49 | # assert all(data['il'] == il) 50 | # assert all(data['jl'] == jl) 51 | 52 | # timeseries_sample = {'B01': [], 'B02': [], 'B03': [], 'B04': [], 'B05': [], 'B06': [], 'B07': [], 53 | # 'B08': [], 'B8A': [], 'B09': [], 'B10': [], 'B11': [], 'B12': [], 'doy': []} 54 | timeseries_sample = {band: [] for band in bands} 55 | timeseries_sample['doy'] = [] 56 | for sample_info in data[['sample_path', 'DOY']].values: 57 | # sample_info = data[['sample_path', 'DOY']].values[0] 58 | impath, doy = sample_info 59 | 60 | with open(impath, 'rb') as handle: 61 | sample = pickle.load(handle, encoding='latin1') 62 | 63 | # image falls in black region for this product (should have been excluded in extract_images_for_parcel_labels.py) 64 | if sample[refband].sum() == 0: 65 | # print('zero sum') 66 | continue 67 | 68 | # image does not match required size (should have been excluded in extract_images_for_parcel_labels.py) 69 | height, width = sample[refband].shape 70 | if (height != sample_size) or (width != sample_size): 71 | # print('unequal size') 72 | 
continue 73 | 74 | # for key in ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B10', 'B11', 'B12']: 75 | for key in bands: 76 | timeseries_sample[key].append(sample[key]) 77 | timeseries_sample['doy'].append(np.array(doy)) 78 | 79 | # for key in ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B10', 'B11', 'B12', 'doy']: 80 | for key in bands: 81 | timeseries_sample[key] = np.stack(timeseries_sample[key]) 82 | timeseries_sample['doy'] = np.stack(timeseries_sample['doy']) 83 | timeseries_sample['year'] = np.array(Y).astype(np.int32) 84 | 85 | timesteps = timeseries_sample[refband].shape[0] 86 | 87 | gt = saved_gt_info[(saved_gt_info['Ntl'] == yearloc[0]) & (saved_gt_info['Wtl'] == yearloc[1])] 88 | with open(gt['filepath'].values[0], 'rb') as handle: 89 | labels = pickle.load(handle, encoding='latin1') 90 | for ltype in labels.keys(): 91 | timeseries_sample[ltype.lower()] = labels[ltype] 92 | 93 | savename = os.path.join(year_savedir, "%d_%d_%s.pickle" % (int(N), int(W), Y)) 94 | with open(savename, 'wb') as handle: 95 | pickle.dump(timeseries_sample, handle, protocol=pickle.HIGHEST_PROTOCOL) 96 | 97 | saved_files_info.append([savename, Y, N, W, sample_size, sample_size, timesteps, "completed"]) 98 | 99 | except: 100 | 101 | saved_files_info.append(["", Y, N, W, sample_size, sample_size, 0, "failed"]) 102 | 103 | saved_files_info = pd.DataFrame(data=saved_files_info, columns=['sample_path', 'Year', 'N', 'W', 'dy', 'dx', 'dt', 104 | 'status']) 105 | return saved_files_info 106 | 107 | 108 | def main(): 109 | 110 | global yearloc_groups 111 | global iminfo 112 | global labels 113 | global year_savedir 114 | global saved_gt_info 115 | 116 | iminfo = pd.read_csv(os.path.join(windows_dir, "extracted_windows_data_info.csv")) 117 | crs = iminfo['crs'].iloc[0] 118 | 119 | # remove non extracted locations 120 | iminfo = iminfo[~pd.isnull(iminfo['sample_path'])].reset_index(drop=True) 121 | iminfo['DOY'] = iminfo['Date'].apply(lambda s: get_doy(str(s))) 122 | iminfo['Year'] = iminfo['Date'].apply(lambda s: str(s)[:4]) 123 | 124 | # ground truths 125 | # gtinfo = pd.read_csv(os.path.join(windows_dir, "extracted_windows_data_info.csv")) 126 | # 127 | # gtfiles = os.listdir(ground_truths_dir) 128 | # years = [find_number(s, "Y") for s in gtfiles] 129 | # files = {year: {} for year in set(years)} 130 | # for i, file in enumerate(gtfiles): 131 | # if not file.startswith('INVALID'): 132 | # files[years[i]][file.split("_")[0]] = file 133 | # print("found ground truths in raster for years %s" % ", ".join(list(files.keys()))) 134 | gtfiles = os.listdir(ground_truths_dir) 135 | 136 | saved_files_info = [] 137 | 138 | for gtfile in gtfiles: 139 | # gtfile = gtfiles[0] 140 | 141 | saved_gt_info = pd.read_csv(os.path.join(ground_truths_dir, gtfile, 'saved_data_info.csv')) 142 | 143 | year = find_number(gtfile, "Y") 144 | CRSl = find_number(gtfile, "CRS") 145 | 146 | year_savedir = os.path.join(savedir, year) 147 | if not os.path.isdir(year_savedir): 148 | os.makedirs(year_savedir) 149 | 150 | yearloc_groups = iminfo[iminfo['Year'] == year].groupby(['Nl', 'Wl'], as_index=False).groups 151 | yearlocs = list(yearloc_groups.keys()) 152 | 153 | df = run_pool(yearlocs, match_labels_images, num_processes) 154 | df = pd.concat(df) 155 | 156 | saved_files_info.append(df) 157 | 158 | 159 | df = pd.concat(saved_files_info).reset_index(drop=True) 160 | df['crs'] = crs 161 | df.to_csv(os.path.join(savedir, "saved_timeseries_data_info.csv"), index=False) 162 | 163 | # delete 
windows dir 164 | # shutil.rmtree(windows_dir) 165 | 166 | 167 | if __name__ == "__main__": 168 | 169 | # parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 170 | # parser.add_argument('--ground_truths_dir', help='directory containing ground truth parcels raster') 171 | # parser.add_argument('--products_dir', help='directory containing downloaded sentinel products') 172 | # parser.add_argument('--windows_dir', help='directory containing extracted windows from sentinel products') 173 | # parser.add_argument('--savedir', help='save directory for image timeseries with labels') 174 | # parser.add_argument('--bands', default=None, help='which satellite image bands to use') 175 | # parser.add_argument('--res', default=10, help='pixel size in meters') 176 | # parser.add_argument('--sample_size', default=24, help='spatial resolution of dataset samples') 177 | # parser.add_argument('--num_processes', default=4, help='number of parallel processes') 178 | # # --------------------------------------------------------------------------------------------- 179 | # 180 | # args = parser.parse_args() 181 | # 182 | # ground_truths_dir = args.ground_truths_dir 183 | # 184 | # products_dir = args.products_dir 185 | # 186 | # windows_dir = args.windows_dir 187 | # 188 | # savedir = args.savedir 189 | # if not os.path.exists(savedir): 190 | # os.makedirs(savedir) 191 | # 192 | # res = int(args.res) 193 | # 194 | # sample_size = int(args.sample_size) 195 | # 196 | # num_processes = int(args.num_processes) 197 | # 198 | # bands = args.bands 199 | # 200 | # if bands == 'None': 201 | # bands = list(mult.keys()) 202 | # else: 203 | # bands = bands.split(',') 204 | # 205 | # main() 206 | 207 | 208 | ground_truths_dir = '/media/michaeltrs/sdb/HD2/Data/Satellite_Imagery/RPG/T31FM_18/LABELS4' 209 | products_dir = '/media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/PSETAE_repl/2018/cloud_0_30' 210 | windows_dir = '/media/michaeltrs/sdb/HD2/Data/Satellite_Imagery/RPG/T31FM_18/IMAGES' 211 | savedir = '/media/michaeltrs/sdb/HD2/Data/Satellite_Imagery/RPG/T31FM_18/TIMESERIES2' 212 | if not os.path.exists(savedir): 213 | os.makedirs(savedir) 214 | 215 | res = 10 216 | sample_size = 100 217 | num_processes = 4 218 | bands = 'None' 219 | 220 | 221 | if bands == 'None': 222 | bands = list(mult.keys()) 223 | else: 224 | bands = bands.split(',') 225 | 226 | # main() 227 | -------------------------------------------------------------------------------- /dataset/labelled_dense/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaeltrs/DeepSatData/c2774597dc57af46f777ba2212fe026305d2fd1e/dataset/labelled_dense/__init__.py -------------------------------------------------------------------------------- /dataset/labelled_dense/extract_images_for_labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | Given a set of S2 tiles and a labelled_dense lable map, extract crops of images matching the location of labels 3 | """ 4 | import argparse 5 | import pandas as pd 6 | import rasterio 7 | import numpy as np 8 | import os 9 | from glob import glob 10 | import pickle 11 | if __name__ == "__main__" and __package__ is None: 12 | from sys import path 13 | from os.path import dirname as dir 14 | path.insert(0, dir(dir(path[0]))) 15 | __package__ = "examples" 16 | from utils.data_utils import find_number 17 | from utils.geospatial_data_utils import GeoTransform 18 | from 
utils.multiprocessing_utils import run_pool 19 | from utils.sentinel_products_utils import get_S2prod_info 20 | 21 | 22 | mult = {'B01': 1/6., 'B02': 1., 'B03': 1., 'B04': 1., 'B05': 1./2., 'B06': 1./2., 'B07': 1./2., 'B08': 1., 'B8A': 1./2, 23 | 'B09': 1./6., 'B10': 1./6., 'B11': 1./2., 'B12': 1./2.} 24 | 25 | 26 | def extract_images(imdirs): 27 | 28 | jp2s = ["%s.jp2" % i for i in bands] 29 | 30 | saved_files_info = [] 31 | 32 | for ii, imdir in enumerate(imdirs): 33 | # ii, imdir = 0, imdirs[0] 34 | 35 | print("unfolding product %d of %d" % (ii, len(imdirs))) 36 | 37 | imname = "%s_%s" % (imdir.split("/")[-2].split("_")[-3], imdir.split("/")[-4].split("_")[-5]) 38 | 39 | # read product 40 | data = {} 41 | for jp2 in jp2s: 42 | with rasterio.open("%s/%s_%s" % (imdir, imname, jp2)) as f: 43 | data[jp2[:-4]] = f.read(1) 44 | 45 | geotransform_label2prod = GeoTransform(CRSl, str(f.crs).split(':')[1], loc2loc=CRSl != '4326') 46 | Wp, Np = np.array(f.transform)[2], np.array(f.transform)[5] 47 | 48 | prod_savedir = os.path.join(savedir, imdir.split("/")[-4].split(".")[0]) 49 | if not os.path.exists(prod_savedir): 50 | os.makedirs(prod_savedir) 51 | 52 | for i in range(int(num_rows)): 53 | 54 | for j in range(int(num_cols)): 55 | # i, j = 2, 0 56 | 57 | if i * num_cols + j == 1000: 58 | print("row %d of %d, column %d of %d" % (i, num_rows, j, num_cols)) 59 | 60 | # Nij = Nl - i * res * sample_size # N for extracted label window 61 | # Wij = Wl + j * res * sample_size # W for extracted label window 62 | # ip = (Np - Nij) / (res * sample_size) # product row 63 | # jp = (Wij - Wp) / (res * sample_size) # product column 64 | # Nl, Wl = geotransform_label2prod(Wl, Nl) 65 | Nij = Nl - i * 10 * sample_size # N for extracted label window 66 | Wij = Wl + j * 10 * sample_size # W for extracted label window 67 | Wij, Nij = geotransform_label2prod(Wij, Nij) 68 | ip = (Np - Nij) / (10 * sample_size) # product row 69 | jp = (Wij - Wp) / (10 * sample_size) # product column 70 | 71 | # exception: image id falls outside sentinel product 72 | if (ip < 0) or (jp < 0): 73 | saved_files_info.append( 74 | [None, Nij, Wij, Nl, Wl, Np, Wp, i, j, ip, jp, sample_size, sample_size, date, imdir, 75 | "sample outside Sentinel product"]) 76 | continue 77 | 78 | date = imdir.split("/")[-4].split(".")[0].split("_")[2][:8] 79 | 80 | # exception: no labels for this location 81 | if labels[i * label_mult * sample_size: (i + 1) * label_mult * sample_size, j * label_mult * sample_size: (j + 1) * label_mult * sample_size].sum() == 0: 82 | saved_files_info.append( 83 | [None, Nij, Wij, Nl, Wl, Np, Wp, i, j, ip, jp, sample_size, sample_size, date, imdir, "no labels"]) 84 | continue 85 | 86 | # load image data 87 | sample = {} 88 | for jp2 in jp2s: 89 | xpmin = int(np.round(mult[jp2[:-4]] * ip * sample_size)) 90 | ypmin = int(np.round(mult[jp2[:-4]] * jp * sample_size)) 91 | sample[jp2[:-4]] = data[jp2[:-4]][xpmin: xpmin + int(mult[jp2[:-4]] * sample_size), 92 | ypmin: ypmin + int(mult[jp2[:-4]] * sample_size)] 93 | 94 | # assert all images are square, intended to catch images at the edge of a product 95 | if any([sample[k].shape[0] != sample[k].shape[1] for k in sample.keys()]): 96 | continue 97 | 98 | # exception: image is all zero for this location 99 | if sample[jp2[:-4]].sum() == 0: 100 | saved_files_info.append( 101 | [None, Nij, Wij, Nl, Wl, Np, Wp, i, j, ip, jp, sample_size, sample_size, date, imdir, "no image"]) 102 | continue 103 | 104 | # none of the above exceptions apply, save image data 105 | sample_save_path = 
"%s/N%d_W%d_D%s.pickle" % (prod_savedir, int(Nij), int(Wij), date) 106 | with open(sample_save_path, 'wb') as handle: 107 | pickle.dump(sample, handle, protocol=pickle.HIGHEST_PROTOCOL) 108 | 109 | saved_files_info.append( 110 | [sample_save_path, Nij, Wij, Nl, Wl, Np, Wp, i, j, ip, jp, sample_size, sample_size, date, imdir, "ok"]) 111 | 112 | df = pd.DataFrame(data=saved_files_info, 113 | columns=['sample_path', 'Nij', 'Wij', 'Nl', 'Wl', 'Np', 'Wp', 'il', 'jl', 'ip', 'jp', 114 | 'height', 'width', 'Date', 'S2_prod_imdir', "comment"]) 115 | return df 116 | 117 | 118 | def main(): 119 | global labels 120 | global Nl 121 | global Wl 122 | global CRSl 123 | global num_rows 124 | global num_cols 125 | global label_mult 126 | 127 | # read all extracted ground truths files 128 | gtfiles = os.listdir(ground_truths_dir) 129 | years = [find_number(s, "Y") for s in gtfiles] 130 | files = {year: {} for year in set(years)} 131 | for i, file in enumerate(gtfiles): 132 | if not file.startswith('INVALID'): 133 | files[years[i]][file.split("_")[0]] = file 134 | print("found ground truths in raster for years %s" % ", ".join(list(files.keys()))) 135 | 136 | label_mult = int(10. / res) 137 | 138 | # get information on saved sentinel products 139 | imdirs = glob("%s/*.SAFE/GRANULE/**/IMG_DATA" % products_dir) 140 | prod_df = get_S2prod_info(imdirs) 141 | prod_df['Year'] = prod_df['Time'].apply(lambda s: s[:4]) 142 | 143 | out = [] 144 | for year in set(years): 145 | # year = years[0] 146 | # ground truths 147 | labels = np.loadtxt(os.path.join(ground_truths_dir, files[year]['LABELS']), dtype=np.float32) 148 | Nl = int(find_number(files[year]['LABELS'], "N")) 149 | Wl = int(find_number(files[year]['LABELS'], "W")) 150 | 151 | num_rows, num_cols = [d / (10 / res * sample_size) for d in labels.shape] 152 | assert (np.ceil(num_rows) == num_rows) and (np.ceil(num_cols) == num_cols), \ 153 | "sample size should be fitting exactly in labels, this suggests an error in extract_labels_raster script" 154 | CRSl = find_number(files[year]['LABELS'], "CRS") 155 | 156 | # sentinel products 157 | products = prod_df[prod_df['Year'] == year] 158 | imdirs = products['path'].tolist() 159 | 160 | df_year = run_pool(imdirs, extract_images, num_processes) 161 | # df = extract_images([imdirs[0]]) 162 | out.append(pd.concat(df_year)) 163 | 164 | 165 | df = pd.concat(out).reset_index(drop=True) 166 | df['crs'] = CRSl 167 | df.to_csv(os.path.join(savedir, "extracted_windows_data_info.csv"), index=False) 168 | 169 | 170 | 171 | if __name__ == "__main__": 172 | 173 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 174 | parser.add_argument('--ground_truths_dir', help='directory containing ground truth parcels raster') 175 | parser.add_argument('--products_dir', help='directory containing downloaded sentinel products') 176 | parser.add_argument('--savedir', help='save directory to extract sentinel products windows') 177 | parser.add_argument('--bands', default=None, help='which satellite image bands to use') 178 | parser.add_argument('--res', default=10, help='pixel size in meters') 179 | parser.add_argument('--sample_size', default=24, help='spatial resolution of dataset samples') 180 | parser.add_argument('--num_processes', default=4, help='number of parallel processes') 181 | # --------------------------------------------------------------------------------------------- 182 | 183 | args = parser.parse_args() 184 | 185 | ground_truths_dir = args.ground_truths_dir 186 | 187 | products_dir = args.products_dir 188 
| 189 | savedir = args.savedir 190 | print("savedir: ", savedir) 191 | if not os.path.exists(savedir): 192 | os.makedirs(savedir) 193 | 194 | bands = args.bands 195 | if bands == 'None': 196 | bands = list(mult.keys()) 197 | else: 198 | bands = bands.split(',') 199 | 200 | res = float(args.res) 201 | assert np.ceil(10. / res) == 10. / res, "Label pixel size should divide min satellite pixel size (10m), but %.1f was selected" % res 202 | 203 | sample_size = int(args.sample_size) 204 | 205 | num_processes = int(args.num_processes) 206 | 207 | main() 208 | 209 | -------------------------------------------------------------------------------- /dataset/labelled_dense/extract_images_for_parcel_labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | Given a set of S2 tiles and a labelled_dense lable map, extract crops of images matching the location of labels 3 | """ 4 | import argparse 5 | import pandas as pd 6 | import rasterio 7 | import numpy as np 8 | import os 9 | from glob import glob 10 | import pickle 11 | if __name__ == "__main__" and __package__ is None: 12 | from sys import path 13 | from os.path import dirname as dir 14 | path.insert(0, dir(dir(path[0]))) 15 | __package__ = "examples" 16 | from utils.data_utils import find_number 17 | from utils.geospatial_data_utils import GeoTransform 18 | from utils.multiprocessing_utils import run_pool 19 | from utils.sentinel_products_utils import get_S2prod_info 20 | 21 | 22 | mult = {'B01': 1/6., 'B02': 1., 'B03': 1., 'B04': 1., 'B05': 1./2., 'B06': 1./2., 'B07': 1./2., 'B08': 1., 'B8A': 1./2, 23 | 'B09': 1./6., 'B10': 1./6., 'B11': 1./2., 'B12': 1./2.} 24 | 25 | 26 | def extract_images(imdirs): 27 | 28 | jp2s = ["%s.jp2" % i for i in bands] 29 | 30 | saved_files_info = [] 31 | 32 | for ii, imdir in enumerate(imdirs): 33 | 34 | print("unfolding product %d of %d" % (ii, len(imdirs))) 35 | 36 | imname = "%s_%s" % (imdir.split("/")[-2].split("_")[-3], imdir.split("/")[-4].split("_")[-5]) 37 | 38 | # read product 39 | data = {} 40 | for jp2 in jp2s: 41 | with rasterio.open("%s/%s_%s" % (imdir, imname, jp2)) as f: 42 | data[jp2[:-4]] = f.read(1) 43 | 44 | geotransform_label2prod = GeoTransform(CRSl, str(f.crs).split(':')[1], loc2loc=CRSl != '4326') 45 | Wp, Np = np.array(f.transform)[2], np.array(f.transform)[5] 46 | 47 | prod_savedir = os.path.join(savedir, imdir.split("/")[-4].split(".")[0]) 48 | if not os.path.exists(prod_savedir): 49 | os.makedirs(prod_savedir) 50 | 51 | for i in range(saved_gt_info.shape[0]): 52 | 53 | Nl = saved_gt_info.iloc[i]['Ntl'] 54 | Wl = saved_gt_info.iloc[i]['Wtl'] 55 | Wlp, Nlp = geotransform_label2prod(Wl, Nl) 56 | 57 | ip = int(np.round((Np - Nlp) / 10.)) 58 | jp = int(np.round((Wlp - Wp) / 10.)) 59 | 60 | date = imdir.split("/")[-4].split(".")[0].split("_")[2][:8] 61 | 62 | sample = {} 63 | for jp2 in jp2s: 64 | xpmin = int(np.round(mult[jp2[:-4]] * ip)) 65 | ypmin = int(np.round(mult[jp2[:-4]] * jp)) 66 | sample[jp2[:-4]] = data[jp2[:-4]][xpmin: xpmin + int(mult[jp2[:-4]] * sample_size), 67 | ypmin: ypmin + int(mult[jp2[:-4]] * sample_size)] 68 | 69 | # this parcel falls in black region for this product 70 | if sample[jp2[:-4]].sum() == 0: 71 | saved_files_info.append( 72 | ["", Nlp, Wlp, Nl, Wl, Np, Wp, ip, jp, sample_size, sample_size, date, imdir, "no image"]) 73 | continue 74 | 75 | sample_save_path = "%s/N%d_W%d_D%s_CRS%s.pickle" % (prod_savedir, int(Nl), int(Wl), date, CRSl) 76 | with open(sample_save_path, 'wb') as handle: 77 | pickle.dump(sample, handle, 
protocol=pickle.HIGHEST_PROTOCOL) 78 | 79 | saved_files_info.append( 80 | [sample_save_path, Nlp, Wlp, Nl, Wl, Np, Wp, ip, jp, sample_size, sample_size, date, imdir, "ok"]) 81 | 82 | df = pd.DataFrame(data=saved_files_info, 83 | columns=['sample_path', 'Nlp', 'Wlp', 'Nl', 'Wl', 'Np', 'Wp', 'ip', 'jp', 84 | 'height', 'width', 'Date', 'S2_prod_imdir', "comment"]) 85 | return df 86 | 87 | 88 | def main(): 89 | # ground truths 90 | gtdirs = [f for f in os.listdir(ground_truths_dir) if os.path.isdir(os.path.join(ground_truths_dir, f))] 91 | 92 | global CRSl 93 | global saved_gt_info 94 | 95 | # sentinel products 96 | imdirs = glob("%s/*.SAFE/GRANULE/**/IMG_DATA" % products_dir) 97 | prod_df = get_S2prod_info(imdirs) 98 | prod_df['Year'] = prod_df['Time'].apply(lambda s: s[:4]) 99 | 100 | out = [] 101 | for gtdir in gtdirs: 102 | 103 | # ground truths 104 | saved_gt_info = pd.read_csv(os.path.join(ground_truths_dir, gtdir, 'saved_data_info.csv')) 105 | 106 | year = find_number(gtdir, "Y") 107 | CRSl = find_number(gtdir, "CRS") 108 | 109 | # sentinel products 110 | products = prod_df[prod_df['Year'] == year] 111 | imdirs = products['path'].tolist() 112 | 113 | df_year = run_pool(imdirs, extract_images, num_processes) 114 | 115 | out.append(pd.concat(df_year)) 116 | 117 | df = pd.concat(out).reset_index(drop=True) 118 | df['crs'] = CRSl 119 | df.to_csv(os.path.join(savedir, "extracted_windows_data_info.csv"), index=False) 120 | 121 | 122 | if __name__ == "__main__": 123 | 124 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 125 | parser.add_argument('--ground_truths_dir', help='directory containing ground truth parcels raster') 126 | parser.add_argument('--products_dir', help='directory containing downloaded sentinel products') 127 | parser.add_argument('--savedir', help='save directory to extract sentinel products windows') 128 | parser.add_argument('--bands', default=None, help='which satellite image bands to use') 129 | parser.add_argument('--res', default=10, help='pixel size in meters') 130 | parser.add_argument('--sample_size', default=24, help='spatial resolution of dataset samples') 131 | parser.add_argument('--num_processes', default=4, help='number of parallel processes') 132 | # --------------------------------------------------------------------------------------------- 133 | 134 | args = parser.parse_args() 135 | 136 | ground_truths_dir = args.ground_truths_dir 137 | 138 | products_dir = args.products_dir 139 | 140 | savedir = args.savedir 141 | print("savedir: ", savedir) 142 | if not os.path.exists(savedir): 143 | os.makedirs(savedir) 144 | 145 | bands = args.bands 146 | if bands == 'None': 147 | bands = list(mult.keys()) 148 | else: 149 | bands = bands.split(',') 150 | 151 | # res = int(args.res) 152 | res = float(args.res) 153 | 154 | sample_size = int(args.sample_size) 155 | 156 | num_processes = int(args.num_processes) 157 | 158 | main() 159 | -------------------------------------------------------------------------------- /dataset/labelled_dense/extract_labels_raster.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import numpy as np 4 | from shapely import geometry 5 | import os 6 | from glob import glob 7 | from multiprocessing import Pool 8 | if __name__ == "__main__" and __package__ is None: 9 | from sys import path 10 | from os.path import dirname as dir 11 | path.insert(0, dir(dir(path[0]))) 12 | __package__ = "examples" 13 | from utils.geospatial_data_utils 
import GeoTransform, get_points_from_str_poly, closest_point_to_poly 14 | from utils.multiprocessing_utils import split_df 15 | from utils.sentinel_products_utils import get_S2prod_info 16 | 17 | 18 | def is_valid(parcel_poly, pxmin, pymax): 19 | """ 20 | checks if parcel_poly polygon has valid shape 21 | """ 22 | isvalid = True 23 | i = 0 24 | j = 0 25 | pix_points = [[pxmin + loc[0] * res, pymax - loc[1] * res] for loc in 26 | [[j, i], [j + 1, i], [j + 1, i + 1], [j, i + 1], [j, i]]] 27 | try: 28 | parcel_poly.intersection(geometry.Polygon(pix_points)).area 29 | except: 30 | isvalid = False 31 | return isvalid 32 | 33 | 34 | def extract_labels_raster(inputs): 35 | # inputs = inputs[0] 36 | rank, geodata, anchor, dx, dy = inputs 37 | 38 | # arrays to save 39 | AOI_labels = np.zeros((int(np.round(dy / res)), int(np.round(dx / res))), dtype=np.float32) # + max_label + 1 40 | AOI_ids = np.zeros((int(np.round(dy / res)), int(np.round(dx / res))), dtype=np.float32) 41 | AOI_masks = AOI_ids.copy() 42 | # additional/helper arrays 43 | AOI_ratios = AOI_ids.copy() 44 | AOI_distances = AOI_ids.copy() 45 | # AOI_alphas = AOI_ids.copy() 46 | 47 | invalid_shapes = [] 48 | for ii in range(geodata.shape[0]): 49 | # ii = 0 50 | print("process %d, parcel %d of %d" % (rank, ii+1, geodata.shape[0])) 51 | parcel_poly = geodata['geometry'][ii] 52 | label = geodata['ground_truth'][ii] 53 | id = geodata['id'][ii] 54 | 55 | points = get_points_from_str_poly(parcel_poly) 56 | pr = (points - anchor) 57 | parcel_poly = geometry.Polygon(pr) 58 | 59 | pxmin, pymin = pr.min(axis=0) 60 | pxmax, pymax = pr.max(axis=0) 61 | 62 | if not is_valid(parcel_poly, pxmin, pymax): 63 | try: 64 | int_area = sum( 65 | [geometry.Polygon(np.array(pol.coords[:])).area for pol in parcel_poly.buffer(0).interiors]) 66 | ext_area = geometry.Polygon(np.array(parcel_poly.buffer(0).exterior.coords[:])).area 67 | if int_area / ext_area < 0.05: # threshold for discarding a parcel polygon 68 | print("included, number of interior areas %d, intarea: %.0f, extarea: %.0f, ratio: %.4f" % 69 | (len(parcel_poly.buffer(0).interiors), int_area, ext_area, int_area / ext_area)) 70 | parcel_poly = geometry.Polygon(np.array(parcel_poly.buffer(0).exterior.coords[:])) 71 | pr = np.stack([np.array(i) for i in parcel_poly.exterior.coords.xy]).T 72 | pxmin, pymin = pr.min(axis=0) 73 | pxmax, pymax = pr.max(axis=0) 74 | else: 75 | print("excluded, number of interior areas %d, intarea: %.0f, extarea: %.0f, ratio: %.4f" % 76 | (len(parcel_poly.buffer(0).interiors), int_area, ext_area, int_area / ext_area)) 77 | invalid_shapes.append(geodata.iloc[ii]) 78 | continue 79 | except: 80 | continue 81 | 82 | row0 = int((1 - pr[:, 1].max() / dy) * AOI_labels.shape[0]) 83 | row1 = int((1 - pr[:, 1].min() / dy) * AOI_labels.shape[0]) 84 | col0 = int(pr[:, 0].min() / dx * AOI_labels.shape[1]) 85 | col1 = int(pr[:, 0].max() / dx * AOI_labels.shape[1]) + 1 86 | 87 | H, W = row1 - row0, col1 - col0 88 | 89 | for i in range(H): 90 | 91 | for j in range(W): 92 | # i = j = 15 93 | try: 94 | 95 | pix_points = [[pxmin + loc[0] * res, pymax - loc[1] * res] for loc in 96 | [[j, i], [j + 1, i], [j + 1, i + 1], [j, i + 1], [j, i]]] 97 | 98 | pix_poly = geometry.Polygon(pix_points) 99 | 100 | value = parcel_poly.intersection(pix_poly).area / res ** 2 101 | 102 | if value == 0: # no intersection 103 | continue 104 | 105 | elif AOI_ratios[row0 + i, col0 + j] == 1.0: # interior of at least another poly 106 | 107 | if AOI_labels[row0 + i, col0 + j] != label: # mask only if label conflict 108 | 
AOI_masks[row0 + i, col0 + j] = 2 109 | continue 110 | 111 | elif AOI_ratios[row0 + i, col0 + j] > 0: # at least partly assigned to another poly 112 | if AOI_labels[row0 + i, col0 + j] != label: # mask only if label conflict 113 | AOI_masks[row0 + i, col0 + j] = 1 114 | 115 | if value > AOI_ratios[row0 + i, col0 + j]: # this poly covers a larger area, assign here 116 | AOI_labels[row0 + i, col0 + j] = label 117 | AOI_ratios[row0 + i, col0 + j] = value 118 | AOI_ids[row0 + i, col0 + j] = id 119 | pix_center = np.array(pix_points)[:-1].mean(axis=0) 120 | AOI_distances[row0 + i, col0 + j] = closest_point_to_poly( 121 | np.array(parcel_poly.exterior.coords.xy).T, pix_center, return_dist=True) 122 | 123 | except: 124 | continue 125 | 126 | return AOI_labels, AOI_ids, AOI_masks, AOI_ratios, AOI_distances, pd.DataFrame(invalid_shapes) 127 | 128 | 129 | def main(): 130 | # read ground truth data 131 | gt_df = pd.read_csv(ground_truths_file) 132 | 133 | # ensure all polygons use the same crs 134 | assert (gt_df['crs'] == gt_df['crs'].iloc[0]).all(), \ 135 | "Polygons corresponding to multiple CRS were found in %s" % ground_truths_file 136 | crs = gt_df['crs'].iloc[0] 137 | 138 | # find unique years 139 | years = gt_df['year'].drop_duplicates().to_list() 140 | print("found ground truth data for years %s" % ", ".join([str(i) for i in years])) 141 | 142 | # 0 class will indicate background, if 0 class already exists in labels add one 143 | if 0 in gt_df['ground_truth'].drop_duplicates().tolist(): 144 | gt_df['ground_truth'] += 1 145 | 146 | # sentinel products 147 | imdirs = glob("%s/*.SAFE/GRANULE/**/IMG_DATA" % products_dir) 148 | prod_df = get_S2prod_info(imdirs) 149 | assert (prod_df['West']==prod_df['West'].iloc[0]).all() and (prod_df['North']==prod_df['North'].iloc[0]).all(),\ 150 | "Sentinel products corresponding to multiple tiles were found in %s" % products_dir 151 | geotr = GeoTransform(intr=prod_df['crs'].iloc[0].split(':')[1], outtr=gt_df['crs'].iloc[0], loc2loc=True) 152 | prod_WN = prod_df[['West', 'North']].iloc[0].tolist() 153 | prod_WN = geotr(prod_WN[0], prod_WN[1]) # in ground truth data coordinate system 154 | d = (10 * prod_df[['height', 'width']].iloc[0].values).tolist() 155 | 156 | # find all ground truth data that fall inside sentinel product 157 | prod_poly = geometry.Polygon([[prod_WN[0] + loc[0] * d[0], prod_WN[1] - loc[1] * d[1]] for loc in 158 | [[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]]]) 159 | 160 | def f(x): 161 | try: 162 | x = get_points_from_str_poly(x) 163 | W = x[:, 0].min() 164 | E = x[:, 0].max() 165 | S = x[:, 1].min() 166 | N = x[:, 1].max() 167 | x = geometry.Polygon(x) 168 | inratio = prod_poly.intersection(x).area / x.area 169 | return np.array([N, S, W, E, inratio]) 170 | except: 171 | return np.array([0, 0, 0, 0, 0]) 172 | 173 | gt_df[['N', 'S', 'W', 'E', 'inratio']] = np.stack(gt_df['geometry'].apply(f).values) 174 | gt_df = gt_df[gt_df['inratio'] == 1.0] 175 | print("found %d polygons inside sentinel tile" % gt_df.shape[0]) 176 | 177 | # increasing AOI size will allow extracting the parcels at the boundary of the true AOI placed at the center of the 178 | # image. This shouldnt make a difference when splitting the AOI by grid a slabels will be zero in these locations 179 | maxy = int(np.ceil(gt_df['N'].max())) + res * sample_size # N 180 | miny = int(np.floor(gt_df['S'].min())) - res * sample_size # S! 181 | maxx = int(np.ceil(gt_df['E'].max())) + res * sample_size # E! 
182 | minx = int(np.floor(gt_df['W'].min())) - res * sample_size # W 183 | 184 | # increase AOI dimensions to match integer multiple of sample size 185 | if np.ceil((maxy - miny) / (sample_size * 10)) != (maxy - miny) / (sample_size * 10): 186 | dy = (np.ceil((maxy - miny) / (sample_size * 10)) - (maxy - miny) / (sample_size * 10)) * (sample_size * 10) 187 | miny = miny - dy 188 | if np.ceil((maxx - minx) / (sample_size * 10)) != (maxx - minx) / (sample_size * 10): 189 | dx = (np.ceil((maxx - minx) / (sample_size * 10)) - (maxx - minx) / (sample_size * 10)) * (sample_size * 10) 190 | maxx = maxx + dx 191 | dx = maxx - minx 192 | dy = maxy - miny 193 | anchor = minx, miny # WS 194 | 195 | pool = Pool(num_processes) 196 | 197 | for year in years: 198 | # year = years[0] 199 | 200 | geodata = gt_df[gt_df['year'] == year].reset_index(drop=True) 201 | 202 | inputs = [[i, df_, anchor, dx, dy] for i, df_ in enumerate(split_df(geodata, num_processes))] 203 | 204 | outputs = pool.map(extract_labels_raster, inputs) 205 | AOI_labels = np.stack([out_[0] for out_ in outputs]) 206 | AOI_ids = np.stack([out_[1] for out_ in outputs]) 207 | AOI_masks = np.stack([out_[2] for out_ in outputs]) 208 | AOI_ratios = np.stack([out_[3] for out_ in outputs]) 209 | AOI_distances = np.stack([out_[4] for out_ in outputs]) 210 | invalid_shapes = pd.concat([out_[5] for out_ in outputs]) 211 | 212 | labels = AOI_labels.max(axis=0) 213 | masks = AOI_masks.max(axis=0) 214 | ids = AOI_ids.sum(axis=0) 215 | ratios = AOI_ratios.max(axis=0) 216 | distances = AOI_distances.max(axis=0) 217 | 218 | locs = np.stack(np.where((AOI_labels > 0).sum(axis=0) > 1)).T 219 | 220 | for i, loc in enumerate(locs): 221 | 222 | if i % 1000 == 0: 223 | print("correcting inter process overlaps, step %d of %d" % (i, locs.shape[0])) 224 | 225 | if any(AOI_ratios[:, loc[0], loc[1]] == 1.0): 226 | masks[loc[0], loc[1]] = 2 227 | else: 228 | masks[loc[0], loc[1]] = 1 229 | 230 | idx = np.argmax(AOI_ratios[:, loc[0], loc[1]]) 231 | labels[loc[0], loc[1]] = AOI_labels[idx, loc[0], loc[1]] 232 | ids[loc[0], loc[1]] = AOI_ids[idx, loc[0], loc[1]] 233 | ratios[loc[0], loc[1]] = AOI_ratios[idx, loc[0], loc[1]] 234 | distances[loc[0], loc[1]] = AOI_distances[idx, loc[0], loc[1]] 235 | 236 | np.savetxt("%s/LABELS_Y%s_N%s_W%s_R%d_CRS%s.csv" % 237 | (savedir, str(year), str(int(maxy)), str(int(minx)), res, str(crs)), labels) 238 | np.savetxt("%s/IDS_Y%s_N%s_W%s_R%d_CRS%s.csv" % 239 | (savedir, str(year), str(int(maxy)), str(int(minx)), res, str(crs)), ids) 240 | np.savetxt("%s/MASKS_Y%s_N%s_W%s_R%d_CRS%s.csv" % 241 | (savedir, str(year), str(int(maxy)), str(int(minx)), res, str(crs)), masks) 242 | np.savetxt("%s/RATIOS_Y%s_N%s_W%s_R%d_CRS%s.csv" % 243 | (savedir, str(year), str(int(maxy)), str(int(minx)), res, str(crs)), ratios) 244 | np.savetxt("%s/DISTANCES_Y%s_N%s_W%s_R%d_CRS%s.csv" % 245 | (savedir, str(year), str(int(maxy)), str(int(minx)), res, str(crs)), distances) 246 | 247 | if invalid_shapes.shape[0] != 0: 248 | invalid_shapes.to_csv( 249 | "%s/INVALID_Y%s_N%s_W%s_R%d_CRS%s.csv" % 250 | (savedir, str(year), str(maxy), str(int(maxx)), res, str(crs)), index=False) 251 | 252 | 253 | 254 | if __name__ == "__main__": 255 | 256 | parser = argparse.ArgumentParser(description='Make raster from shapely polygons') 257 | parser.add_argument('--ground_truths_file', help='filename containing ground truth parcel polygons') 258 | parser.add_argument('--products_dir', help='directory containing sentinel products') 259 | parser.add_argument('--savedir', help='save 
directory to extract ground truths in raster mode') 260 | parser.add_argument('--res', default=10, help='pixel size in meters') 261 | parser.add_argument('--sample_size', default=24, help='spatial resolution of dataset image samples') 262 | parser.add_argument('--num_processes', default=4, help='number of parallel processes') 263 | 264 | args = parser.parse_args() 265 | 266 | ground_truths_file = args.ground_truths_file 267 | 268 | products_dir = args.products_dir 269 | 270 | savedir = args.savedir 271 | print("savedir: ", savedir) 272 | if not os.path.exists(savedir): 273 | os.makedirs(savedir) 274 | 275 | res = float(args.res) 276 | assert np.ceil(10. / res) == 10. / res, "Label pixel size should divide min satellite pixel size (10m), but %.1f was selected" % res 277 | 278 | sample_size = int(args.sample_size) 279 | 280 | num_processes = int(args.num_processes) 281 | 282 | main() 283 | -------------------------------------------------------------------------------- /dataset/labelled_dense/extract_parcel_ground_truths.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import numpy as np 4 | from shapely import geometry 5 | import os 6 | from glob import glob 7 | from multiprocessing import Pool 8 | if __name__ == "__main__" and __package__ is None: 9 | from sys import path 10 | from os.path import dirname as dir 11 | path.insert(0, dir(dir(path[0]))) 12 | __package__ = "examples" 13 | from utils.geospatial_data_utils import GeoTransform, get_points_from_str_poly, simplify_poly_points, is_valid, str_line_eq 14 | from utils.multiprocessing_utils import split_df 15 | from utils.sentinel_products_utils import get_S2prod_info 16 | import pickle 17 | 18 | 19 | def extract_parcel_labels_raster(inputs): 20 | 21 | # inputs = inputs[0] 22 | rank, geodata, W, N, Wp, Np, year, crs = inputs 23 | 24 | # arrays to save 25 | year_savedir = os.path.join(savedir, 'Y%s_N%s_W%s_R%d_CRS%s' % (year, N, W, res, crs)) 26 | if not os.path.exists(year_savedir): 27 | os.makedirs(year_savedir) 28 | 29 | saved_data_info = [] 30 | for ii in range(geodata.shape[0]): 31 | print("process %d, parcel %d of %d" % (rank, ii+1, geodata.shape[0])) 32 | parcel_poly = geodata['geometry'][ii] 33 | label = geodata['ground_truth'][ii] 34 | id = geodata['id'][ii] 35 | 36 | points = get_points_from_str_poly(parcel_poly) 37 | anchor = np.array(geometry.Polygon(points).centroid) # anchor is centroid of parcel 38 | # anchor = points.mean(axis=0) 39 | N0 = anchor[1] + sample_size * 10. / 2. # Nmax of image 40 | W0 = anchor[0] - sample_size * 10. / 2. # Wmin of image 41 | 42 | # correct for non integer offset wrt product Nmax, Wmax (top-left) coordinates 43 | dN = (Np - N0) % 60 44 | dW = (W0 - Wp) % 60 45 | N0 += dN 46 | W0 -= dW 47 | anchor = np.array([W0 + sample_size * 10. / 2., N0 - sample_size * 10. / 2.]) # recalculate centroid 48 | 49 | pr = (points - anchor + sample_size * 10. 
/ 2) # local polygon coordinates 50 | parcel_poly = geometry.Polygon(pr) 51 | 52 | ### Define criterion for removing very slender fields 53 | slenderness = parcel_poly.area / parcel_poly.length # 1.0 54 | if slenderness < 5: 55 | continue 56 | 57 | # min, max coordinates 58 | pxmin, pymin = pr.min(axis=0) 59 | pxmax, pymax = pr.max(axis=0) 60 | 61 | # DONT DO VERY SMALL ONES 62 | if ((pxmax - pxmin) < 50) or ((pymax - pymin) < 50): 63 | continue 64 | 65 | if not is_valid(parcel_poly, pxmin, pymax): 66 | try: 67 | int_area = sum( 68 | [geometry.Polygon(np.array(pol.coords[:])).area for pol in parcel_poly.buffer(0).interiors]) 69 | ext_area = geometry.Polygon(np.array(parcel_poly.buffer(0).exterior.coords[:])).area 70 | if int_area / ext_area < 0.05: # threshold for discarding a parcel polygon 71 | print("included, number of interior areas %d, intarea: %.0f, extarea: %.0f, ratio: %.4f" % 72 | (len(parcel_poly.buffer(0).interiors), int_area, ext_area, int_area / ext_area)) 73 | parcel_poly = geometry.Polygon(np.array(parcel_poly.buffer(0).exterior.coords[:])) 74 | pr = np.stack([np.array(i) for i in parcel_poly.exterior.coords.xy]).T 75 | pxmin, pymin = pr.min(axis=0) 76 | pxmax, pymax = pr.max(axis=0) 77 | else: 78 | print("excluded, number of interior areas %d, intarea: %.0f, extarea: %.0f, ratio: %.4f" % 79 | (len(parcel_poly.buffer(0).interiors), int_area, ext_area, int_area / ext_area)) 80 | values = geodata.iloc[ii].to_list() 81 | for v in [N0, W0, None]: 82 | values.append(v) 83 | saved_data_info.append(values) 84 | continue 85 | except: 86 | continue 87 | 88 | # define zero placeholder matrices 89 | ratios = np.zeros((label_mult * sample_size, label_mult * sample_size), dtype=np.float32) 90 | alpha = ratios.copy() 91 | global_beta = ratios.copy() 92 | local_beta = ratios.copy() 93 | 94 | # include 2 pixel threshold (this wont matter as external pixels will not update their values) 95 | row0 = int(np.floor((pymin / (sample_size * 10)) * label_mult * sample_size)) - 2 # min row containing parcel 96 | row1 = int(np.ceil((pymax / (sample_size * 10)) * label_mult * sample_size)) + 2 # max row containing parcel 97 | col0 = int(np.floor(pxmin / (sample_size * 10) * label_mult * sample_size)) - 2 # min col containing parcel 98 | col1 = int(np.ceil(pxmax / (sample_size * 10) * label_mult * sample_size)) + 2 # max col containing parcel 99 | 100 | Height, Width = row1 - row0, col1 - col0 101 | 102 | for i in range(Height): 103 | 104 | for j in range(Width): 105 | 106 | if (row0 + i) * (col0 + j) < 0: 107 | continue 108 | 109 | try: 110 | 111 | pix_points = [[res * (col0 + j + loc[1]), res * (row0 + i + loc[0])] for loc in 112 | [[-0.5, -0.5], [-0.5, 0.5], [0.5, 0.5], [0.5, -0.5], [-0.5, -0.5]]] 113 | 114 | pix_poly = geometry.Polygon(pix_points) 115 | 116 | value = parcel_poly.intersection(pix_poly).area / res ** 2 117 | if (0 < value) and (value < 1): # parcel cuts through pixel 118 | global_points = np.array(parcel_poly.boundary.intersection(pix_poly.boundary)) 119 | if global_points.shape[0] > 2: # !!! 
120 | global_points = global_points[:2] 121 | global_params = str_line_eq(global_points) 122 | alpha[label_mult * sample_size - (row0 + i + 1), col0 + j + 1] = global_params[0] 123 | global_beta[label_mult * sample_size - (row0 + i + 1), col0 + j + 1] = global_params[1] / (sample_size * res) 124 | local_points = (global_points - np.array([res * (col0 + j + 0.5), res * (row0 + i + 0.5)])) / res 125 | local_params = str_line_eq(local_points) 126 | local_beta[label_mult * sample_size - (row0 + i + 1), col0 + j + 1] = local_params[1] 127 | 128 | 129 | if value == 0: # no intersection 130 | continue 131 | 132 | ratios[label_mult * sample_size - (row0 + i + 0), col0 + j + 0] = value 133 | 134 | except: 135 | continue 136 | 137 | idxN = int(np.round((N_ - N0) / res - 1., 0)) 138 | idxW = int(np.round((W0 - W_) / res - 1., 0)) 139 | 140 | # add AOI raster ground truths 141 | labels2d = raster['LABELS'][idxN: idxN + label_mult * sample_size, idxW: idxW + label_mult * sample_size] 142 | ids2d = raster['IDS'][idxN: idxN + label_mult * sample_size, idxW: idxW + label_mult * sample_size] 143 | masks2d = raster['MASKS'][idxN: idxN + label_mult * sample_size, idxW: idxW + label_mult * sample_size] 144 | distances2d = raster['DISTANCES'][idxN: idxN + label_mult * sample_size, idxW: idxW + label_mult * sample_size] 145 | ratios2d = raster['RATIOS'][idxN: idxN + label_mult * sample_size, idxW: idxW + label_mult * sample_size] 146 | 147 | # add simpilied polygons 148 | simplified = simplify_poly_points(pr, Npoly) 149 | 150 | sample = {'N': N0, 'W': W0, 151 | 'poly_var': pr / res, 'poly_fixed': simplified / res, 152 | 'label': label, 'id': id, 153 | 'labels2d': labels2d, 'ids2d': ids2d, 'masks2d': masks2d, 'distances2d': distances2d, 154 | 'ratios': ratios, 'alpha': alpha, 'global_beta': global_beta, 'local_beta': local_beta} 155 | 156 | impath = os.path.join(year_savedir, 'N%d_E%d_ground_truths.pickle' % (N0, W0)) 157 | with open(impath, 'wb') as handle: 158 | pickle.dump(sample, handle, protocol=pickle.HIGHEST_PROTOCOL) 159 | 160 | values = geodata.iloc[ii].to_list() 161 | for v in [N0, W0, impath]: 162 | values.append(v) 163 | saved_data_info.append(values) 164 | 165 | return saved_data_info 166 | 167 | 168 | def main(): 169 | global N_ 170 | global W_ 171 | global R_ 172 | global CRS_ 173 | global raster 174 | global label_mult 175 | 176 | label_mult = int(10. / res) 177 | 178 | # ground truth data 179 | gt_df = pd.read_csv(ground_truths_file) 180 | if 'id' not in gt_df: 181 | print('Column "id" not included. 
Assigning values from 1 to file size') 182 | gt_df['id'] = range(1, gt_df.shape[0]+1) 183 | # gt_df['id'] = range(1, gt_df.shape[0]+1) 184 | assert (gt_df['crs'] == gt_df['crs'].iloc[0]).all(), \ 185 | "Polygons corresponding to multiple CRS were found in %s" % ground_truths_file 186 | crs = gt_df['crs'].iloc[0] 187 | yearly_grouped_gt = gt_df.groupby('year') 188 | years = list(yearly_grouped_gt.groups.keys()) 189 | print("found ground truth data for years %s" % ", ".join([str(i) for i in years])) 190 | if 0 in gt_df['ground_truth'].drop_duplicates(): 191 | gt_df['ground_truth'] += 1 192 | 193 | # AOI rasterized ground truths 194 | raster_files = [fname for fname in os.listdir(raster_labels_dir) if fname.endswith('csv')] 195 | raster = {} 196 | meta = [] 197 | for raster_file in raster_files: 198 | # raster_file = raster_files[0] 199 | ftype_ = raster_file.split("_")[0] 200 | year_ = raster_file.split("_")[1][1:] 201 | N_ = raster_file.split("_")[2][1:] 202 | W_ = raster_file.split("_")[3][1:] 203 | R_ = raster_file.split("_")[4][1:] 204 | CRS_ = raster_file.split("_")[5][3:].split('.')[0] 205 | raster[ftype_] = np.loadtxt(os.path.join(raster_labels_dir, raster_file)) 206 | meta.append([year_, N_, W_, R_, CRS_]) 207 | meta = np.array(meta) 208 | assert all([(meta[i] == meta[0]).all() for i in range(len(meta))]), \ 209 | 'Not all AOI raster ground truth files correspond to the same location, time, resolution or crs' 210 | N_ = int(N_) 211 | W_ = int(W_) 212 | R_ = int(R_) 213 | CRS_ = int(CRS_) 214 | 215 | # sentinel products 216 | imdirs = glob("%s/*.SAFE/GRANULE/**/IMG_DATA" % products_dir) 217 | prod_df = get_S2prod_info(imdirs) 218 | assert (prod_df['West']==prod_df['West'].iloc[0]).all() and (prod_df['North']==prod_df['North'].iloc[0]).all(),\ 219 | "Sentinel products corresponding to multiple tiles were found in %s" % products_dir 220 | geotr = GeoTransform(intr=prod_df['crs'].iloc[0].split(':')[1], outtr=gt_df['crs'].iloc[0], loc2loc=gt_df['crs'].iloc[0] != '4326') 221 | prod_WN = prod_df[['West', 'North']].iloc[0].tolist() 222 | prod_WN = geotr(prod_WN[0], prod_WN[1]) # in ground truth data coordinate system 223 | d = (10 * prod_df[['height', 'width']].iloc[0].values).tolist() 224 | 225 | # find all ground truth data that fall inside sentinel product 226 | prod_poly = geometry.Polygon([[prod_WN[0] + loc[0] * d[0], prod_WN[1] - loc[1] * d[1]] for loc in 227 | [[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]]]) 228 | print(prod_poly) 229 | def f(x): 230 | try: 231 | x = get_points_from_str_poly(x) 232 | W = x[:, 0].min() 233 | E = x[:, 0].max() 234 | S = x[:, 1].min() 235 | N = x[:, 1].max() 236 | x = geometry.Polygon(x) 237 | inratio = prod_poly.intersection(x).area / x.area 238 | return np.array([N, S, W, E, inratio]) 239 | except: 240 | return np.array([0, 0, 0, 0, 0]) 241 | 242 | gt_df[['N', 'S', 'W', 'E', 'inratio']] = np.stack(gt_df['geometry'].apply(f).values) 243 | gt_df = gt_df[gt_df['inratio'] == 1.0] 244 | print("found %d polygons inside sentinel tile" % gt_df.shape[0]) 245 | 246 | N = int(np.ceil(gt_df['N'].max())) 247 | W = int(np.floor(gt_df['W'].min())) 248 | 249 | pool = Pool(num_processes) 250 | 251 | for year in years: 252 | # year = years[0] 253 | 254 | geodata = gt_df[gt_df['year'] == year].reset_index(drop=True) 255 | 256 | inputs = [[i, df_, W, N, prod_WN[0], prod_WN[1], year, crs] for i, df_ in enumerate(split_df(geodata, num_processes))] 257 | 258 | outputs = pool.map(extract_parcel_labels_raster, inputs) 259 | 260 | saved_data_info = pd.concat(pd.DataFrame(out) for out 
in outputs) 261 | save_name = os.path.join(savedir, 'Y%s_N%s_W%s_R%d_CRS%s' % (year, N, W, res, crs), 'saved_data_info.csv') 262 | saved_data_info.columns = ['id', 'ground_truth', 'crs', 'year', 'geometry', 'Np', 'Sp', 'Wp', 'Ep', 263 | 'inratio', 'Dy', 'Dx', 'D', 'Ntl', 'Wtl', 'filepath'] 264 | saved_data_info.to_csv(save_name, index=False) 265 | 266 | 267 | if __name__ == "__main__": 268 | 269 | parser = argparse.ArgumentParser(description='Make raster from shapely polygons') 270 | parser.add_argument('--ground_truths_file', help='filename containing ground truth parcel polygons') 271 | parser.add_argument('--raster_labels_dir', help='directory containing extracted raster ground truths') 272 | parser.add_argument('--products_dir', help='directory containing sentinel products') 273 | parser.add_argument('--savedir', help='save directory to extract ground truths in raster mode') 274 | parser.add_argument('--res', default=10, help='pixel size in meters') 275 | parser.add_argument('--sample_size', default=24, help='spatial resolution of dataset samples') 276 | parser.add_argument('--Npoly', default=50, help='number of vertices for polygons') 277 | parser.add_argument('--num_processes', default=4, help='number of parallel processes') 278 | 279 | args = parser.parse_args() 280 | 281 | ground_truths_file = args.ground_truths_file 282 | 283 | raster_labels_dir = args.raster_labels_dir 284 | 285 | products_dir = args.products_dir 286 | 287 | savedir = args.savedir 288 | print("savedir: ", savedir) 289 | if not os.path.exists(savedir): 290 | os.makedirs(savedir) 291 | 292 | # res = int(args.res) 293 | res = float(args.res) 294 | assert np.ceil(10. / res) == 10. / res, "Label pixel size should divide min satellite pixel size (10m), but %.1f was selected" % res 295 | 296 | sample_size = int(args.sample_size) 297 | 298 | Npoly = int(args.Npoly) 299 | 300 | num_processes = int(args.num_processes) 301 | 302 | main() 303 | -------------------------------------------------------------------------------- /dataset/labelled_dense/find_parcel_dimensions.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from shapely import geometry 6 | import os 7 | from glob import glob 8 | from multiprocessing import Pool 9 | if __name__ == "__main__" and __package__ is None: 10 | from sys import path 11 | from os.path import dirname as dir 12 | path.insert(0, dir(dir(path[0]))) 13 | __package__ = "examples" 14 | from utils.geospatial_data_utils import GeoTransform, get_points_from_str_poly 15 | from utils.multiprocessing_utils import split_df 16 | from utils.sentinel_products_utils import get_S2prod_info 17 | 18 | 19 | def get_nbins(x): 20 | N = x.shape[0] 21 | if N < 1e2: 22 | return 10 23 | if N < 1e3: 24 | return 25 25 | # if N < 1e4: 26 | else: 27 | return 100 28 | 29 | 30 | def main(): 31 | # ground truth data 32 | gt_df = pd.read_csv(ground_truths_file) 33 | # gt_df['id'] = range(1, gt_df.shape[0]+1) 34 | # gt_df['crs'] = 2154 35 | if 'id' not in gt_df: 36 | gt_df['id'] = range(1, gt_df.shape[0]+1) 37 | assert (gt_df['crs'] == gt_df['crs'].iloc[0]).all(), \ 38 | "Polygons corresponding to multiple CRS were found in %s" % ground_truths_file 39 | crs = gt_df['crs'].iloc[0] 40 | yearly_grouped_gt = gt_df.groupby('year') 41 | years = list(yearly_grouped_gt.groups.keys()) 42 | print("found ground truth data for years %s" % ", ".join([str(i) for i in years])) 43 | # if 0 in 
gt_df['ground_truth'].drop_duplicates(): 44 | # gt_df['ground_truth'] += 1 45 | 46 | # sentinel products 47 | imdirs = glob("%s/*.SAFE/GRANULE/**/IMG_DATA" % products_dir) 48 | prod_df = get_S2prod_info(imdirs) 49 | assert (prod_df['West']==prod_df['West'].iloc[0]).all() and (prod_df['North']==prod_df['North'].iloc[0]).all(),\ 50 | "Sentinel products corresponding to multiple tiles were found in %s" % products_dir 51 | geotr = GeoTransform(intr=prod_df['crs'].iloc[0].split(':')[1], outtr=gt_df['crs'].iloc[0], loc2loc=True) 52 | prod_WN = prod_df[['West', 'North']].iloc[0].tolist() 53 | prod_WN = geotr(prod_WN[0], prod_WN[1]) # in ground truth data coordinate system 54 | d = (10 * prod_df[['height', 'width']].iloc[0].values).tolist() 55 | 56 | # find all ground truth data that fall inside sentinel product 57 | prod_poly = geometry.Polygon([[prod_WN[0] + loc[0] * d[0], prod_WN[1] - loc[1] * d[1]] for loc in 58 | [[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]]]) 59 | print(prod_poly) 60 | def f(x): 61 | try: 62 | x = get_points_from_str_poly(x) 63 | W = x[:, 0].min() 64 | E = x[:, 0].max() 65 | S = x[:, 1].min() 66 | N = x[:, 1].max() 67 | num_vertices = x.shape[0] 68 | x = geometry.Polygon(x) 69 | inratio = prod_poly.intersection(x).area / x.area 70 | return np.array([N, S, W, E, inratio, num_vertices]) 71 | except: 72 | return np.array([0, 0, 0, 0, 0, 0]) 73 | 74 | gt_df[['N', 'S', 'W', 'E', 'inratio', 'num_vertices']] = np.stack(gt_df['geometry'].apply(f).values) 75 | gt_df = gt_df[gt_df['inratio'] == 1.0] 76 | print("found %d polygons inside sentinel tile" % gt_df.shape[0]) 77 | 78 | gt_df['Dy'] = np.abs(gt_df['N'] - gt_df['S']) 79 | gt_df['Dx'] = np.abs(gt_df['E'] - gt_df['W']) 80 | gt_df['D'] = gt_df[['Dx', 'Dy']].max(axis=1) 81 | # gt_df['D'].max() 82 | # gt_df[gt_df['D'] < 480].shape[0] / gt_df.shape[0] 83 | # gt_df[gt_df['D'] > 700].shape[0] / gt_df.shape[0] 84 | gt_df.to_csv(os.path.join(save_dir, 'gt_df_parcels_in_AOI.csv'), index=False) 85 | 86 | # if cutoff is None: 87 | # x = np.random.normal(mu, sigma, size=100) 88 | print('maxD | %obj >maxD') 89 | print('-------------------') 90 | for maxd in [240, 320, 480, 640, 1000, 1280, 1600]: 91 | r = gt_df[gt_df['D'] < maxd].shape[0] / gt_df.shape[0] 92 | print('%s|%s' % (str(maxd).ljust(7), ('%.4f' % r).rjust(9))) 93 | plt.ioff() 94 | fig, ax = plt.subplots(figsize=(8, 4)) 95 | n_bins = get_nbins(gt_df['D']) 96 | # plot the cumulative histogram 97 | n, bins, patches = ax.hist(gt_df['D'].values, n_bins, density=True, histtype='step', 98 | cumulative=True, label='cummulative sum') 99 | ax.hist(gt_df['D'].values, bins=bins, density=True, histtype='step', cumulative=-1, 100 | label='reversed cummulative sum') 101 | ax.grid(True) 102 | ax.legend(loc='right') 103 | ax.set_title('Cumulative step histograms') 104 | ax.set_xlabel('Object largest x-y dimension') 105 | ax.set_ylabel('Likelihood of occurrence') 106 | # plt.show() 107 | plt.savefig(os.path.join(save_dir, 'parcel_dimensions_cumsum.png')) 108 | 109 | plt.figure() 110 | plt.hist(gt_df['num_vertices'], 100, density=True) 111 | plt.grid() 112 | plt.xlabel('Number of AF Vertices') 113 | plt.ylabel('density') 114 | plt.savefig(os.path.join(save_dir, 'number_of_vertices_hist.png')) 115 | 116 | # else: 117 | # gt_df = gt_df[gt_df['D'] < cutoff] 118 | # print('Number of samples is %d for max object dimension <%dm' % (gt_df.shape[0], cutoff)) 119 | # gt_df.to_csv(os.path.join(save_dir, 'gt_df_maxd_lt_%d.csv' % cutoff), index=False) 120 | 121 | 122 | 123 | if __name__ == "__main__": 124 | 125 | 
parser = argparse.ArgumentParser(description='Make raster from shapely polygons') 126 | parser.add_argument('--ground_truths_file', help='filename containing ground truth parcel polygons') 127 | parser.add_argument('--products_dir', help='directory containing sentinel products') 128 | parser.add_argument('--save_dir', help='save directory to extract ground truths in raster mode') 129 | # parser.add_argument('--cutoff', default=None, help='max allowed parcel size. If None to script will save a cumsum ' 130 | # 'histogram to help decide the max alloed size') 131 | # parser.add_argument('--sample_size', default=24, help='spatial resolution of dataset samples') 132 | # parser.add_argument('--num_processes', default=4, help='number of parallel processes') 133 | 134 | args = parser.parse_args() 135 | 136 | ground_truths_file = args.ground_truths_file 137 | 138 | products_dir = args.products_dir 139 | 140 | save_dir = args.save_dir 141 | print("save_dir: ", save_dir) 142 | if not os.path.exists(save_dir): 143 | os.makedirs(save_dir) 144 | 145 | # cutoff = int(args.cutoff) 146 | 147 | main() 148 | -------------------------------------------------------------------------------- /dataset/labelled_dense/make_image_timeseries_for_labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | For a set of extracted image crops and a labelled_dense label map, make a timeseries of all positions matched with labels 3 | """ 4 | import argparse 5 | import pandas as pd 6 | import numpy as np 7 | import os 8 | import shutil 9 | import pickle 10 | if __name__ == "__main__" and __package__ is None: 11 | from sys import path 12 | from os.path import dirname as dir 13 | path.insert(0, dir(dir(path[0]))) 14 | __package__ = "examples" 15 | from utils.data_utils import find_number 16 | from utils.date_utils import get_doy 17 | from utils.multiprocessing_utils import run_pool 18 | 19 | 20 | mult = {'B01': 1/6., 'B02': 1., 'B03': 1., 'B04': 1., 'B05': 1./2., 'B06': 1./2., 'B07': 1./2., 'B08': 1., 'B8A': 1./2, 21 | 'B09': 1./6., 'B10': 1./6., 'B11': 1./2., 'B12': 1./2.} 22 | 23 | 24 | def match_labels_images(yearlocs): 25 | 26 | refband = bands[0] 27 | 28 | saved_files_info = [] 29 | for yearloc in yearlocs: 30 | 31 | try: 32 | 33 | idx = yearloc_groups[yearloc] 34 | data = iminfo.iloc[idx, :].sort_values(by='DOY').copy() 35 | data = data.drop_duplicates(subset=['DOY'], keep='first') # some products downloaded twice 36 | 37 | Y = data['Year'].iloc[0] 38 | N = data['Nij'].iloc[0] 39 | W = data['Wij'].iloc[0] 40 | il = data['il'].iloc[0] 41 | jl = data['jl'].iloc[0] 42 | 43 | assert all(data['Year'] == Y) 44 | assert all(data['Nij'] == N) 45 | assert all(data['Wij'] == W) 46 | assert all(data['il'] == il) 47 | assert all(data['jl'] == jl) 48 | 49 | timeseries_sample = {band: [] for band in bands} 50 | timeseries_sample['doy'] = [] 51 | for sample_info in data[['sample_path', 'DOY']].values: 52 | 53 | impath, doy = sample_info 54 | 55 | with open(impath, 'rb') as handle: 56 | sample = pickle.load(handle, encoding='latin1') 57 | 58 | for key in bands: 59 | timeseries_sample[key].append(sample[key]) 60 | timeseries_sample['doy'].append(np.array(doy)) 61 | 62 | for key in bands: 63 | timeseries_sample[key] = np.stack(timeseries_sample[key]) 64 | timeseries_sample['doy'] = np.stack(timeseries_sample['doy']) 65 | timeseries_sample['year'] = np.array(Y).astype(np.int32) 66 | 67 | timesteps = timeseries_sample[refband].shape[0] 68 | 69 | for ltype in labels.keys(): 70 | 
timeseries_sample[ltype.lower()] = \ 71 | labels[ltype][il * label_mult * sample_size: (il + 1) * label_mult * sample_size, jl * label_mult * sample_size: (jl + 1) * label_mult * sample_size] 72 | 73 | savename = os.path.join(year_savedir, "%d_%d_%s.pickle" % (int(N), int(W), Y)) 74 | with open(savename, 'wb') as handle: 75 | pickle.dump(timeseries_sample, handle, protocol=pickle.HIGHEST_PROTOCOL) 76 | 77 | saved_files_info.append([savename, Y, N, W, sample_size, sample_size, timesteps, il, jl, "completed"]) 78 | 79 | except: 80 | 81 | saved_files_info.append(["", Y, N, W, sample_size, sample_size, 0, il, jl, "failed"]) 82 | 83 | saved_files_info = pd.DataFrame(data=saved_files_info, columns=['sample_path', 'Year', 'N', 'W', 'dy', 'dx', 'dt', 84 | 'label_win_i', 'label_win_j', 'status']) 85 | return saved_files_info 86 | 87 | 88 | def main(): 89 | 90 | global yearloc_groups 91 | global iminfo 92 | global labels 93 | global year_savedir 94 | global label_mult 95 | 96 | # ratio of image to label pixel size 97 | label_mult = int(10 / res) 98 | 99 | # read info on extracted image windows 100 | iminfo = pd.read_csv(os.path.join(windows_dir, "extracted_windows_data_info.csv")) 101 | crs = iminfo['crs'].iloc[0] 102 | 103 | # remove non extracted locations 104 | iminfo = iminfo[~pd.isnull(iminfo['sample_path'])].reset_index(drop=True) 105 | iminfo['DOY'] = iminfo['Date'].apply(lambda s: get_doy(str(s))) 106 | iminfo['Year'] = iminfo['Date'].apply(lambda s: str(s)[:4]) 107 | 108 | # ground truths 109 | gtfiles = os.listdir(ground_truths_dir) 110 | years = [find_number(s, "Y") for s in gtfiles] 111 | files = {year: {} for year in set(years)} 112 | for i, file in enumerate(gtfiles): 113 | if not file.startswith('INVALID'): 114 | files[years[i]][file.split("_")[0]] = file 115 | print("found ground truths in raster for years %s" % ", ".join(list(files.keys()))) 116 | 117 | saved_files_info = [] 118 | 119 | for year in set(years): 120 | 121 | year_savedir = os.path.join(savedir, year) 122 | if not os.path.isdir(year_savedir): 123 | os.makedirs(year_savedir) 124 | 125 | labels = {} 126 | for ltype in files[year]: 127 | 128 | labels[ltype] = np.loadtxt(os.path.join(ground_truths_dir, files[year][ltype]), dtype=np.float32) 129 | 130 | yearloc_groups = iminfo[iminfo['Year'] == year].groupby(['Nij', 'Wij'], as_index=False).groups 131 | yearlocs = list(yearloc_groups.keys()) 132 | 133 | df = run_pool(yearlocs, match_labels_images, num_processes) 134 | df = pd.concat(df) 135 | 136 | saved_files_info.append(df) 137 | 138 | df = pd.concat(saved_files_info).reset_index(drop=True) 139 | df['crs'] = crs 140 | df.to_csv(os.path.join(savedir, "saved_timeseries_data_info.csv"), index=False) 141 | 142 | # delete previously saved image windows 143 | shutil.rmtree(windows_dir) 144 | 145 | 146 | if __name__ == "__main__": 147 | 148 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 149 | parser.add_argument('--ground_truths_dir', help='directory containing ground truth parcels raster') 150 | parser.add_argument('--products_dir', help='directory containing downloaded sentinel products') 151 | parser.add_argument('--windows_dir', help='directory containing extracted windows from sentinel products') 152 | parser.add_argument('--savedir', help='save directory for image timeseries with labels') 153 | parser.add_argument('--bands', default=None, help='which satellite image bands to use') 154 | parser.add_argument('--res', default=10, help='pixel size in meters') 155 | 
parser.add_argument('--sample_size', default=24, help='spatial resolution of dataset samples') 156 | parser.add_argument('--num_processes', default=4, help='number of parallel processes') 157 | # --------------------------------------------------------------------------------------------- 158 | 159 | args = parser.parse_args() 160 | 161 | ground_truths_dir = args.ground_truths_dir 162 | 163 | products_dir = args.products_dir 164 | 165 | windows_dir = args.windows_dir 166 | 167 | savedir = args.savedir 168 | if not os.path.exists(savedir): 169 | os.makedirs(savedir) 170 | 171 | res = float(args.res) 172 | assert np.ceil(10. / res) == 10. / res, "Label pixel size should divide min satellite pixel size (10m), but %.1f was selected" % res 173 | 174 | sample_size = int(args.sample_size) 175 | 176 | num_processes = int(args.num_processes) 177 | 178 | bands = args.bands 179 | if bands == 'None': 180 | bands = list(mult.keys()) 181 | else: 182 | bands = bands.split(',') 183 | 184 | main() 185 | 186 | -------------------------------------------------------------------------------- /dataset/labelled_dense/make_image_timeseries_for_parcel_labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | For a set of extracted image crops and a labelled_dense label map, make a timeseries of all positions matched with labels 3 | """ 4 | import argparse 5 | import pandas as pd 6 | import numpy as np 7 | import os 8 | import shutil 9 | import pickle 10 | if __name__ == "__main__" and __package__ is None: 11 | from sys import path 12 | from os.path import dirname as dir 13 | path.insert(0, dir(dir(path[0]))) 14 | __package__ = "examples" 15 | from utils.data_utils import find_number 16 | from utils.date_utils import get_doy 17 | from utils.multiprocessing_utils import run_pool 18 | 19 | 20 | mult = {'B01': 1/6., 'B02': 1., 'B03': 1., 'B04': 1., 'B05': 1./2., 'B06': 1./2., 'B07': 1./2., 'B08': 1., 'B8A': 1./2, 21 | 'B09': 1./6., 'B10': 1./6., 'B11': 1./2., 'B12': 1./2.} 22 | 23 | 24 | def match_labels_images(yearlocs): 25 | 26 | refband = bands[1] 27 | 28 | saved_files_info = [] 29 | for jj, yearloc in enumerate(yearlocs): 30 | 31 | if jj % 1000 == 0: 32 | print("%d of %d" % (jj, len(yearlocs))) 33 | try: 34 | 35 | idx = yearloc_groups[yearloc] 36 | data = iminfo.iloc[idx, :].sort_values(by='DOY').copy() 37 | data = data.drop_duplicates(subset=['DOY'], keep='first') # some products downloaded twice 38 | 39 | Y = data['Year'].iloc[0] 40 | N = data['Nl'].iloc[0] 41 | W = data['Wl'].iloc[0] 42 | 43 | assert all(data['Year'] == Y) 44 | assert all(data['Nl'] == N) 45 | assert all(data['Wl'] == W) 46 | 47 | timeseries_sample = {band: [] for band in bands} 48 | timeseries_sample['doy'] = [] 49 | for sample_info in data[['sample_path', 'DOY']].values: 50 | impath, doy = sample_info 51 | 52 | with open(impath, 'rb') as handle: 53 | sample = pickle.load(handle, encoding='latin1') 54 | 55 | # image falls in black region for this product (should have been excluded in extract_images_for_parcel_labels.py) 56 | if sample[refband].sum() == 0: 57 | # print('zero sum') 58 | continue 59 | 60 | # image does not match required size (should have been excluded in extract_images_for_parcel_labels.py) 61 | height, width = sample[refband].shape 62 | if (height != sample_size) or (width != sample_size): 63 | # print('unequal size') 64 | continue 65 | 66 | for key in bands: 67 | timeseries_sample[key].append(sample[key]) 68 | timeseries_sample['doy'].append(np.array(doy)) 69 | 70 | for key in 
bands: 71 | timeseries_sample[key] = np.stack(timeseries_sample[key]) 72 | timeseries_sample['doy'] = np.stack(timeseries_sample['doy']) 73 | timeseries_sample['year'] = np.array(Y).astype(np.int32) 74 | 75 | timesteps = timeseries_sample[refband].shape[0] 76 | 77 | gt = saved_gt_info[(saved_gt_info['Ntl'] == yearloc[0]) & (saved_gt_info['Wtl'] == yearloc[1])] 78 | with open(gt['filepath'].values[0], 'rb') as handle: 79 | labels = pickle.load(handle, encoding='latin1') 80 | for ltype in labels.keys(): 81 | timeseries_sample[ltype.lower()] = labels[ltype] 82 | 83 | savename = os.path.join(year_savedir, "%d_%d_%s.pickle" % (int(N), int(W), Y)) 84 | with open(savename, 'wb') as handle: 85 | pickle.dump(timeseries_sample, handle, protocol=pickle.HIGHEST_PROTOCOL) 86 | 87 | saved_files_info.append([savename, Y, N, W, sample_size, sample_size, timesteps, "completed"]) 88 | 89 | except: 90 | 91 | saved_files_info.append(["", Y, N, W, sample_size, sample_size, 0, "failed"]) 92 | 93 | saved_files_info = pd.DataFrame(data=saved_files_info, columns=['sample_path', 'Year', 'N', 'W', 'dy', 'dx', 'dt', 94 | 'status']) 95 | return saved_files_info 96 | 97 | 98 | def main(): 99 | 100 | global yearloc_groups 101 | global iminfo 102 | global labels 103 | global year_savedir 104 | global saved_gt_info 105 | 106 | iminfo = pd.read_csv(os.path.join(windows_dir, "extracted_windows_data_info.csv")) 107 | crs = iminfo['crs'].iloc[0] 108 | 109 | # remove non extracted locations 110 | iminfo = iminfo[~pd.isnull(iminfo['sample_path'])].reset_index(drop=True) 111 | iminfo['DOY'] = iminfo['Date'].apply(lambda s: get_doy(str(s))) 112 | iminfo['Year'] = iminfo['Date'].apply(lambda s: str(s)[:4]) 113 | 114 | # ground truths 115 | gtfiles = [f for f in os.listdir(ground_truths_dir) if os.path.isdir(os.path.join(ground_truths_dir, f))] 116 | 117 | saved_files_info = [] 118 | 119 | for gtfile in gtfiles: 120 | # gtfile = gtfiles[0] 121 | 122 | saved_gt_info = pd.read_csv(os.path.join(ground_truths_dir, gtfile, 'saved_data_info.csv')) 123 | 124 | year = find_number(gtfile, "Y") 125 | # CRSl = find_number(gtfile, "CRS") 126 | 127 | year_savedir = os.path.join(savedir, year) 128 | if not os.path.isdir(year_savedir): 129 | os.makedirs(year_savedir) 130 | 131 | yearloc_groups = iminfo[iminfo['Year'] == year].groupby(['Nl', 'Wl'], as_index=False).groups 132 | yearlocs = list(yearloc_groups.keys()) 133 | 134 | df = run_pool(yearlocs, match_labels_images, num_processes) 135 | df = pd.concat(df) 136 | 137 | saved_files_info.append(df) 138 | 139 | 140 | df = pd.concat(saved_files_info).reset_index(drop=True) 141 | df['crs'] = crs 142 | df.to_csv(os.path.join(savedir, "saved_timeseries_data_info.csv"), index=False) 143 | 144 | # delete windows dir 145 | shutil.rmtree(windows_dir) 146 | 147 | 148 | if __name__ == "__main__": 149 | 150 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 151 | parser.add_argument('--ground_truths_dir', help='directory containing ground truth parcels raster') 152 | parser.add_argument('--products_dir', help='directory containing downloaded sentinel products') 153 | parser.add_argument('--windows_dir', help='directory containing extracted windows from sentinel products') 154 | parser.add_argument('--savedir', help='save directory for image timeseries with labels') 155 | parser.add_argument('--bands', default=None, help='which satellite image bands to use') 156 | parser.add_argument('--res', default=10, help='pixel size in meters') 157 | parser.add_argument('--sample_size', 
default=24, help='spatial resolution of dataset samples') 158 | parser.add_argument('--num_processes', default=4, help='number of parallel processes') 159 | # --------------------------------------------------------------------------------------------- 160 | 161 | args = parser.parse_args() 162 | 163 | ground_truths_dir = args.ground_truths_dir 164 | 165 | products_dir = args.products_dir 166 | 167 | windows_dir = args.windows_dir 168 | 169 | savedir = args.savedir 170 | if not os.path.exists(savedir): 171 | os.makedirs(savedir) 172 | 173 | # res = int(args.res) 174 | res = float(args.res) 175 | 176 | sample_size = int(args.sample_size) 177 | 178 | num_processes = int(args.num_processes) 179 | 180 | bands = args.bands 181 | 182 | if bands == 'None': 183 | bands = list(mult.keys()) 184 | else: 185 | bands = bands.split(',') 186 | 187 | main() 188 | -------------------------------------------------------------------------------- /dataset/labelled_dense/make_labelled_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | bands='None' 4 | for ARGUMENT in "$@" 5 | do 6 | 7 | KEY=$(echo $ARGUMENT | cut -f1 -d=) 8 | VALUE=$(echo $ARGUMENT | cut -f2 -d=) 9 | 10 | case "$KEY" in 11 | 12 | ground_truths_file) ground_truths_file=${VALUE} ;; 13 | products_dir) products_dir=${VALUE} ;; 14 | labels_dir) labels_dir=${VALUE} ;; 15 | windows_dir) windows_dir=${VALUE} ;; 16 | timeseries_dir) timeseries_dir=${VALUE} ;; 17 | res) res=${VALUE} ;; 18 | sample_size) sample_size=${VALUE} ;; 19 | num_processes) num_processes=${VALUE} ;; 20 | bands) bands=${VALUE} ;; 21 | *) 22 | esac 23 | 24 | done 25 | 26 | # 1:ground_truths_file, 2:products_dir, 3:labels_dir, 4:windows_dir, 5:timeseries_dir, 6:res, 7:sample_size, 8:num_processes 27 | python dataset/labelled_dense/extract_labels_raster.py --ground_truths_file $ground_truths_file \ 28 | --products_dir $products_dir \ 29 | --savedir $labels_dir \ 30 | --res $res \ 31 | --sample_size $sample_size \ 32 | --num_processes $num_processes 33 | 34 | python dataset/labelled_dense/extract_images_for_labels.py --ground_truths_dir $labels_dir \ 35 | --products_dir $products_dir \ 36 | --savedir $windows_dir \ 37 | --bands $bands \ 38 | --res $res \ 39 | --sample_size $sample_size \ 40 | --num_processes $num_processes 41 | 42 | python dataset/labelled_dense/make_image_timeseries_for_labels.py --ground_truths_dir $labels_dir \ 43 | --products_dir $products_dir \ 44 | --windows_dir $windows_dir \ 45 | --savedir $timeseries_dir \ 46 | --bands $bands \ 47 | --res $res \ 48 | --sample_size $sample_size \ 49 | --num_processes $num_processes 50 | -------------------------------------------------------------------------------- /dataset/labelled_dense/make_labelled_parcel_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | bands='None' 4 | for ARGUMENT in "$@" 5 | do 6 | 7 | KEY=$(echo $ARGUMENT | cut -f1 -d=) 8 | VALUE=$(echo $ARGUMENT | cut -f2 -d=) 9 | 10 | case "$KEY" in 11 | 12 | ground_truths_file) ground_truths_file=${VALUE} ;; 13 | products_dir) products_dir=${VALUE} ;; 14 | labels_dir) labels_dir=${VALUE} ;; 15 | windows_dir) windows_dir=${VALUE} ;; 16 | timeseries_dir) timeseries_dir=${VALUE} ;; 17 | res) res=${VALUE} ;; 18 | sample_size) sample_size=${VALUE} ;; 19 | Npoly) Npoly=${VALUE} ;; 20 | num_processes) num_processes=${VALUE} ;; 21 | bands) bands=${VALUE} ;; 22 | *) 23 | esac 24 | 25 | done 26 | 27 | # 1:ground_truths_file, 
2:products_dir, 3:labels_dir, 4:windows_dir, 5:timeseries_dir, 6:res, 7:sample_size, 8:num_processes 28 | python dataset/labelled_dense/extract_labels_raster.py --ground_truths_file $ground_truths_file \ 29 | --products_dir $products_dir \ 30 | --savedir $labels_dir \ 31 | --res $res \ 32 | --sample_size $sample_size \ 33 | --num_processes $num_processes 34 | 35 | python dataset/labelled_dense/extract_parcel_ground_truths.py --ground_truths_file $ground_truths_file \ 36 | --raster_labels_dir $labels_dir \ 37 | --products_dir $products_dir \ 38 | --savedir $labels_dir \ 39 | --res $res \ 40 | --sample_size $sample_size \ 41 | --Npoly $Npoly \ 42 | --num_processes $num_processes 43 | 44 | python dataset/labelled_dense/extract_images_for_parcel_labels.py --ground_truths_dir $labels_dir \ 45 | --products_dir $products_dir \ 46 | --savedir $windows_dir \ 47 | --bands $bands \ 48 | --res $res \ 49 | --sample_size $sample_size \ 50 | --num_processes $num_processes 51 | 52 | python dataset/labelled_dense/make_image_timeseries_for_parcel_labels.py --ground_truths_dir $labels_dir \ 53 | --products_dir $products_dir \ 54 | --windows_dir $windows_dir \ 55 | --savedir $timeseries_dir \ 56 | --bands $bands \ 57 | --res $res \ 58 | --sample_size $sample_size \ 59 | --num_processes $num_processes 60 | -------------------------------------------------------------------------------- /dataset/labelled_dense/split_ground_truths_by_location.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "helpful-notion", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from sentinelsat import SentinelAPI, read_geojson, geojson_to_wkt\n", 13 | "import os\n", 14 | "from collections import OrderedDict\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "%matplotlib inline\n", 17 | "if __name__ == \"__main__\" and __package__ is None:\n", 18 | " from sys import path\n", 19 | " from os.path import dirname as dir\n", 20 | "\n", 21 | " path.append(dir(path[0]))\n", 22 | " __package__ = \"examples\"\n", 23 | "from utils.date_utils import get_doy" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "blank-eugene", 29 | "metadata": {}, 30 | "source": [ 31 | "### User input" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 12, 37 | "id": "organic-contribution", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "savedir = '/media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/ARISE/Tanzania/S2-products'\n", 42 | "year = '2022'\n", 43 | "date_range = ('0101', '0428') # (mindate: 'mmdd', maxdate: 'mmdd')\n", 44 | "cloudcoverpercentage = (0, 70) # (min %, max %)\n", 45 | "minprodsize = 400 # Mb\n", 46 | "numproducts = 60\n", 47 | "tile = '37MDN'\n", 48 | "platformname = 'Sentinel-2'\n", 49 | "processinglevel = 'Level-1C'" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "creative-destiny", 55 | "metadata": {}, 56 | "source": [ 57 | "### Read user credentials" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 13, 63 | "id": "following-stanford", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "cred = pd.read_csv(\"pw.csv\", header=None)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "id": "altered-antique", 73 | "metadata": {}, 74 | "source": [ 75 | "### Query for Sentinel products" 76 | ] 77 | }, 78 | { 79 | 
"cell_type": "code", 80 | "execution_count": 14, 81 | "id": "becoming-fifty", 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "querying...\n", 89 | "found 10 products\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "api = SentinelAPI(cred[0][0], cred[0][1], 'https://scihub.copernicus.eu/dhus')\n", 95 | "print(\"querying...\")\n", 96 | "products = api.query(tileid=tile,\n", 97 | " platformname=platformname,\n", 98 | " cloudcoverpercentage=cloudcoverpercentage,\n", 99 | " date=(\"%s%s\" % (year, date_range[0]), \"%s%s\" % (year, date_range[1])),\n", 100 | " processinglevel=processinglevel)\n", 101 | "df = api.to_dataframe(products)\n", 102 | "print(\"found %d products\" % len(products))" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "pressing-commerce", 108 | "metadata": {}, 109 | "source": [ 110 | "### Remove very small size products" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 15, 116 | "id": "nasty-adjustment", 117 | "metadata": { 118 | "pycharm": { 119 | "name": "#%%\n" 120 | } 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "keeping 10 products with larger than 400Mb\n", 128 | "number of products found (10) is less than initially selected (60)\n", 129 | "changing number of selected products to 10\n", 130 | "you may want to change selection criteria in 'User input' cell to find more products\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "sizes = np.array([float(s.split(\" \")[0]) for s in df['size'].values])\n", 136 | "products2keep = OrderedDict()\n", 137 | "for i, prodkey in enumerate(list(products.keys())):\n", 138 | " if sizes[i] >= minprodsize:\n", 139 | " # print(sizes[i])\n", 140 | " products2keep[prodkey] = products[prodkey]\n", 141 | "df2keep = api.to_dataframe(products2keep).reset_index()\n", 142 | "print(\"keeping %d products with larger than %dMb\" % (len(products2keep), minprodsize))\n", 143 | "\n", 144 | "if len(products2keep) < numproducts:\n", 145 | " print(\"number of products found (%d) is less than initially selected (%d)\" % (len(products2keep), numproducts))\n", 146 | " print(\"changing number of selected products to %d\" % (len(products2keep)))\n", 147 | " print(\"you may want to change selection criteria in 'User input' cell to find more products\")\n", 148 | " numproducts = len(products2keep)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "atomic-joseph", 154 | "metadata": { 155 | "pycharm": { 156 | "name": "#%% md\n" 157 | } 158 | }, 159 | "source": [ 160 | "### Spread products evenly in time and visualize" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 16, 166 | "id": "turkish-fetish", 167 | "metadata": { 168 | "pycharm": { 169 | "name": "#%%\n" 170 | } 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "ccfactor = 0.0 # cloud cover factor when selecting products\n", 175 | "def distance(doys, target_doy, ccfactor=0):\n", 176 | " \"\"\"\n", 177 | " distance function for selecting products depending on \n", 178 | " proximity to desired date and cloud cover\n", 179 | " \"\"\"\n", 180 | " dist = np.abs(doys['doy'] - target_doy) + ccfactor * doys['cloudcoverpercentage']\n", 181 | " return dist" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 17, 187 | "id": "delayed-benefit", 188 | "metadata": { 189 | "scrolled": true 190 | }, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | 
"output_type": "stream", 195 | "text": [ 196 | " doy size cloudcoverpercentage\n", 197 | "0 5 820.48 MB 55.1488\n", 198 | "1 40 794.63 MB 9.41109\n", 199 | "2 60 803.36 MB 33.9301\n", 200 | "3 65 768.85 MB 47.6807\n", 201 | "4 70 795.89 MB 36.565\n", 202 | "5 75 784.40 MB 66.0153\n", 203 | "6 85 812.33 MB 27.2806\n", 204 | "7 95 819.05 MB 27.7287\n", 205 | "8 100 806.83 MB 38.3053\n", 206 | "9 115 782.31 MB 65.9704\n" 207 | ] 208 | }, 209 | { 210 | "data": { 211 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcAAAAEGCAYAAADylEXaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAAXUklEQVR4nO3dfXRU9Z3H8c83TyRoiEGmIAEbLRAICSoIpwJSn9d2taiIx7Urxarbokj3KHXrATm1Xf+olt09rQ+cItKiUlkEV9d2rcX6RF3Q4BMxgsYttFJjggSJECCT/PaPewPDZCaZBJJM/L1f58zJzL3f+5vv/BLy4d6Z3GvOOQEA4JuM3m4AAIDeQAACALxEAAIAvEQAAgC8RAACALyU1ZniQYMGueLiYknS1q3BspKSY91Sao7l88ePlWjsVGq62mdXx+7KcwHoeZs2bdrpnIv0dh84UqcCsLi4WBUVFZKkc84Jlr344jHuKEXH8vnjx0o0dio1qYzd1edPRW9/TwAkZmbbe7sHtMUhUACAlwhAAICXCEAAgJc69R4gAKB3bNq06UtZWVkPSSoTOy+paJFUGY1Gb5gwYUJtogICEAD6gKysrIeGDBkyJhKJ1GdkZHAS5w60tLRYXV1daU1NzUOSvpmohv9FAEDfUBaJRPYQfqnJyMhwkUjkMwV7zIlrerAfAEDXZRB+nRPOV9KcIwABAF4iAAEAx9SMGTOKly9fXtjZ7bZu3ZqzZMmSgT31fAQgACAtfPDBB/1WrVrV6QDsKgIQANChPXv2ZJxzzjkjSkpKSkeOHDl26dKlha+88kr/iRMnlowdO3bM1KlTR27fvj07frtkNZWVlf0mT548qqSkpLS0tHTMu+++22/BggVFFRUVx48ePbr0rrvu+lI0GtV3v/vdYWVlZWNGjRpVeu+99w6SpJaWFs2aNevk4uLissmTJ4/auXNnl/6igT+DAIA+aNIkHdPT3r/2mra2t37t2rUDhgwZ0vTiiy9WS9Knn36aecEFF4z87W9/Wz106NDo0qVLC+fPn1+0evXqba3bHDhwwObNm3dyopprrrnmlPnz59fMmjVr9759+6y5udnuvvvuHYsXLx78wgsvVEvSz372s0EFBQXNlZWV7zU2NtrEiRNHX3rppXs2btzYv7q6ul91dXXlRx99lF1eXj529uzZn3b2NROAAIAOjR8/vnHBggXD58yZUzR9+vTPTjzxxOgHH3yQd955542Sgr2ySCTSFLvNO++80y9RTX19fcYnn3ySM2vWrN2S1L9/fyepzSdc161bN2DLli39n3766UJJamhoyKyqqsp96aWX8q+66qpdWVlZKi4ubjrrrLMauvKaCEAA6IM62mM71saNG3fgjTfeqFqzZk3BnXfeWTRt2rQ9I0aMaHzrrbe2JNvGOWeJaurr61N6+805Z4sXL/7LjBkz9sQuf+aZZwq69iqOxHuAAIAObdu2LTs/P7/lpptu2nXrrbfWVFRUHLdr166sdevWHScFhzsrKipyY7cZN27c/kQ1hYWFLUOGDDn4yCOPnCBJjY2N1tDQkFFQUND8+eefZ7Zuf+GFF3724IMPRg4cOGBSsEe5Z8+ejK997WsNTzzxxMBoNKrt27dnb9iwIb8rr4k9QABAhzZt2pR3xx13DMvIyFBWVpZ74IEHtmdlZbl58+ad3NDQkNnc3Gxz5sz55Mwzz9zfuk1ubq57/PHHP0xU8+ijj/75xhtv/PJPfvKTodnZ2W716tUfTpo0qTEzM9OVlJSUXnPNNTsXLlxYu23btn7l5eVjnHM2cODApt/97ncfXnvttbuff/75ASNGjCgbOnTogTPOOOPzrrwmAhAA0KEZM2bsmTFjRlX88oqKijaHYtesWbOt9f7kyZMbE9WUl5cf2LBhw/vxy+OX3XfffTsk7YivW7FixV9S7z4xDoECALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALrFyy+/3H/27NnDpeCP3SdPnjxq9OjRpUuXLu30pYu6A38HCADoFtOmTds3bdq0fZL06quv9pekLVu2tPlbwmSi0aiysrovptgDBACkZOvWrTkjR44c2/p40aJFg2+99dahkyZNKpkzZ05ReXn5mOLi4rJnn332eEl65pln8s8999wRO3bsyLruuutO2bx5c//Ro0eXvvvuu/2eeuqp/DFjxpSOGjWqdObMmcWNjY0mSUVFReVz5swpKi0tHfPwww8XFhUVld98881Fo0ePLi0rKxuzfv36/lOnTh05fPjwsnvuuSdyNK+HPUAA6IsmTTqml0PSa68d1cm1o9Gobd68+b1Vq1YV/PjHPx568cUXHzqjS1FRUfSBBx7Y3nqpo3379tn5559f8txzz20dN27cgcsvv7z43nvvjSxatKhWkk488cRoVVXVe5J01113DTv55JMPbtmyper6668f/p3vfKd448aNWxobGzPKy8vH3n777XVd7Zk9QADAUZs5c2a9JE2ePHnvRx99lNNe7dtvv507bNiwA+PGjTsgSbNnz/50/fr1h05oPWvWrPrY+quuumq3JJWXl+8bP3783sLCwpahQ4dGc3JyWnbu3JmpLmIPEAD6oqPcY+uKrKws19LScujx/v37D+1E5ebmurBGzc3NdjTPk5+f3xL7uHXsjIwM5eTkHLpuYEZGhpqamrr8XOwBAgBSMmzYsOiuXbuyampqMhsbG+33v/99l67Ld9ppp+3fsWNHTmVlZT9JWrFixYlnn312ly5qezTYAwQApKRfv37utttu+3jixIljBg8e3DRixIj9HW/VVv/+/d2SJUu2zZw58yvNzc067bTT9s2fP7/L7+V1FQEIAEjZwoULaxcuXFibbP1JJ50U3bFjx2ZJuuSSSxouueSShvj7kjR9+vSG6dOnt/mTiNZtEz2eN2/ep5I+TVbbWRwCBQB4iQAEAHiJAASAvqGlpaXlqD5d6ZtwvlqSrScAAaBvqKyrqysgBFPT0tJidXV1BZIqk9XwIRgA6AOi0egNNTU1D9XU1JSJnZdUtEiqjEajNyQrIAABoA+YMGFCraRv9nYf
XyT8LwIA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOAlAhAA4CUCEADgJQIQAOClzgVgNCq9/rpUV6emJqmhQaqra6e+ru5QfYc6Uyupad9B7avZo53vpVbf3vjxr6WpSfq8/qDqnztcn6jmiNefZPyk8xRT3+HYHfSf8Lk6OZ8A4B3nXMq3CWbOFRS4ppw8N9hqXGamc3l5zq1c6dpauTJYWVDQTlEXap1z6+eudGfrJTdFr7i9ynPr57Zf3974K1c6l5HhDr2WuXOdG2w17my95HYreK3r565sUxP7eP3cxOPHj33oaWP6iZ/L+LFXrkxtfmKfa1b2SteUk/p8AuhekipcJ37XcuuZmwXfm9ScaeZ+Ht6/SferUmPVrGzl5Ehr10qFheHK+nrpiiukgwcPb9ymqAu1knZvq1fut67QbVosSbpft2i/crT/sbU6obhtfXvj16uwzaoC1atY22Vq0f26RZK0Xzm6SM+pWdlthi9QvdbqCuXqyPF3L1+ry68rbPO0Ty6v1wnXHfmksXMZ70s5wfjWzvzEvsRMNek5XdSmn2TzCaD7TZkyZZNz7sze7gNH6vJ7gPfoBxqpaklSVpZUUxOzsqZGyo77Zd6mqAu1kuqrahRVtu7XLYcCKqos1Vclrm9v/ESrTlKNfqrbD43dOn7ra413koJ+4sevr6pJ+LT1VW2fNHYu4xVl1Kg5o/35iX0dI1WtpgT9JJtPAPBWZ3YXJ0jOhbe9ynODVOuk4ChbbW3M/n5tbbAwpr5tURdqnXN1VbVur46s36s8V1eVuL698ROtGqTE47e+1vhbonqXF/ST6Gnrqto+aXvjD8+tdS0dzE/s60jWT7L5BND9xCHQtLx1fg8wM1PRnDx9L3uZDg6IKC9PWrZMikRiaiKRYGFGhpSZqcRFXaiVNGhMRG/OXaZmZSiqTO1Tnt6cu0yDxiSub2/81lV5edKAAcHXq+dG9L3sw+NHc4Lx9+ZFDtXMnXt4m715QT/x4w8aE2kz9rJlQf+x/cTPZezYeXnSTx+OyDqYn9jXcXBA0L+z1OYTAHzVufcAjz/eVZSUSM8+qzpFtG2bVFzczu/WKVOk/fulZ5/t+BdwZ2olNU2aooN79qvxyWeTh1+K49fV6YjXUlcn5V4wRbnar+x1QX2imiNef5Lx29Ql6Cd+LhNuk8L8HLHdZZ2bTwDdx8x4DzANZXWq2kzKzw/2npTC79Xs7OCWyi/gztRKyu6frez+2ToulfDrYPxI5MjFkYikwmxJh+sT1RwxVJLx29QlqI+fy4TbpDA/R2zXyfkEAN/wh/AAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgAAAL5lzLvViswZJW7uvnW4zSNLO3m6ik/pizxJ99zT67lld7fvLzrnIsW4GRyerk/VbnXNndksn3cjMKvpa332xZ4m+exp996y+2jcS4xAoAMBLBCAAwEudDcBfdksX3a8v9t0Xe5bou6fRd8/qq30jgU59CAYAgC8KDoECALxEAAIAvJRSAJrZxWa21cyqzeyH3d3U0TCzbWa22czeMrOKcNlAM/uDmX0Qfi1Mgz4fNrNaM6uMWZawTwv8PJz/d8xsfJr1/SMz2xHO+Vtm9o2YdXeEfW81s7/rpZ6Hm9kLZlZlZu+a2ffD5Wk93+30ne7znWtmr5nZ22Hfd4XLTzGzjWF/q8wsJ1zeL3xcHa4vTrO+f2Vmf46Z79PD5Wnxc4Kj4Jxr9yYpU9KHkk6VlCPpbUmlHW3XWzdJ2yQNilt2j6Qfhvd/KOmnadDnNEnjJVV21Kekb0j6H0km6auSNqZZ3z+SND9BbWn489JP0inhz1FmL/R8kqTx4f18Se+HvaX1fLfTd7rPt0k6PryfLWljOI//KenqcPkSSXPC+zdJWhLev1rSql6a72R9/0rSlQnq0+LnhFvXb6nsAU6SVO2c+z/n3EFJj0uansJ26WS6pF+H938t6bLeayXgnHtZ0q64xcn6nC5phQtskHSCmZ3UI43GSdJ3MtMlPe6cO+Cc+7OkagU/Tz3KOfexc+6N8H6DpPckFSnN57udvpNJl/l2zrnPw4fZ4c1JOk/SE+Hy+Plu/T48Iel8M7Oe6fawdvpOJi1+TtB1qQRgkaS/xjz+SO3/I+xtTtJzZrbJzP4pXDbYOfdxeL9G0uDeaa1DyfrsC9+DueFhoIdjDjGnXd/h4bUzFPzvvs/Md1zfUprPt5llmtlbkmol/UHB3uhu51w0QW+H+g7XfybpxB5tOBTft3Oudb7vDuf7382sX7gsbeYbXfNF/BDMVOfceElfl3SzmU2LXemcc2r/f3Vpoa/0GXpQ0lcknS7pY0mLe7WbJMzseElrJP2zc25P7Lp0nu8Efaf9fDvnmp1zp0sapmAvdHTvdpSa+L7NrEzSHQr6nyhpoKR/6b0OcSylEoA7JA2PeTwsXJaWnHM7wq+1kp5U8I/vk9ZDE+HX2t7rsF3J+kzr74Fz7pPwF0eLpKU6fNgtbfo2s2wFIfKYc25tuDjt5zt
R331hvls553ZLekHSWQoOEbaefzi2t0N9h+sLJH3as50eKabvi8ND0c45d0DScqXxfKNzUgnA1yWNDD/BlaPgTeqnu7etrjGz48wsv/W+pIskVSro99th2bclPdU7HXYoWZ9PS5oVfursq5I+izl01+vi3ve4XMGcS0HfV4ef8jtF0khJr/VCfyZpmaT3nHP/FrMqrec7Wd99YL4jZnZCeD9P0oUK3r98QdKVYVn8fLd+H66U9Mdwj7xHJel7S8x/kkzB+5ax893rPyc4Cql8UkbBp53eV3Acf0Fvf3KnnT5PVfApuLclvdvaq4L3E56X9IGkdZIGpkGvv1Fw+KpJwXsH1yfrU8GnzO4P53+zpDPTrO9Hwr7eUfBL4aSY+gVh31slfb2Xep6q4PDmO5LeCm/fSPf5bqfvdJ/vcZLeDPurlLQoXH6qgkCulrRaUr9weW74uDpcf2qa9f3HcL4rJT2qw58UTYufE25dv3EqNACAl76IH4IBAKBDBCAAwEsEIADASwQgAMBLBCAAwEsEIHpEeAWD+d04fiS8ksCbZnZ2dz0PgC+OrI5LgD7hfEmbnXM39MSTmVmWO3xeSwB9EHuA6DZmtsDM3jez9ZJKYpbfaGavh9ddW2Nm/c0sP7zmWnZYMyD2ccy2xWb2x/DExM+b2cnh9dnukTQ9vF5bXkz9eWb2XzGPLzSzJ8P7F5nZ/5rZG2a2OjznpsxsUdhfpZn9svXKBGb2opn9hwXXmfx+d80bgJ5BAKJbmNkEBafNO13B2Usmxqxe65yb6Jw7TcEpsq53weV+XpT092HN1WFdU9zQv5D0a+fcOEmPSfq5c+4tSYsUXEfudOdcY0z9C5JGm1kkfHydpIfNbJCkhZIucMHJ0ysk3RrW3Bf2VyYpT9IlMePlOOfOdM6l3QmoAXQOAYjucrakJ51z+1xwBYPY88eWmdkrZrZZ0rckjQ2XP6QgoBR+XZ5g3LMkrQzvP6LgdGFJueBUR49I+sfwPI9nKbiI6VcVXED2T+Hlb74t6cvhZueG7yduVnANu7ExQ65q7/kA9B28B4je8CtJlznn3jaz2ZLOkSTn3J/CQ5znKLiSeWWyATppuaT/lrRf0mrnXDQ8rPkH59w/xBaaWa6kBxSc1/GvZvYjBeeqbLX3GPUEoJexB4ju8rKky8wsL7xCx6Ux6/IlfRy+v/etuO1WKNjDS7T3J0mvKjg8qnDbVzpqxDn3N0l/U3DIs3XcDZKmmNkI6dCVREbpcNjtDN8TvDJ+PABfDOwBols4594ws1UKrsxRq+CyWq3uVHBl87rwa37Musck/auCq04kcouk5Wb2g3D765LUxXtMUsQ5917YX1249/kbO3yF74XOuffNbKmCM//XxPUN4AuEq0EgrZjZlZKmO+euPcbj3ifpTefcsmM5LoC+iz1ApA0z+4Wkryv41OixHHeTgvfubjuW4wLo29gDBAB4iQ/BAAC8RAACALxEAAIAvEQAAgC8RAACALz0/1Vt0kiPfK/SAAAAAElFTkSuQmCC\n", 212 | "text/plain": [ 213 | "
" 214 | ] 215 | }, 216 | "metadata": { 217 | "needs_background": "light" 218 | }, 219 | "output_type": "display_data" 220 | } 221 | ], 222 | "source": [ 223 | "start_doy = get_doy(\"%s%s\" % (year, date_range[0]))\n", 224 | "end_doy = get_doy(\"%s%s\" % (year, date_range[1]))\n", 225 | "uniform_doy_list = np.linspace(start_doy, end_doy, numproducts).tolist()\n", 226 | "\n", 227 | "doys = df2keep.copy() # [['datatakesensingstart', 'cloudcoverpercentage', 'size']]\n", 228 | "doys['doy'] = pd.DataFrame(\n", 229 | " doys['datatakesensingstart'].apply(lambda s: get_doy(str(s).split(' ')[0].replace('-', ''))))\n", 230 | "\n", 231 | "idx_list = []\n", 232 | "for doy_ in uniform_doy_list:\n", 233 | " # print(doy_)\n", 234 | " doys['distance'] = distance(doys, doy_, ccfactor)\n", 235 | " idx = doys['distance'].argmin()\n", 236 | " idx_list.append(pd.DataFrame(doys.iloc[idx, :]).T)\n", 237 | " doys = doys.drop(index=idx).reset_index(drop=True)\n", 238 | "prod2keep = pd.concat(idx_list).reset_index(drop=True) # df2keep.iloc[idx_list].reset_index(drop=True)\n", 239 | "prod2keep['doy'] = pd.DataFrame(\n", 240 | " prod2keep['datatakesensingstart'].apply(lambda s: get_doy(str(s).split(' ')[0].replace('-', ''))))\n", 241 | "\n", 242 | "# visualize\n", 243 | "plt.scatter(prod2keep['doy'].values, np.zeros(prod2keep.shape[0]), s=20, c='b')\n", 244 | "plt.scatter(uniform_doy_list, np.zeros(len(uniform_doy_list)), s=20, c='r')\n", 245 | "plt.vlines(prod2keep['doy'].values, 0, 1, color='b', label='selected')\n", 246 | "plt.vlines(uniform_doy_list, 0, -1, color='r', label='uniform')\n", 247 | "plt.hlines(0, 1, 365, color='k', alpha=0.3)\n", 248 | "plt.ylim(-1, 1)\n", 249 | "plt.xlim(0, 365)\n", 250 | "plt.yticks([], [])\n", 251 | "plt.xlabel('day of year')\n", 252 | "plt.legend(bbox_to_anchor=(1.3, 1))\n", 253 | "\n", 254 | "# examine\n", 255 | "print(prod2keep[['doy', 'size', 'cloudcoverpercentage']]) # .columns)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "id": "auburn-nomination", 261 | "metadata": { 262 | "pycharm": { 263 | "name": "#%% md\n" 264 | } 265 | }, 266 | "source": [ 267 | "### Save selected products to disk" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 18, 273 | "id": "constitutional-newman", 274 | "metadata": { 275 | "pycharm": { 276 | "name": "#%%\n" 277 | } 278 | }, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | "saving products info to /media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/ARISE/Tanzania/S2-products/Sentinel-2_Level-1C_tile37MDN_minsize400Mb_10dates_year2022_from0101to0428_mincc0maxcc70.csv\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "savename = '%s/%s_%s_tile%s_minsize%dMb_%ddates_year%s_from%sto%s_mincc%dmaxcc%d.csv' % \\\n", 290 | " (savedir, platformname, processinglevel, tile, minprodsize, numproducts, year, \n", 291 | " date_range[0], date_range[1], cloudcoverpercentage[0], cloudcoverpercentage[1])\n", 292 | "\n", 293 | "if not os.path.exists(os.path.dirname(savename)):\n", 294 | " print(\"making new directory %s\" % os.path.dirname(savename))\n", 295 | " os.makedirs(os.path.dirname(savename))\n", 296 | "\n", 297 | "print(\"saving products info to %s\" % savename)\n", 298 | "prod2keep.to_csv(savename, index=False)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "id": "amino-blade", 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [] 308 | } 309 | ], 310 | "metadata": { 311 | 
"kernelspec": { 312 | "display_name": "Python 3", 313 | "language": "python", 314 | "name": "python3" 315 | }, 316 | "language_info": { 317 | "codemirror_mode": { 318 | "name": "ipython", 319 | "version": 3 320 | }, 321 | "file_extension": ".py", 322 | "mimetype": "text/x-python", 323 | "name": "python", 324 | "nbconvert_exporter": "python", 325 | "pygments_lexer": "ipython3", 326 | "version": "3.8.2" 327 | } 328 | }, 329 | "nbformat": 4, 330 | "nbformat_minor": 5 331 | } 332 | -------------------------------------------------------------------------------- /dataset/unlabelled/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaeltrs/DeepSatData/c2774597dc57af46f777ba2212fe026305d2fd1e/dataset/unlabelled/__init__.py -------------------------------------------------------------------------------- /dataset/unlabelled/extract_images.py: -------------------------------------------------------------------------------- 1 | """ 2 | Given a directory of Sentinel tiles extract crops of images 3 | """ 4 | import argparse 5 | import pandas as pd 6 | import rasterio 7 | import numpy as np 8 | import os 9 | from glob import glob 10 | import pickle 11 | if __name__ == "__main__" and __package__ is None: 12 | from sys import path 13 | from os.path import dirname as dir 14 | path.insert(0, dir(dir(path[0]))) 15 | __package__ = "examples" 16 | from utils.geospatial_data_utils import GeoTransform 17 | from utils.multiprocessing_utils import run_pool 18 | from utils.sentinel_products_utils import get_S2prod_info 19 | 20 | 21 | mult = {'B01': 1/6., 'B02': 1., 'B03': 1., 'B04': 1., 'B05': 1./2., 'B06': 1./2., 'B07': 1./2., 'B08': 1., 'B8A': 1./2, 22 | 'B09': 1./6., 'B10': 1./6., 'B11': 1./2., 'B12': 1./2.} 23 | 24 | 25 | def extract_images(imdirs): 26 | 27 | jp2s = ["%s.jp2" % i for i in bands] 28 | # print('jp2s: ', jp2s) 29 | 30 | refband = None 31 | for band in bands: 32 | if mult[band] == 1.0: 33 | refband = band 34 | break 35 | assert refband is not None, "in curerent implementation at least one 10m band should be included" 36 | 37 | saved_files_info = [] 38 | 39 | for ii, imdir in enumerate(imdirs): 40 | 41 | print("processing product %d of %d in current process" % (ii+1, len(imdirs))) 42 | 43 | imname = "%s_%s" % (imdir.split("/")[-2].split("_")[-3], imdir.split("/")[-4].split("_")[-5]) 44 | date = imdir.split("/")[-4].split(".")[0].split("_")[2][:8] 45 | 46 | # read product 47 | data = {} 48 | for jp2 in jp2s: 49 | with rasterio.open("%s/%s_%s" % (imdir, imname, jp2)) as f: 50 | data[jp2[:-4]] = f.read(1) 51 | 52 | if anchor is not None: 53 | Nanchor, Wanchor, CRSanchor = anchor 54 | 55 | geotransform_prod2anchor = GeoTransform(CRSanchor, str(f.crs).split(':')[1], loc2loc=True) 56 | Wp, Np = geotransform_prod2anchor(np.array(f.transform)[2], np.array(f.transform)[5]) 57 | 58 | dN = divmod((Np - Nanchor) / (sample_size * res), 1)[1] * sample_size * res 59 | dW = divmod((Wanchor - Wp) / (sample_size * res), 1)[1] * sample_size * res 60 | 61 | else: 62 | Wp, Np = np.array(f.transform)[2], np.array(f.transform)[5] 63 | dN = dW = 0 64 | 65 | num_rows = (data[refband].shape[0] * 10 - dN) / (sample_size * res) 66 | num_cols = (data[refband].shape[0] * 10 - dW) / (sample_size * res) 67 | 68 | prod_savedir = os.path.join(savedir, imdir.split("/")[-4].split(".")[0]) 69 | if not os.path.exists(prod_savedir): 70 | os.makedirs(prod_savedir) 71 | 72 | for i in range(int(num_rows)): 73 | 74 | for j in range(int(num_cols)): 75 | 76 | Nij 
= Np - dN - i * res * sample_size # N for extracted label window 77 | Wij = Wp + dW + j * res * sample_size # W for extracted label window 78 | 79 | ip = (Np - Nij) / (res * sample_size) # product row 80 | jp = (Wij - Wp) / (res * sample_size) # product column 81 | 82 | sample = {} 83 | for jp2 in jp2s: 84 | xpmin = int(np.round(mult[jp2[:-4]] * ip * sample_size)) 85 | ypmin = int(np.round(mult[jp2[:-4]] * jp * sample_size)) 86 | sample[jp2[:-4]] = data[jp2[:-4]][xpmin: xpmin + int(mult[jp2[:-4]] * sample_size), 87 | ypmin: ypmin + int(mult[jp2[:-4]] * sample_size)] 88 | 89 | if sample[jp2[:-4]].sum() == 0: 90 | saved_files_info.append( 91 | [None, Nij, Wij, Np, Wp, i, j, ip, jp, sample_size, sample_size, date, imdir, "no image"]) 92 | continue 93 | 94 | sample_save_path = "%s/N%d_W%d_D%s.pickle" % (prod_savedir, int(Nij), int(Wij), date) 95 | with open(sample_save_path, 'wb') as handle: 96 | pickle.dump(sample, handle, protocol=pickle.HIGHEST_PROTOCOL) 97 | 98 | saved_files_info.append( 99 | [sample_save_path, Nij, Wij, Np, Wp, i, j, ip, jp, sample_size, sample_size, date, imdir, "ok"]) 100 | 101 | df = pd.DataFrame(data=saved_files_info, 102 | columns=['sample_path', 'Nij', 'Wij', 'Np', 'Wp', 'il', 'jl', 'ip', 'jp', 103 | 'height', 'width', 'Date', 'S2_prod_imdir', "comment"]) 104 | # print('process finished') 105 | return df 106 | 107 | 108 | def main(): 109 | 110 | # sentinel products 111 | imdirs = glob("%s/*.SAFE/GRANULE/**/IMG_DATA" % products_dir) 112 | prod_df = get_S2prod_info(imdirs) 113 | prod_df['Year'] = prod_df['Time'].apply(lambda s: s[:4]) 114 | years = prod_df['Year'].drop_duplicates().tolist() 115 | 116 | out = [] 117 | for year in years: 118 | 119 | # sentinel products 120 | products = prod_df[prod_df['Year'] == year] 121 | imdirs = products['path'].tolist() 122 | 123 | df_year = run_pool(imdirs, extract_images, num_processes) 124 | # print('process finished 2') 125 | 126 | out.append(pd.concat(df_year)) 127 | # print('process finished 3') 128 | 129 | # print('pool finished') 130 | df = pd.concat(out).reset_index(drop=True) 131 | df.to_csv(os.path.join(savedir, "extracted_windows_data_info.csv"), index=False) 132 | 133 | 134 | if __name__ == "__main__": 135 | 136 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 137 | parser.add_argument('--products_dir', help='directory containing sentinel products') 138 | parser.add_argument('--savedir', help='save directory to extract ground truths in raster mode') 139 | parser.add_argument('--bands', default=None, help='which satellite image bands to use') 140 | parser.add_argument('--res', default=10, help='pixel size in meters') 141 | parser.add_argument('--anchor', default=None, help='anchor point for grid (N, W, crs)') 142 | parser.add_argument('--sample_size', default=24, help='spatial resolution of dataset samples') 143 | parser.add_argument('--num_processes', default=4, help='number of parallel processes') 144 | # --------------------------------------------------------------------------------------------- 145 | 146 | args = parser.parse_args() 147 | 148 | products_dir = args.products_dir 149 | 150 | bands = args.bands 151 | if bands == 'None': 152 | bands = list(mult.keys()) 153 | else: 154 | bands = bands.split(',') 155 | 156 | savedir = args.savedir 157 | if not os.path.exists(savedir): 158 | os.makedirs(savedir) 159 | 160 | res = int(args.res) 161 | 162 | sample_size = int(args.sample_size) 163 | 164 | num_processes = int(args.num_processes) 165 | 166 | anchor = args.anchor 167 | if anchor == 
'None': 168 | anchor = None 169 | else: 170 | anchor = [int(i) for i in anchor.split(",")] 171 | 172 | main() 173 | -------------------------------------------------------------------------------- /dataset/unlabelled/make_image_timeseries.py: -------------------------------------------------------------------------------- 1 | """ 2 | For a set of extracted image crops, make a timeseries for all locations 3 | """ 4 | import argparse 5 | import pandas as pd 6 | import numpy as np 7 | import os 8 | import shutil 9 | import pickle 10 | from multiprocessing import Pool 11 | if __name__ == "__main__" and __package__ is None: 12 | from sys import path 13 | from os.path import dirname as dir 14 | path.insert(0, dir(dir(path[0]))) 15 | __package__ = "examples" 16 | from utils.date_utils import get_doy 17 | from utils.multiprocessing_utils import split_num_segments 18 | 19 | 20 | mult = {'B01': 1/6., 'B02': 1., 'B03': 1., 'B04': 1., 'B05': 1./2., 'B06': 1./2., 'B07': 1./2., 'B08': 1., 'B8A': 1./2, 21 | 'B09': 1./6., 'B10': 1./6., 'B11': 1./2., 'B12': 1./2.} 22 | 23 | 24 | def make_image_timeseries(inputs): 25 | rank, yearlocs, yearloc_groups, iminfo, year_savedir = inputs#[0] 26 | 27 | refband = bands[0] 28 | 29 | saved_files_info = [] 30 | for ii, yearloc in enumerate(yearlocs): 31 | # ii, yearloc = 0, yearlocs[0] 32 | if ii % 1e3 == 0: 33 | print("process %d, location %d of %d" % (rank, ii+1, len(yearlocs))) 34 | 35 | idx = yearloc_groups[yearloc] 36 | data = iminfo.iloc[idx, :].sort_values(by='DOY').copy() 37 | data = data.drop_duplicates(subset=['DOY'], keep='first') # some products downloaded twice 38 | 39 | Y = data['Year'].iloc[0] 40 | N = data['Nij'].iloc[0] 41 | W = data['Wij'].iloc[0] 42 | il = data['il'].iloc[0] 43 | jl = data['jl'].iloc[0] 44 | 45 | assert all(data['Year'] == Y) 46 | assert all(data['Nij'] == N) 47 | assert all(data['Wij'] == W) 48 | assert all(data['il'] == il) 49 | assert all(data['jl'] == jl) 50 | 51 | timeseries_sample = {band: [] for band in bands} 52 | timeseries_sample['doy'] = [] 53 | # timeseries_sample = {'B01': [], 'B02': [], 'B03': [], 'B04': [], 'B05': [], 'B06': [], 'B07': [], 54 | # 'B08': [], 'B8A': [], 'B09': [], 'B10': [], 'B11': [], 'B12': [], 'doy': []} 55 | for sample_info in data[['sample_path', 'DOY']].values: 56 | 57 | impath, doy = sample_info 58 | 59 | with open(impath, 'rb') as handle: 60 | sample = pickle.load(handle, encoding='latin1') 61 | 62 | # for key in ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B10', 'B11', 'B12']: 63 | for key in bands: 64 | timeseries_sample[key].append(sample[key]) 65 | timeseries_sample['doy'].append(np.array(doy)) 66 | 67 | # for key in ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B10', 'B11', 'B12', 'doy']: 68 | for key in bands: 69 | timeseries_sample[key] = np.stack(timeseries_sample[key]) 70 | timeseries_sample['doy'] = np.stack(timeseries_sample['doy']) 71 | timeseries_sample['year'] = np.array(Y).astype(np.int32) 72 | 73 | timesteps = timeseries_sample[refband].shape[0] 74 | 75 | savename = os.path.join(year_savedir, "%d_%d_%s.pickle" % (int(N), int(W), Y)) 76 | with open(savename, 'wb') as handle: 77 | pickle.dump(timeseries_sample, handle, protocol=pickle.HIGHEST_PROTOCOL) 78 | 79 | saved_files_info.append([savename, Y, N, W, sample_size, sample_size, timesteps, il, jl, "completed"]) 80 | 81 | saved_files_info = pd.DataFrame(data=saved_files_info, columns=['sample_path', 'Year', 'N', 'W', 'dy', 'dx', 'dt', 82 | 'win_i', 'win_j', 'status']) 83 | return 
saved_files_info 84 | 85 | 86 | def main(): 87 | 88 | global yearloc_groups 89 | global iminfo 90 | global year_savedir 91 | 92 | iminfo = pd.read_csv(os.path.join(windows_dir, "extracted_windows_data_info.csv")) 93 | iminfo = iminfo[~pd.isnull(iminfo['sample_path'])].reset_index(drop=True) 94 | iminfo['DOY'] = iminfo['Date'].apply(lambda s: get_doy(str(s))) 95 | iminfo['Year'] = iminfo['Date'].apply(lambda s: str(s)[:4]) 96 | years = iminfo['Year'].drop_duplicates().tolist() 97 | print("found windows for years %s" % ", ".join(years)) 98 | 99 | pool = Pool(num_processes) 100 | 101 | saved_files_info = [] 102 | 103 | for year in set(years): 104 | 105 | year_savedir = os.path.join(savedir, year) 106 | if not os.path.isdir(year_savedir): 107 | os.makedirs(year_savedir) 108 | 109 | yearloc_groups = iminfo[iminfo['Year'] == year].copy().groupby(['Nij', 'Wij'], as_index=False).groups 110 | yearlocs = list(yearloc_groups.keys()) 111 | 112 | inputs = [[i, yearlocs_, yearloc_groups, iminfo, year_savedir] 113 | for i, yearlocs_ in enumerate(split_num_segments(yearlocs, num_processes))] 114 | 115 | df = pool.map(make_image_timeseries, inputs) 116 | df = pd.concat(df) 117 | 118 | saved_files_info.append(df) 119 | 120 | df = pd.concat(saved_files_info).reset_index(drop=True) 121 | df.to_csv(os.path.join(savedir, "saved_timeseries_data_info.csv"), index=False) 122 | 123 | paths = df['sample_path'].apply(lambda s: s[len(savedir)+1:]) 124 | paths.to_csv(os.path.join(savedir, "data_paths.csv"), header=None, index=False) 125 | 126 | # delete windows dir 127 | shutil.rmtree(windows_dir) 128 | 129 | 130 | 131 | if __name__ == "__main__": 132 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 133 | parser.add_argument('--windows_dir', help='directory containing sentinel products') 134 | parser.add_argument('--savedir', help='save directory to extract ground truths in raster mode') 135 | parser.add_argument('--bands', default=None, help='which satellite image bands to use') 136 | parser.add_argument('--res', default=10, help='pixel size in meters') 137 | parser.add_argument('--sample_size', default=24, help='spatial resolution of dataset samples') 138 | parser.add_argument('--num_processes', default=4, help='number of parallel processes') 139 | # --------------------------------------------------------------------------------------------- 140 | 141 | args = parser.parse_args() 142 | 143 | windows_dir = args.windows_dir 144 | 145 | savedir = args.savedir 146 | if not os.path.exists(savedir): 147 | os.makedirs(savedir) 148 | 149 | res = int(args.res) 150 | 151 | sample_size = int(args.sample_size) 152 | 153 | num_processes = int(args.num_processes) 154 | 155 | bands = args.bands 156 | if bands == 'None': 157 | bands = list(mult.keys()) 158 | else: 159 | bands = bands.split(',') 160 | 161 | main() 162 | -------------------------------------------------------------------------------- /dataset/unlabelled/make_unlabelled_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | anchor='None' 4 | bands='None' 5 | for ARGUMENT in "$@" 6 | do 7 | 8 | KEY=$(echo $ARGUMENT | cut -f1 -d=) 9 | VALUE=$(echo $ARGUMENT | cut -f2 -d=) 10 | 11 | case "$KEY" in 12 | 13 | products_dir) products_dir=${VALUE} ;; 14 | windows_dir) windows_dir=${VALUE} ;; 15 | timeseries_dir) timeseries_dir=${VALUE} ;; 16 | res) res=${VALUE} ;; 17 | sample_size) sample_size=${VALUE} ;; 18 | num_processes) num_processes=${VALUE} ;; 19 | anchor) anchor=${VALUE} 
;; 20 |     bands)              bands=${VALUE} ;; 21 |     *) 22 | esac 23 | 24 | done 25 | 26 | 27 | python dataset/unlabelled/extract_images.py --products_dir $products_dir \ 28 |                                             --bands $bands \ 29 |                                             --savedir $windows_dir \ 30 |                                             --anchor $anchor \ 31 |                                             --res $res \ 32 |                                             --sample_size $sample_size \ 33 |                                             --num_processes $num_processes 34 | 35 | python dataset/unlabelled/make_image_timeseries.py --windows_dir $windows_dir \ 36 |                                                    --savedir $timeseries_dir \ 37 |                                                    --bands $bands \ 38 |                                                    --res $res \ 39 |                                                    --sample_size $sample_size \ 40 |                                                    --num_processes $num_processes 41 | -------------------------------------------------------------------------------- /diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaeltrs/DeepSatData/c2774597dc57af46f777ba2212fe026305d2fd1e/diagram.png -------------------------------------------------------------------------------- /download/README.md: -------------------------------------------------------------------------------- 1 | # Data download 2 | 3 | ## General Description 4 | We propose to split the task of downloading all relevant products for an AOI during a specific time period into the 5 | following subtasks: 6 | 1) for an AOI find all overlapping Sentinel tiles using **`find_S2_tiles_for_aoi.ipynb`** 7 | 2) make a list of all products to download for each tile and period of interest and save a file with all selected 8 | products to disk using **`find_S2_products_for_tile.ipynb`**. During this step some compromises might be needed to reduce 9 | the total download time. 10 | 3) download all selected products with **`download.sh`** 11 | 12 | Downloading data can take a significant amount of time. 13 | We propose to perform steps 1 and 2 manually using the provided .ipynb files to ensure an optimal selection of products and 14 | automate the final part of downloading a list of pre-selected products. 15 | 16 | ## Authentication 17 | All scripts make use of the [sentinelsat](https://github.com/sentinelsat/sentinelsat) library for querying and 18 | downloading Sentinel products from the ESA [Copernicus Open Access Hub](https://scihub.copernicus.eu/) (COAH). 19 | You will need to [sign up](https://scihub.copernicus.eu/dhus/#/self-registration) to COAH and save the user name and 20 | password in a two-row file **pw.csv** with the following form: 21 | ``` 22 | username 23 | password 24 | ``` 25 | 26 | ### Find Sentinel tiles for AOI 27 | Notebook **`find_S2_tiles_for_aoi.ipynb`** 28 | - AOI is defined as a rectangle. Define the coordinates of the North-West (NW) and South-East (SE) corners of the AOI 29 | as well as the coordinate system used 30 | - output is all tiles that overlap with the defined rectangle and the part of the area of the rectangle covered by each tile. 31 | Note that for very large AOIs all tiles will cover a small portion of the defined rectangle 32 | 33 | ### Find a list of products 34 | Notebook **`find_S2_products_for_tile.ipynb`** 35 | - specify the following parameters: 36 |     - savedir: where to save the products list 37 |     - year: 'yyyy' 38 |     - date_range: minimum and maximum dates for the same year (mindate: 'mmdd', maxdate: 'mmdd') 39 |     - cloudcoverpercentage: minimum and maximum percentage of cloud cover (min %, max %) 40 |     - minprodsize: minimum size of product in Mb 41 |     - numproducts: number of products to select 42 |     - tile: Sentinel tile name e.g. '35PNK' from the previous step 43 |     - platformname: Sentinel mission name i.e. 'Sentinel-2' 44 |     - processinglevel: processing level of products i.e.
'Level-1C' 45 | - the script queries COAH for available products given the parameters set and selects products such that they are 46 | equally spaced in the defined time period. All selected products are saved in a .csv file. 47 | 48 | ### Download data from file 49 | Pass one or more generated .csv files containing selected products to **download.sh** separated by commas: 50 | ``` 51 | sh download/download.sh file1.csv,file2.csv,... 52 | ``` 53 | All products will be downloaded in the parent directory of the first .csv file. 54 | -------------------------------------------------------------------------------- /download/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaeltrs/DeepSatData/c2774597dc57af46f777ba2212fe026305d2fd1e/download/__init__.py -------------------------------------------------------------------------------- /download/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | x=1 3 | while [ $x -le 1000 ] 4 | do 5 | echo "Attempt $x" 6 | python download/sentinelsat_download_tileid.py --products_file $1 7 | x=$(( $x + 1 )) 8 | sleep 1800 9 | done 10 | -------------------------------------------------------------------------------- /download/find_S2_products_for_tile.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 51, 6 | "id": "helpful-notion", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from sentinelsat import SentinelAPI, read_geojson, geojson_to_wkt\n", 13 | "import os\n", 14 | "from collections import OrderedDict\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "%matplotlib inline\n", 17 | "if __name__ == \"__main__\" and __package__ is None:\n", 18 | " from sys import path\n", 19 | " from os.path import dirname as dir\n", 20 | "\n", 21 | " path.append(dir(path[0]))\n", 22 | " __package__ = \"examples\"\n", 23 | "from utils.date_utils import get_doy" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "blank-eugene", 29 | "metadata": {}, 30 | "source": [ 31 | "### User input" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 52, 37 | "id": "organic-contribution", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "savedir = ''\n", 42 | "year = '2021'\n", 43 | "date_range = ('0101', '0530') # (mindate: 'mmdd', maxdate: 'mmdd')\n", 44 | "cloudcoverpercentage = (0, 70) # (min %, max %)\n", 45 | "minprodsize = 400 # Mb\n", 46 | "numproducts = 40\n", 47 | "tile = '32UPU'\n", 48 | "platformname = 'Sentinel-2'\n", 49 | "processinglevel = 'Level-1C'" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "creative-destiny", 55 | "metadata": {}, 56 | "source": [ 57 | "### Read user credentials" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 53, 63 | "id": "following-stanford", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "cred = pd.read_csv(\"pw.csv\", header=None)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "id": "altered-antique", 73 | "metadata": {}, 74 | "source": [ 75 | "### Query for Sentinel products" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 54, 81 | "id": "becoming-fifty", 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | 
"querying...\n", 89 | "found 22 products\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "api = SentinelAPI(cred[0][0], cred[0][1], 'https://scihub.copernicus.eu/dhus')\n", 95 | "print(\"querying...\")\n", 96 | "products = api.query(tileid=tile,\n", 97 | " platformname=platformname,\n", 98 | " cloudcoverpercentage=cloudcoverpercentage,\n", 99 | " date=(\"%s%s\" % (year, date_range[0]), \"%s%s\" % (year, date_range[1])),\n", 100 | " processinglevel=processinglevel)\n", 101 | "df = api.to_dataframe(products)\n", 102 | "print(\"found %d products\" % len(products))" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "pressing-commerce", 108 | "metadata": {}, 109 | "source": [ 110 | "### Remove very small size products" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 55, 116 | "id": "nasty-adjustment", 117 | "metadata": { 118 | "pycharm": { 119 | "name": "#%%\n" 120 | } 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "keeping 22 products with larger than 400Mb\n", 128 | "number of products found (22) is less than initially selected (40)\n", 129 | "changing number of selected products to 22\n", 130 | "you may want to change selection criteria in 'User input' cell to find more products\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "sizes = np.array([float(s.split(\" \")[0]) for s in df['size'].values])\n", 136 | "products2keep = OrderedDict()\n", 137 | "for i, prodkey in enumerate(list(products.keys())):\n", 138 | " if sizes[i] >= minprodsize:\n", 139 | " # print(sizes[i])\n", 140 | " products2keep[prodkey] = products[prodkey]\n", 141 | "df2keep = api.to_dataframe(products2keep).reset_index()\n", 142 | "print(\"keeping %d products with larger than %dMb\" % (len(products2keep), minprodsize))\n", 143 | "\n", 144 | "if len(products2keep) < numproducts:\n", 145 | " print(\"number of products found (%d) is less than initially selected (%d)\" % (len(products2keep), numproducts))\n", 146 | " print(\"changing number of selected products to %d\" % (len(products2keep)))\n", 147 | " print(\"you may want to change selection criteria in 'User input' cell to find more products\")\n", 148 | " numproducts = len(products2keep)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "id": "atomic-joseph", 154 | "metadata": { 155 | "pycharm": { 156 | "name": "#%% md\n" 157 | } 158 | }, 159 | "source": [ 160 | "### Spread products evenly in time and visualize" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 56, 166 | "id": "turkish-fetish", 167 | "metadata": { 168 | "pycharm": { 169 | "name": "#%%\n" 170 | } 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "ccfactor = 0.0 # cloud cover factor when selecting products\n", 175 | "def distance(doys, target_doy, ccfactor=0):\n", 176 | " \"\"\"\n", 177 | " distance function for selecting products depending on \n", 178 | " proximity to desired date and cloud cover\n", 179 | " \"\"\"\n", 180 | " dist = np.abs(doys['doy'] - target_doy) + ccfactor * doys['cloudcoverpercentage']\n", 181 | " return dist" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 57, 187 | "id": "delayed-benefit", 188 | "metadata": { 189 | "scrolled": true 190 | }, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | " doy size cloudcoverpercentage\n", 197 | "0 8 509.94 MB 64.8212\n", 198 | "1 13 632.77 MB 25.6059\n", 199 | "2 21 843.46 MB 20.0165\n", 200 | "3 36 790.31 MB 
69.4045\n", 201 | "4 43 699.40 MB 10.1653\n", 202 | "5 48 605.60 MB 34.0831\n", 203 | "6 53 620.62 MB 24.2112\n", 204 | "7 56 789.05 MB 0\n", 205 | "8 58 569.29 MB 53.8954\n", 206 | "9 66 804.37 MB 15.0384\n", 207 | "10 71 804.90 MB 39.1288\n", 208 | "11 83 625.95 MB 0\n", 209 | "12 88 607.89 MB 6.254\n", 210 | "13 91 802.14 MB 23.6216\n", 211 | "14 111 808.88 MB 22.0033\n", 212 | "15 113 611.37 MB 0\n", 213 | "16 116 799.76 MB 25.1599\n", 214 | "17 118 596.73 MB 30.6784\n", 215 | "18 136 800.57 MB 68.9963\n", 216 | "19 143 567.81 MB 57.6438\n", 217 | "20 63 510.67 MB 46.9177\n", 218 | "21 61 804.79 MB 0.0178\n" 219 | ] 220 | }, 221 | { 222 | "data": { 223 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcAAAAEGCAYAAADylEXaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAAY70lEQVR4nO3df3RV5Z3v8c9zcvJTQwoSgQRs7ACBQFBBWDUVarV6bccO7UIcrzNiqO3torXYUcZbF5RV2+m9a7TednWodRVHW20ZGcTeem2nWq2/GK/QYFFCJEpvYSpjTFCEADHJyfneP/Y+5CQ55+QHJDnxeb/WOivnPHs/+/meJ4FP9t4nezszEwAAvomMdgEAAIwGAhAA4CUCEADgJQIQAOAlAhAA4KXoYFaeOHGiVVRU9GlvbAy+VlZmbhvIskzS9RvI+MmvE88TEm1tbVJhYc91Bvqe+qs11fiDff/pxklVd8JQxxiO7x/gq507dx4ys9LRrgM9DSoAKyoqVFdX16f9kkuCr88+m7ltIMsySddvIOMnv048T0i07dolnX9+z3UG+p76qzXV+IN9/+nGSVV3wlDHGI7vH+Ar59yB0a4BfXEIFADgJQIQAOAlAhAA4KVBnQMEAIyOnTt3nh2NRu+TNFfsvAxEXFJ9LBb7woIFC5pTrUAAAsAYEI1G75s8efLs0tLSw5FIhIs49yMej7uWlpaqpqam+yT9Vap1+C0CAMaGuaWlpUcJv4GJRCJWWlp6RMEec+p1RrAeAMDQRQi/wQnnK23OEYAAAC8RgACA02rZsmUVDzzwwPjB9mtsbMy79957J4zUeAQgACArvPHGG/mbN28edAAOFQEIAOjX0aNHI5dccsn0ysrKqhkzZszZuHHj+BdeeKFo4cKFlXPmzJl98cUXzzhw4EBu737p1qmvr8+vqamZWVlZWVVVVTV7z549+WvXri2vq6s7c9asWVV33HHH2bFYTF/60pemzp07d/bMmTOr7rrrromSFI/HtWLFinMqKirm1tTUzDx06NCQ/qKBP4MAgDFo0SKd1svR79ihxkzLH3300XGTJ0/ufPbZZ/dJ0jvvvJPzyU9+csavfvWrfWVlZbGNGzeOX7NmTfmWLVv2J/q0t7e71atXn5Nqneuuu+7cNWvWNK1YseK9EydOuK6uLved73zn4N133z3pmWee2SdJ3/3udyeWlJR01dfXv9bW1uYWLlw46zOf+czR7du3F+3bty9/37599W+++WZudXX1nNra2ncG+54JQABAv+bPn9+2du3aaatWrSpfunTpkbPOOiv2xhtvFF566aUzpWCvrLS0tDO5z6uvvpqfap3Dhw9H3n777bwVK1a8J0lFRUUmqc8nXJ966qlxe/fuLXrsscfGS1Jra2tOQ0NDwXPPPVd8zTXXvBuNRlVRUdF50UUXtQ7lPRGAADAG9bfHdrrNmzev/eWXX27YunVryTe+8Y3yJUuWHJ0+fXrbrl279qbrY2Yu1TqHDx8e0Ok3M3N33333fyxbtuxocvvjjz9eMrR30RPnAAEA/dq/f39ucXFx/Mtf/vK7t9xyS1NdXd0Z7777bvSpp546QwoOd9bV1RUk95k3b977qdYZP358fPLkyR0PPfTQhySpra3Ntba2RkpKSrqOHTuWk+h/+eWXH/nRj35U2t7e7qRgj/Lo0aORj3/8462PPPLIhFgspgMHDuS+9NJLxUN5T+wBAgD6tXPnzsLbb799aiQSUTQatXvuuedANBq11atXn9Pa2prT1dXlVq1a9faFF174fqJPQUGBPfzww39Mtc7PfvazP33xi1/88Le//e2y3Nxc27Jlyx8XLVrUlpOTY5WVlVXXXXfdoXXr1jXv378/v7q6eraZuQkTJnT++te//uP111//3tNPPz1u+vTpc8vKytovuOCCY0N5TwQgAKBfy5YtO7ps2bKG3u11dXV9DsVu3bp1f+J5TU1NW6p1qqur21966aXXe7f3btuwYcNBSQd7r/fggw/+x8CrT41DoAAALxGAAAAvEYAAAC8RgAAALxGAAAAvEYAAAC8RgACAYfH8888X1dbWTpOCP3avqamZOWvWrKqNGzcO+tZFw4G/AwQADIslS5acWLJkyQlJevHFF4skae/evX3+ljCdWCymaHT4Yoo9QADAgDQ2NubNmDFjTuL1+vXrJ91yyy1lixYtqly1alV5dXX17IqKirm/+c1vzpSkxx9/vPgTn/jE9IMHD0ZXrlx57u7du4tmzZpVtWfPnvxf/vKXxbNnz66aOXNm1fLlyyva2tqcJJWXl1evWrWqvKqqavb9998/vry8vPorX/lK+axZs6rmzp07e9u2bUUXX3zxjGnTps298847S0/l/bAHCABj0aJFp/V2SNqx45Qurh2Lxdzu3btf27x5c8m3vvWtsiuvvPLkFV3Ky8tj99xzz4HErY5OnDjhLrvsssonn3yycd68ee2f+9znKu66667S9evXN0vSWWedFWtoaHhNku64446p55xzTsfevXsbbrzxxmmf//znK7Zv3763ra0tUl1dPee2225rGWrN7AECAE7Z8uXLD0tSTU3N8TfffDMv07qvvPJKwdSpU9vnzZvXLkm1tbXvbNu27eQFrVesWHE4ef1rrrnmPUmqrq4+MX/+/OPjx4+Pl5WVxfLy8uKHDh3K0RCxBwgAY9Ep7rENRTQatXg8fvL1+++/f3InqqCgwMJ11NXV5U5lnOLi4njy68S2I5GI8vLyTt43MBKJqLOzc8hjsQcIABiQqVOnxt59991oU1NTTltbm3viiSeGdF++88477/2DBw/m1dfX50vSgw8+eNbixYuHdFPbU8EeIABgQPLz8+3WW299a+HChbMnTZrUOX
369Pf779VXUVGR3XvvvfuXL1/+F11dXTrvvPNOrFmzZsjn8oaKAAQADNi6deua161b15xu+ZQpU2IHDx7cLUlXXXVV61VXXdXa+7kkLV26tHXp0qV9/iQi0TfV69WrV78j6Z106w4Wh0ABAF4iAAEAXiIAAWBsiMfj8VP6dKVvwvmKp1tOAALA2FDf0tJSQggOTDwedy0tLSWS6tOtw4dgAGAMiMViX2hqarqvqalprth5GYi4pPpYLPaFdCsQgAAwBixYsKBZ0l+Ndh0fJPwWAQDwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8BIBCADwEgEIAPASAQgA8NLgAjAWk37/e6mlpUdz54kOnWg6qkOvdbd3dkqtrQraevXp7JSOHe7Q4Sf7bktS0DbAcdJtr/f4nSc61NoaLE4sazvSvb3OzuDtHT4c9IkdPqpjhzt6lJCx7l41J8ZoaQmWJW8v09z0NwepxklV94kTPccf7Djp5jrjskzjAEC2MbMBPxY4Z1ZSYlZYaLZpk5mZbbtpky3Wc/YxvWDHVWjbbtpkmzaZRSJmkyNNdlyF1l7Y3WfTJrNJrskW6zl7TyXWmde9LTMLnhcWDmicxOq9t7ftpk09xj+RV2KL9ZxNjjRZbq6Zc919Ets7W00mmZUq6PMxvWCL9ZxdnxvUnLHuXjUnxs/JMVuRu8nao93buy6yyZxLPTf9zUGqcZyzlHWXqskikWD8zrwU2xrCXGdclq5mACapzgbxfy2PkXlEB5mWevHIkeB5ba2OHDHlb1ipVt0tSdqlNrVvqNXXfjxOLn6mJmmPdqlNamsLutfWar2ZJtt7alVce3RE6gja3bhxJ7erjo6TfTKN8/CccfrazdIUO9Bje+0bahXV2ZqkxmD8jja16lVNiterLf5hHdOZmqw9alX85PamaKtaVK0y7dYutem4dgXDd96qa28YJ+f6jmO1tXJm0sqVPWpu31ArpzJJ0g1dtapTx8nt3Ri/VY/obE2yxh5zo9paKcMcKM04EZWpSzkq06s96i5TvY7EK3VDvFY71CF1DGycdHP964uCPgUbavss+7c5ppKbV/atedw4afz4Af94AcBIGlwA9ugZ1bEdDcpRrn6or55sjimqKWrSBEn/qNt6dOmKRDUn3qBV+r6KdbxHe7SpKXiRmxv8RzqAcf68o0nlEel/6rYe24spqkv1jL6m759s+6G+qmMq0s36gRTWltznTv29vqe/O9knMdYxFanMNaXs0xWJKtrQ0KfmmKKaoX2SpE7lqkAdPbbXu7bE+1SGOVA/4yTmOnmc7+nvTo4/0HHSzfXhhqBPaYplx3Y0qCRVzU1NBCCA7DWY3cUFwX5I8CgstEPbGuy4CrvbJDuuQptW0GwT1dxnWbyw0M7L79snXlho1twcPAp7Lss0TuO2ZptW0Hec4yq0SqXuM1Gpa8vUZ2p+6nHihYVmDQ19ah7KONbPHIzUOOnmuqWh2VoaUo9zaFvf2k6OA4BDoFn6GNTKC6TgxFav80UxRaxTOT3OARYWmtUWBMu6Ijk9zgFen9vdJ+U5wMQJtH7GSazee3vbbtrUY/yYC/rUFmyy3FyzvDyzv4323N51bpNJZn+tnu3J5wDT1t2r5sT448YFfbqSx4lssry81HPT3xykGic311LW/dcKarg+d5PFXYptDWGuMy5LVzMAAjBLH4MLwDPOMJs/v89v9h0La+xY5Xxraehub24227EjWNa7T3Oz2dF5NdYxr++2zMyspm+fdOOk217v8Vsamm3Hju6dnx07zI6f37295mazJ54IHh0Lg229/ERzjxIy1t2r5sQYzc3BsuTtZZqb/uYg1Tip6m5o6Dn+YMdJN9cZl2UaB/AYAZidj8GdA3ROKi6WSkt7NOcW5Sq3KFdnzO5uLy0NVyvKlZTbo09pqaTxfdu7N5gbPAYwTrrt9R5/4uxSTezdpyRXKune3hVXhAv/R65UlKsLruh/nHQ1nxw/XJY7vuf20s1Nf3OQapz+6k67rQzL0s11xmWZxgGALMMfwgMAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC8RAACALxEAAIAvEQAAgC85Mxs4Cs71yqpcfjKGTYTJR0a7SIGaSzWLFH3SKPukTXUuj9sZqWnuxicmugg1280swuHpZJh5JyrG2t1j8WaJeoeadQ9ssZq3UiNQ6AAAC8RgAAALw02AH88LFUMv7FY91isWaLukUbdI2us1o0UBvUhGAAAPig4BAoA8BIBCADw0oAC0Dl3pXOu0Tm3zzn39eEu6lQ45/Y753Y753Y55+rCtgnOud86594Iv47Pgjrvd841O+fqk9pS1ukCPwjn/1Xn3Pwsq/ubzrmD4Zzvcs59OmnZ7WHdjc65/zJKNU9zzj3jnGtwzu1xzt0ctmf1fGeoO9vnu8A5t8M590pY9x1h+7nOue1hfZudc3lhe374el+4vCLL6v6Jc+5PSfN9ftieFT8nOAVmlvEhKUfSHyV9RFKepFckVfXXb7QekvZLmtir7U5JXw+ff13SP2ZBn
UskzZdU31+dkj4t6d8kOUkflbQ9y+r+pqQ1KdatCn9e8iWdG/4c5YxCzVMkzQ+fF0t6Pawtq+c7Q93ZPt9O0pnh81xJ28N5/FdJ14bt90paFT7/sqR7w+fXSto8SvOdru6fSLo6xfpZ8XPCY+iPgewBLpK0z8z+n5l1SHpY0tIB9MsmSyX9NHz+U0mfHb1SAmb2vKR3ezWnq3OppAct8JKkDznnpoxIob2kqTudpZIeNrN2M/uTpH0Kfp5GlJm9ZWYvh89bJb0mqVxZPt8Z6k4nW+bbzOxY+DI3fJikSyU9Erb3nu/E9+ERSZc559zIVNstQ93pZMXPCYZuIAFYLunPSa/fVOZ/hKPNJD3pnNvpnPtvYdskM3srfN4kadLolNavdHWOhe/BTeFhoPuTDjFnXd3h4bULFPx2P2bmu1fdUpbPt3Muxzm3S1KzpN8q2Bt9z8xiKWo7WXe4/Iiks0a04FDvus0sMd/fCef7e865/LAta+YbQ/NB/BDMxWY2X9KnJH3FObckeaGZmTL/VpcVxkqdoR9J+gtJ50t6S9Ldo1pNGs65MyVtlfQ1MzuavCyb5ztF3Vk/32bWZWbnS5qqYC901uhWNDC963bOzZV0u4L6F0qaIOm/j16FOJ0GEoAHJU1Lej01bMtKZnYw/Nos6RcK/vG9nTg0EX5tHr0KM0pXZ1Z/D8zs7fA/jrikjeo+7JY1dTvnchWEyM/N7NGwOevnO1XdY2G+E8zsPUnPSLpIwSHCxPWHk2s7WXe4vETSOyNbaU9JdV8ZHoo2M2uX9ICyeL4xOAMJwN9LmhF+gitPwUnqx4a3rKFxzp3hnCtOPJd0haR6BfXeEK52g6Rfjk6F/UpX52OSVoSfOvuopCNJh+5GXa/zHp9TMOdSUPe14af8zpU0Q9KOUajPSfpnSa+Z2f9KWpTV852u7jEw36XOuQ+FzwslXa7g/OUzkq4OV+s934nvw9WSfhfukY+oNHXvTfolySk4b5k836P+c4JTMJBPyij4tNPrCo7jrx3tT+5kqPMjCj4F94qkPYlaFZxPeFrSG5KekjQhC2r9FwWHrzoVnDu4MV2dCj5l9sNw/ndLujDL6n4orOtVBf8pTElaf21Yd6OkT41SzRcrOLz5qqRd4ePT2T7fGerO9vmeJ+kPYX31ktaH7R9REMj7JG2RlB+2F4Sv94XLP5Jldf8unO96ST9T9ydFs+LnhMfQH1wKDQDgpQ/ih2AAAOgXAQgA8BIBCADwEgEIAPASAQgA8BIBiBER3sFgzTBuvzS8k8AfnHOLh2scAB8c0f5XAcaEyyTtNrMvjMRgzrmodV/XEsAYxB4gho1zbq1z7nXn3DZJlUntX3TO/T6879pW51yRc644vOdabrjOuOTXSX0rnHO/Cy9M/LRz7pzw/mx3Sloa3q+tMGn9S51z/zvp9eXOuV+Ez69wzv1f59zLzrkt4TU35ZxbH9ZX75z7ceLOBM65Z51z33fBfSZvHq55AzAyCEAMC+fcAgWXzTtfwdVLFiYtftTMFprZeQoukXWjBbf7eVbSX4brXBuu19lr0/8k6admNk/SzyX9wMx2SVqv4D5y55tZW9L6z0ia5ZwrDV+vlHS/c26ipHWSPmnBxdPrJN0SrrMhrG+upEJJVyVtL8/MLjSzrLsANYDBIQAxXBZL+oWZnbDgDgbJ14+d65x7wTm3W9LfSJoTtt+nIKAUfn0gxXYvkrQpfP6QgsuFpWXBpY4ekvS34XUeL1JwE9OPKriB7L+Ht7+5QdKHw26fCM8n7lZwD7s5SZvcnGk8AGMH5wAxGn4i6bNm9opzrlbSJZJkZv8eHuK8RMGdzOvTbWCQHpD0fyS9L2mLmcXCw5q/NbP/mryic65A0j0Kruv4Z+fcNxVcqzLh+GmqCcAoYw8Qw+V5SZ91zhWGd+j4TNKyYklvhef3/qZXvwcV7OGl2vuTpBcVHB5V2PeF/goxs/+U9J8KDnkmtvuSpI8556ZLJ+8kMlPdYXcoPCd4de/tAfhgYA8Qw8LMXnbObVZwZ45mBbfVSviGgjubt4Rfi5OW/VzSPyi460QqX5X0gHPu78P+K9Os19vPJZWa2WthfS3h3ue/uO47fK8zs9edcxsVXPm/qVfdAD5AuBsEsopz7mpJS83s+tO83Q2S/mBm/3w6twtg7GIPEFnDOfdPkj6l4FOjp3O7OxWcu7v1dG4XwNjGHiAAwEt8CAYA4CUCEADgJQIQAOAlAhAA4CUCEADgpf8PqtOQteONilYAAAAASUVORK5CYII=\n", 224 | "text/plain": [ 225 | "
" 226 | ] 227 | }, 228 | "metadata": { 229 | "needs_background": "light" 230 | }, 231 | "output_type": "display_data" 232 | } 233 | ], 234 | "source": [ 235 | "start_doy = get_doy(\"%s%s\" % (year, date_range[0]))\n", 236 | "end_doy = get_doy(\"%s%s\" % (year, date_range[1]))\n", 237 | "uniform_doy_list = np.linspace(start_doy, end_doy, numproducts).tolist()\n", 238 | "\n", 239 | "doys = df2keep.copy() # [['datatakesensingstart', 'cloudcoverpercentage', 'size']]\n", 240 | "doys['doy'] = pd.DataFrame(\n", 241 | " doys['datatakesensingstart'].apply(lambda s: get_doy(str(s).split(' ')[0].replace('-', ''))))\n", 242 | "\n", 243 | "idx_list = []\n", 244 | "for doy_ in uniform_doy_list:\n", 245 | " # print(doy_)\n", 246 | " doys['distance'] = distance(doys, doy_, ccfactor)\n", 247 | " idx = doys['distance'].argmin()\n", 248 | " idx_list.append(pd.DataFrame(doys.iloc[idx, :]).T)\n", 249 | " doys = doys.drop(index=idx).reset_index(drop=True)\n", 250 | "prod2keep = pd.concat(idx_list).reset_index(drop=True) # df2keep.iloc[idx_list].reset_index(drop=True)\n", 251 | "prod2keep['doy'] = pd.DataFrame(\n", 252 | " prod2keep['datatakesensingstart'].apply(lambda s: get_doy(str(s).split(' ')[0].replace('-', ''))))\n", 253 | "\n", 254 | "# visualize\n", 255 | "plt.scatter(prod2keep['doy'].values, np.zeros(prod2keep.shape[0]), s=20, c='b')\n", 256 | "plt.scatter(uniform_doy_list, np.zeros(len(uniform_doy_list)), s=20, c='r')\n", 257 | "plt.vlines(prod2keep['doy'].values, 0, 1, color='b', label='selected')\n", 258 | "plt.vlines(uniform_doy_list, 0, -1, color='r', label='uniform')\n", 259 | "plt.hlines(0, 1, 365, color='k', alpha=0.3)\n", 260 | "plt.ylim(-1, 1)\n", 261 | "plt.xlim(0, 365)\n", 262 | "plt.yticks([], [])\n", 263 | "plt.xlabel('day of year')\n", 264 | "plt.legend(bbox_to_anchor=(1.3, 1))\n", 265 | "\n", 266 | "# examine\n", 267 | "print(prod2keep[['doy', 'size', 'cloudcoverpercentage']]) # .columns)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "id": "auburn-nomination", 273 | "metadata": { 274 | "pycharm": { 275 | "name": "#%% md\n" 276 | } 277 | }, 278 | "source": [ 279 | "### Save selected products to disk" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 58, 285 | "id": "constitutional-newman", 286 | "metadata": { 287 | "pycharm": { 288 | "name": "#%%\n" 289 | } 290 | }, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "making new directory /media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/Germany/S2_products/T32UPV/2021\n", 297 | "saving products info to /media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/Germany/S2_products/T32UPV/2021/Sentinel-2_Level-1C_tile32UPU_minsize400Mb_22dates_year2021_from0101to0530_mincc0maxcc70.csv\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "savename = '%s/%s_%s_tile%s_minsize%dMb_%ddates_year%s_from%sto%s_mincc%dmaxcc%d.csv' % \\\n", 303 | " (savedir, platformname, processinglevel, tile, minprodsize, numproducts, year, \n", 304 | " date_range[0], date_range[1], cloudcoverpercentage[0], cloudcoverpercentage[1])\n", 305 | "\n", 306 | "if not os.path.exists(os.path.dirname(savename)):\n", 307 | " print(\"making new directory %s\" % os.path.dirname(savename))\n", 308 | " os.makedirs(os.path.dirname(savename))\n", 309 | "\n", 310 | "print(\"saving products info to %s\" % savename)\n", 311 | "prod2keep.to_csv(savename, index=False)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | 
"execution_count": null, 317 | "id": "amino-blade", 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [] 321 | } 322 | ], 323 | "metadata": { 324 | "kernelspec": { 325 | "display_name": "Python 3", 326 | "language": "python", 327 | "name": "python3" 328 | }, 329 | "language_info": { 330 | "codemirror_mode": { 331 | "name": "ipython", 332 | "version": 3 333 | }, 334 | "file_extension": ".py", 335 | "mimetype": "text/x-python", 336 | "name": "python", 337 | "nbconvert_exporter": "python", 338 | "pygments_lexer": "ipython3", 339 | "version": "3.8.2" 340 | } 341 | }, 342 | "nbformat": 4, 343 | "nbformat_minor": 5 344 | } -------------------------------------------------------------------------------- /download/find_S2_tiles_for_aoi.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "declared-banana", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "from sentinelsat import SentinelAPI, geojson_to_wkt\n", 12 | "import shapely.wkt\n", 13 | "from shapely.geometry import Polygon\n", 14 | "if __name__ == \"__main__\" and __package__ is None:\n", 15 | " from sys import path\n", 16 | " from os.path import dirname as dir\n", 17 | " path.append(dir(path[0]))\n", 18 | " __package__ = \"examples\"\n", 19 | "from utils.geospatial_data_utils import GeoTransform, make_rect_poly" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "id": "urban-adapter", 25 | "metadata": {}, 26 | "source": [ 27 | "### User input" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "id": "adequate-mandate", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "NW = (9.5, 26.5) # north-west coordinates of AOI box\n", 38 | "SE = (7, 28.5) # south east coordinates of AOI box\n", 39 | "CRS = '4326' # '2154' # coordinate reference system for AOI" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "id": "accredited-tutorial", 46 | "metadata": { 47 | "scrolled": true 48 | }, 49 | "outputs": [ 50 | { 51 | "name": "stderr", 52 | "output_type": "stream", 53 | "text": [ 54 | "/home/michaeltrs/Programming/miniconda3/envs/satdata/lib/python3.8/site-packages/pyproj/crs/crs.py:53: FutureWarning: '+init=:' syntax is deprecated. ':' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6\n", 55 | " return _prepare_from_string(\" \".join(pjargs))\n", 56 | "/home/michaeltrs/Programming/miniconda3/envs/satdata/lib/python3.8/site-packages/pyproj/crs/crs.py:294: FutureWarning: '+init=:' syntax is deprecated. ':' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6\n", 57 | " projstring = _prepare_from_string(\" \".join((projstring, projkwargs)))\n", 58 | "/home/michaeltrs/Programming/miniconda3/envs/satdata/lib/python3.8/site-packages/pyproj/crs/crs.py:53: FutureWarning: '+init=:' syntax is deprecated. ':' is the preferred initialization method. 
When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6\n", 59 | " return _prepare_from_string(\" \".join(pjargs))\n", 60 | "/home/michaeltrs/Programming/miniconda3/envs/satdata/lib/python3.8/site-packages/pyproj/crs/crs.py:294: FutureWarning: '+init=:' syntax is deprecated. ':' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6\n", 61 | " projstring = _prepare_from_string(\" \".join((projstring, projkwargs)))\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "transform = GeoTransform(CRS, '4326', loc2loc=False)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "rapid-rochester", 72 | "metadata": {}, 73 | "source": [ 74 | "### Make rectangular polygon for AOI extent" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "id": "developed-volume", 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "AOI area: 5.0\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "NW_glob = transform(NW[1], NW[0])\n", 93 | "SE_glob = transform(SE[1], SE[0])\n", 94 | "AOI = Polygon([[NW_glob[1], NW_glob[0]],\n", 95 | " [NW_glob[1], SE_glob[0]],\n", 96 | " [SE_glob[1], SE_glob[0]],\n", 97 | " [SE_glob[1], NW_glob[0]],\n", 98 | " [NW_glob[1], NW_glob[0]]])\n", 99 | "print('AOI area: ', AOI.area)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "id": "searching-idaho", 105 | "metadata": {}, 106 | "source": [ 107 | "### Query for products" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "id": "approved-anxiety", 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "querying...\n" 121 | ] 122 | }, 123 | { 124 | "name": "stderr", 125 | "output_type": "stream", 126 | "text": [ 127 | "Querying products: 100%|██████████| 121/121 [00:02<00:00, 9.53 products/s]\n" 128 | ] 129 | }, 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "found tiles overlapping with AOI: 35PPL, 35NPH, 35NPJ, 35PPK, 35PNK, 35NNH, 35NNJ, 35PNL, 35PML, 35PMK, 35NMJ, 35NMH\n", 135 | "finding overlap with AOI:\n", 136 | "----------------------------------------------\n", 137 | "tile id | AOI/Tile overlap | Tile/AOI overlap\n", 138 | "----------------------------------------------\n", 139 | "35PPL | 0.0391 | 0.2733\n", 140 | "35NPH | 0.0281 | 0.1422\n", 141 | "35NPJ | 0.1178 | 0.5962\n", 142 | "35PPK | 0.1007 | 0.5554\n", 143 | "35PNK | 0.0060 | 1.0000\n", 144 | "35NNH | 0.0150 | 0.1894\n", 145 | "35NNJ | 0.0403 | 1.0000\n", 146 | "35PNL | 0.1084 | 0.5458\n", 147 | "35PML | 0.0638 | 0.3213\n", 148 | "35PMK | 0.1169 | 0.5900\n", 149 | "35NMJ | 0.1169 | 0.5911\n", 150 | "35NMH | 0.0280 | 0.1416\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "poly = make_rect_poly(NW_glob, SE_glob)\n", 156 | "footprint = geojson_to_wkt(poly)\n", 157 | "cred = pd.read_csv(\"pw.csv\", header=None)\n", 158 | "api = SentinelAPI(cred[0][0], cred[0][1], 'https://scihub.copernicus.eu/dhus')\n", 159 | "print(\"querying...\")\n", 160 | "products = api.query(footprint,\n", 161 | " platformname='Sentinel-2',\n", 162 | " cloudcoverpercentage=(0,100),\n", 163 | " area_relation='Intersects',\n", 164 | " date=('20200101', '20200201'),\n", 165 | " processinglevel='Level-1C')\n", 166 | "\n", 167 | "# find unique 
tiles\n", 168 | "tiles = {}\n", 169 | "tileids = []\n", 170 | "for prod in products:\n", 171 | " if products[prod]['tileid'] not in tileids:\n", 172 | " tileids.append(products[prod]['tileid'])\n", 173 | " tiles[prod] = products[prod]\n", 174 | " # print(products[prod].keys())\n", 175 | " # break\n", 176 | "print(\"found tiles overlapping with AOI: %s\" % \", \".join(tileids))\n", 177 | "\n", 178 | "# find overlap with AOI for each tile\n", 179 | "print(\"finding overlap with AOI:\")\n", 180 | "print(\"----------------------------------------------\")\n", 181 | "print(\"tile id | AOI/Tile overlap | Tile/AOI overlap\")\n", 182 | "print(\"----------------------------------------------\")\n", 183 | "for i, pr in enumerate(list(tiles.keys())):\n", 184 | " meta = api.get_product_odata(pr)\n", 185 | " tile = shapely.wkt.loads(meta['footprint'])\n", 186 | " aoi_cover_ratio = AOI.intersection(tile).area/AOI.area\n", 187 | " tile_cover_ratio = AOI.intersection(tile).area/tile.area\n", 188 | " print(\"%s | %.4f | %.4f\" \n", 189 | " % (tileids[i], aoi_cover_ratio, tile_cover_ratio))" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "bridal-active", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [] 199 | } 200 | ], 201 | "metadata": { 202 | "kernelspec": { 203 | "display_name": "Python 3", 204 | "language": "python", 205 | "name": "python3" 206 | }, 207 | "language_info": { 208 | "codemirror_mode": { 209 | "name": "ipython", 210 | "version": 3 211 | }, 212 | "file_extension": ".py", 213 | "mimetype": "text/x-python", 214 | "name": "python", 215 | "nbconvert_exporter": "python", 216 | "pygments_lexer": "ipython3", 217 | "version": "3.8.2" 218 | } 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 5 222 | } 223 | -------------------------------------------------------------------------------- /download/get_downloaded_products_info.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "generic-astronomy", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "from glob import glob\n", 14 | "from datetime import datetime\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "%matplotlib inline\n", 17 | "if __name__ == \"__main__\" and __package__ is None:\n", 18 | " from sys import path\n", 19 | " from os.path import dirname as dir\n", 20 | " path.append(dir(path[0]))\n", 21 | " __package__ = \"examples\"\n", 22 | "from utils.sentinel_products_utils import get_S2prod_info\n", 23 | "from utils.date_utils import get_doy" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "activated-audit", 29 | "metadata": {}, 30 | "source": [ 31 | "### User input" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 16, 37 | "id": "personal-metallic", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "products_dir = \"/media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/PSETAE_repl/2017/cloud_0_30\"\n", 42 | "# \"/media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/PSETAE_repl/2017/cloud_0_70\"\n", 43 | "ext = \".zip\" # \".SAFE\" #" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "flexible-paper", 49 | "metadata": {}, 50 | "source": [ 51 | "### Find products \n", 52 | "\n", 53 | "If a directory contains yest unzipped products we will parse product 
info from the filename following the [Sentinel product naming convention](https://sentinel.esa.int/web/sentinel/user-guides/sentinel-2-msi/naming-convention).\n", 54 | "\n", 55 | "#### Compact Naming Convention\n", 56 | "\n", 57 | "The compact naming convention is arranged as follows:\n", 58 | "\n", 59 | "MMM_MSIXXX_YYYYMMDDHHMMSS_Nxxyy_ROOO_Txxxxx_.ext\n", 60 | "\n", 61 | "The products contain two dates.\n", 62 | "\n", 63 | "The first date (YYYYMMDDHHMMSS) is the datatake sensing time.\n", 64 | "The second date is the \"\" field, which is 15 characters in length, and is used to distinguish between different end user products from the same datatake. Depending on the instance, the time in this field can be earlier or slightly later than the datatake sensing time.\n", 65 | "\n", 66 | "The other components of the filename are:\n", 67 | "\n", 68 | "- MMM: is the mission ID(S2A/S2B)\n", 69 | "- MSIXXX: MSIL1C denotes the Level-1C product level/ MSIL2A denotes the Level-2A product level\n", 70 | "- YYYYMMDDHHMMSS: the datatake sensing start time\n", 71 | "- Nxxyy: the PDGS Processing Baseline number (e.g. N0204)\n", 72 | "- ROOO: Relative Orbit number (R001 - R143)\n", 73 | "- Txxxxx: Tile Number field\n", 74 | "- ext: file extension either zip or SAFE: Product Format (Standard Archive Format for Europe)\n" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 17, 80 | "id": "fifty-wedding", 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "num data: 23\n", 88 | " filename tile platformname \\\n", 89 | "17 S2A_MSIL1C_20170413T104021_N0204_R008_T31TFM_2... 31TFM Sentinel-2 \n", 90 | "18 S2A_MSIL1C_20170423T104021_N0204_R008_T31TFM_2... 31TFM Sentinel-2 \n", 91 | "9 S2A_MSIL1C_20170510T103031_N0205_R108_T31TFM_2... 31TFM Sentinel-2 \n", 92 | "22 S2A_MSIL1C_20170602T104021_N0205_R008_T31TFM_2... 31TFM Sentinel-2 \n", 93 | "16 S2A_MSIL1C_20170619T103021_N0205_R108_T31TFM_2... 
31TFM Sentinel-2 \n", 94 | "\n", 95 | " processinglevel year date Mb doy \n", 96 | "17 Level-1C 2017 20170413 835.608218 103 \n", 97 | "18 Level-1C 2017 20170423 841.460811 113 \n", 98 | "9 Level-1C 2017 20170510 414.762817 130 \n", 99 | "22 Level-1C 2017 20170602 836.677158 153 \n", 100 | "16 Level-1C 2017 20170619 312.798229 170 \n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "filenames = [os.path.basename(fn) for fn in glob(\"%s/*%s\" % (products_dir, ext))]\n", 106 | "# print(filenames)\n", 107 | "prodinfo = []\n", 108 | "for fn in filenames:\n", 109 | " info = fn.split('_')\n", 110 | " year = info[2][:4]\n", 111 | " date = info[2][:8]\n", 112 | " size = int(os.path.getsize(os.path.join(products_dir, fn)))/1e6\n", 113 | " tile = info[5][1:]\n", 114 | " platformname = \"Sentinel-%s\" % info[0][1]\n", 115 | " processinglevel = \"Level-%s\" % info[1][-2:]\n", 116 | " prodinfo.append([fn, tile, platformname, processinglevel, year, date, size])\n", 117 | "prodinfo = pd.DataFrame(\n", 118 | " prodinfo, columns=['filename', 'tile', 'platformname', 'processinglevel', 'year', 'date', 'Mb'])\n", 119 | "prodinfo['doy'] = prodinfo['date'].apply(lambda s: get_doy(s))\n", 120 | "prodinfo = prodinfo.sort_values('doy')\n", 121 | "print(\"num data: \", prodinfo.shape[0])\n", 122 | "print(prodinfo.head(5))" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "medium-owner", 128 | "metadata": {}, 129 | "source": [ 130 | "### Visualize downloaded product dates" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 18, 136 | "id": "announced-mongolia", 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "" 143 | ] 144 | }, 145 | "execution_count": 18, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | }, 149 | { 150 | "data": { 151 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAdAAAAEWCAYAAADW7MapAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAAcZ0lEQVR4nO3de3xV5Z3v8e8vIUCQm0C8oGBUhBIQocG+RKFSqk7B+0AdHSui2PF4plJPtdqedjxOX3WOzJnaqW3VjiIqXrCC1to6tiBYsd4KFQUvXKygIJGLgAkIEvI7f6xnwzYmOztPsrN39PN+vfYre6/1PGv91rM3+WZd2MvcXQAAoHmK8l0AAADtEQEKAEAEAhQAgAgEKAAAEQhQAAAiEKAAAEQgQNEsZrbGzE7Jcw0/NrPNZlbVxuudYmbPZph/rpm9a2Y1ZjaiLWsD0PYI0Dwzs05mNsPM1ppZtZktNbPxafMrzGyxmW0Nj/lmVpHPmluqqSBqom9/SVdLqnD3Q1q3shb7D0nfcveu7v5ySxdmZv9hZqvC5+JNM5tcb/5wM1tiZjvDz+Fp875iZgvNbLuZranXr38I+fSHm9nVLa0Z+DwhQPOvg6R3JZ0sqYekH0r6tZmVh/nvSZokqZekPpJ+K2l2a6zYzDq0xnLaWH9JW9x9Y74LacARkl6L6WhmxQ1M3iHpTCWfi4sl/czMTgztO0p6TNJ9kg6UdI+kx8L0VN+7JH23/kLd/Z0Q8l3dvaukYyXVSZobUzvweUWA5pm773D3G9x9jbvXufvvJL0tqTLM3xbmuSSTtFfSgMaWZ2ZPm9n/NbOXzOxDM3vMzHqFeeVhT2Oqmb0jaYGZFZnZD8Me8EYzu9fMeqQt76Iwb4uZ/aDeuu42sx+nvR5rZuvSXvczs0fMbFPo/wszGyzpdkmjwp7PttB2gpm9Hva21pvZNQ1s2ymS5knqG/reHaafZWavmdm2sP2D0/q4mQ1Ie72v5lS9ZnZ12PYNZnZJWtveZvbbMI4vSTq6kTHvZGY1koolvWJmb4Xpg0M920J9Z9Wr4zYze8LMdkj6Sv3luvv/cfc3w+fiRUmLJI0Ks8cq+ePrP919t7vfouTzMS70fcndZ0n6W0M11zNZ0jPuviaLtgACArTAmNnBkgaq3p5MCJpdkn4u6d+aWMxkSZdKOlRSraRb6s0/WdJgSX8naUp4fEXSUZK6SvpFWGeFpNskXSSpr6Tekg7PcjuKJf1O0lpJ5ZIOkzTb3d+Q9D8kPR/2gHqGLjMkXe7u3SQNlbSg/jLdfb6k8ZLeC32nmNlASQ9KukpSmaQnJD2etifWlEOU7OEdJmmqpF+a2YFh3i+VjPmhSsbz0oYWEAKsa3h5nLsfbWYlkh6X9EdJB0m6UtL9ZjYores/SrpRUjdJGQ9pm1mppOO1/3MxRNKr/snv4nw1TM+amZmSz8s9zekHgAAtKOGX7v2S7nH3N9PnhaDpIelbkpo6vzbL3Ze7+w5J/yLpvHqHCG8Ie74fSbpQ0s3u/jd3r5H0fUnnh8O7kyT9zt2fcffdYVl1WW7Ol5SE7nfDuna5e6aQ2COpwsy6u/tWd/9rluv5B0m/d/d57r5HyXnIUkknZtl/j6Qfufsed39CUo2kQWG8Jkq6PtS/XM0LmROU/DFyk7t/7O4LlPxBcUFam8fc/c9hD3NXE8u7XdIrkv4QXneVtL1em+1Kwrg5Rks6WNKcZvYDPvcI0AJhZkWSZkn6WElIfkoIxNsl3WtmB2VY3Ltpz9dKKlFy/rSh+X1Dm/T2HZT8Uu2b3jasf0tT2xL0k7TW3WuzbD9R0gRJa83sT2Y2qqkOwSfqd/c6JTUflmX/LfVq3KkknMq0//x0Svo4ZVPXu6Ge9P7pdb2rLJjZ/1OyV35e2h5njaTu9Zp2l1TdjBql5Nzq3PDHE4BmIEALQDiMNkNJaE0Me1KNKZLURZkDol/a8/5K9rI2p01LP+z3npKLX9Lb10p6X9KG9GWZWRclh3FTdoRaUtKvin1XUv9GLlT61C2A3P0v7n62ksOdv5H06wb6NeQT9Yex7CdpfZi0M0ONmWxSMg71xzJb70nqF/4wSu+/Pu11k7dCMrN/VXLY+jR3/zBt1muShoXtTRmmZlzEFA4Lf10cvgWiEKCF4TYl5yTPDIdV9zGzU81shJkVm1l3STdL2irpjQzL+4Yl//2li6QfSZrj7nsbafugpP9lZkeaWVcl51cfCntlcySdYWajwznFH+mTn5mlkiaYWS8zO0TJeciUl5QE8E1mdoCZdTazk8K89yUdnjpPaWYdzexCM+sR/nj4UNkfKv61pNPN7KvhEPjVknZLei6txn8M4/c1Jed/mxTG6xFJN5hZl3A++OIsa5KkF5WE97VmVmJmY5VcUZv1FdRm9n0l50lPcff6e/5PK7mgbFq4iCl11GJB6FtkZp2VHH2wMP71zwufq+SztLAZ2wUgIEDzzMyOkHS5pOGSqmz//8u7MDTpqSTktkt6S8mVoF9r4pzZLEl3S6qS1FnStAxt7wrtn1Fy9e8uJRe8yN1fk/TPkh5QEoZbJa1L6ztLyXm5NUoulnkoNSME0JlKrhh+J/T7hzB7gZI9pSozS+0ZXyRpjZl9qOQio9T2Z+TuKyR9Q8nFVZvDOs90949Dk2+HadvCMn+TzXKDbyk5nFulZDxnZtsxrP9MJXuPmyXdKmly/XPbTfg3JXutq9M+F/87bfnnKLkAaJuSC5zOSdvuL0v6SMlFVf3D8z/WW/7FSs6Xc1NgIILxb+ezxcyelnSfu9+Z71oA4LOMPVAAACIQoAAAROAQLgAAEdgDBQAgQrO+TLxPnz5eXl6eo1LQkBUrkp+DBrWsTUvX0dbLTPVPyWY56evMtP7W2t5s19dYv5T0Prl8v1varzEtGceYmtrivW1tS5Ys2ezuZfmuA62rWQFaXl6uxYsX56oWNGDs2OTn00+3rE1L19HWy0z1T8lmOenrzLT+1trebNfXWL+U9D65fL9b2q8xLRnHmJra4r1tbWbWnG+xQjvBIVwAACIQoAAARCBAAQCI0KxzoACAtrVkyZKDOnTocKeSO/Kw09O26iQtr62tvayysnJj/ZkEKAAUsA4dOtx5yCGHDC4rK9taVFTEf9xvQ3V1dbZp06aKqqqqOyWdVX8+f80AQGEbWlZW9iHh2faKioq8rKxsu5K9/0/Pb+N6AADNU0R45k8Y+wazkgAFACACAQoAyNp3vvOdvtdff/3Brb3cFStWdDzmmGOGtMayJk6cWD5z5swDc71uAhQAgAgEKAAgo+uuu+6Q8vLyoZWVlYNWrVrVSZKee+650uOOO+4LAwcOrDj11FOP3rRpU/H69es7DBkyZLAkPf/886VmVrlq1aqOktSvX7+h1dXVRRMnTiyfMmVKvxEjRnzh8MMPP7ahPcWdO3fapEmTygcOHFgxePDgiscff7yblOwpVlZWDqqoqBhcUVExeN68eQdIUl1dnSZPnty/vLx86Iknnjhw8+bN+/6HyaJFi7ocf/zxg4YMGTJ49OjRx6xdu7YkNX3QoEEVgw
YNqrj55psPihkX/hsLALQjX/qSWvWr8l96SRlvF7Bo0aIujz76aK9ly5a9vmfPHg0fPrxixIgRO6dMmXLkT3/603dOP/30mquuuqrvdddd1/euu+56d/fu3UUffPBB0cKFC7sOGTJk5/z587u6e03v3r1ru3XrVidJ77//fsnixYvfXLp0aedzzz13wCWXXLI1fZ3Tp08/yMy0cuXK119++eXOEyZMOOatt95a3rdv39pFixat7NKliy9btqzTBRdccNTy5cvfmDVrVs/Vq1d3Wr169fJ169aVHHvssUOmTJmyZffu3TZt2rT+v//971f37du39o477jjwmmuuOezhhx9eM3Xq1PKf/exn74wfP77m8ssvPzxm7AhQAECjFi5c2HXChAnbUuF32mmnbduxY0dRdXV18emnn14jSd/85je3fP3rXz9KkkaOHFkzf/78rs8++2y3a6+9dsOTTz7Zw911wgkn1KSWedZZZ20rLi5WZWXlri1btpTUX+dzzz3X9corr9woSSNGjNjVt2/fj5ctW9Z5wIABH0+dOvWI119/vbSoqEhr167tJEl/+tOfup133nkfdOjQQeXl5XtGjRpVLUmvvvpqp1WrVpWOGzduoJTsqZaVle3ZvHlzcXV1dfH48eNrJOnSSy/dsmDBgh7NHRsCFADakab2GPNtzJgx1c8880y3devWdbzwwgu3/eQnPzlEkp9xxhnbU206d+6877/luGf/P3RuvPHGgw866KA9c+fOfbuurk6lpaWVmdq7uw0YMOCjpUuXvpk+ffPmzcXZb1HjOAcKAGjUuHHjap544omeNTU1tnXr1qJ58+b1POCAA+q6d+++98knn+wqSTNmzOg9atSoGkk65ZRTaubOndvryCOP3F1cXKyePXvWLly4sMepp55ak3lN+5100kk19913Xy8p2YvcsGFDx2HDhu3avn178aGHHrqnuLhYt956a++9e/dKkk4++eTqOXPm9KqtrdXatWtLXnjhhW6SNGzYsF0ffPBBh/nz5x8gSbt377bFixd37tOnz95u3brt/cMf/tBVku6+++5eMWPDHigAoFGjR4/eee65534wdOjQIb17994zbNiwHZI0c+bMt6+44oojpk2bVtS/f//dDz744BpJGjRo0MfubmPGjKmWpFGjRtVs2LChY1lZ2d5s13nttddunDx58hEDBw6sKC4u1q9+9as1paWlftVVV22cOHHi0bNnz+49bty47aWlpXWSdNFFF2176qmnug8YMGBo3759d48YMaJGSvZ0Z8+e/da0adP6V1dXF+/du9euuOKK90eOHLlrxowZay677LJyM9PYsWM/jBkbAhQAkNH06dOrpk+fXlV/+iuvvPJmQ+2rqqpeTT2/6aabqm666aZ9fefOnbsmve3OnTtflpLgXbVq1WuS1KVLF58zZ84n2knSscceu3vlypWvp17fdttt6yWpqKhI99577zsN1XLiiSd+tHjx4k8d9h4zZszOFStWvJ42aV1D/TPhEC4AABEIUAAAIhCgAFDY6urq6izfRXxehbGva2geAQoAhW35pk2behCibS/cD7SHpOUNzeciIgAoYLW1tZdVVVXdWVVVNVTs9LS1OknLa2trL2toJgEKAAWssrJyo6Sz8l0HPo2/ZgAAiECAAgAQgQAFACACAQoAQAQCFACACAQoAAARCFAAACIQoAAARCBAAQCIQIACABCBAAUAIAIBCgBABAIUAIAIBCgAABEIUAAAIhCgAABEIEABAIhAgAIAEIEABQAgAgEKAEAEAhQAgAgEKAAAEQhQAAAiEKAAAEQgQAEAiECAAgAQgQAFACACAQoAQAQCFACACAQoAAARCFAAACIQoAAARCBAAQCIQIACABCBAAUAIAIBCgBABAIUAIAIBCgAABEIUAAAIhCgAABEIEABAIhAgAIAEIEABQAgAgEKAEAEAhQAgAgEKAAAEQhQAAAiEKAAAEQgQAEAiECAAgAQgQAFACACAQoAQAQCFACACAQoAAARCFAAACIQoAAARCBAAQCIQIACABCBAAUAIAIBCgBABAIUAIAIBCgAABEIUAAAIhCgAABEIEABAIhAgAIAEIEABQAgAgEKAEAEAhQAgAgEKAAAEQhQAAAiEKAAAEQgQAEAiECAAgAQgQAFACACAQoAQAQCFACACAQoAAARCFAAACIQoAAARCBAAQCIQIACABCBAAUAIAIBCgBABAIUAIAIBCgAABEIUAAAIhCgAABEIEABAIhAgAIAEIEABQAgAgEKAEAEAhQAgAgEKAAAEQhQAAAiEKAAAEQgQAEAiECAAgAQgQAFACACAQoAQAQCFACACAQoAAARCFAAACIQoAAARCBAAQCIQIACABCBAAUAIAIBCgBABAIUAIAIBCgAABEIUAAAIhCgAABEIEABAIhAgAIAEIEABQAgAgEKAEAEAhQAgAgEKAAAEQhQAAAiEKAAAEQgQAEAiECAAgAQgQAFACACAQoAQAQCFACACAQoAAARCFAAACIQoAAARCBAAQCIQIACABCBAAUAIAIBCgBABAIUAIAIBCgAABEIUAAAIhCgAABEIEABAIhAgAIAEIEABQAgAgEKAEAEAhQAgAgEKAAAEQhQAAAiEKAAAEQgQAEAiECAAgAQgQAFACACAQoAQAQCFACACAQoAAARCFAAACIQoAAARCBAAQCIQIACABCBAAUAIAIBCgBABAIUAIAIBCgAABEIUAAAIhCgAABEIEABAIhAgAIAEIEABQAgAgEKAEAEAhQAgAgEKAAAEQhQAAAiEKAAAEQgQAEAiECAAgAQgQAtcHv2SNXV0qZNLWvT0nW09TJT/XfuzH456evMtP7W2t5s19dYv4a2LZfvd0v7peqt/7Ml4xhTU1u8t0BW3D3rR2VlpaPtPPCAe1GRe3Gxe2lp8jqmTUvXkYu6s+lfVOQuJT+bWk76OktK3M0aXn9rbW+262vOtuXy/W5pv1S9Zp/8mc1705o1ZeqTi89ya5G02Jvxu5ZH+3hY8t5mZ+TIkX7LLbfkLs2xz9at0t//vfTxx/undewoPfKIdOCB2bdp6TpyUXdz+ze1nEx90vtJrbO92a6vOXV27Jj8zMX73Zr9GtPccYypKVMfqfU/y63ppJNOWuLuI/NdB1oXh3ALVFWVVFLyyWkdOiTTm9OmpetorlzU1NRyMvVJ79da25vt+prTr6goeWRaTmz9rdmvMc0dx5iaMvXJxWcZaFJzdlc5hNt2Nm5MDkNJ+x+lpcn05rRp6TpyUXdz+ze1nEx90vu11vZmu77m9OvcOXfvd2v2a+42t2ZNmfrk4rPcmsQh3M/ko1mNCdC29cADyS+B7t0znxNrqk1L15GLurPp37mz7/tFmM25xdQ6S0rcO3ZseP2ttb3Zrq8525bL97ul/VL1lpR88mc2701r1pSpTy4+y62FAP1sPpp9DnTx4sU52xvGp23aJK1ZI5WXS2Vl8W1auo62Xmaqf9euUk1NdstJX6fU+Ppba3uzXV9j/Rratly+3y3tl6q3/s+WjGNMTZn65OKz3BrMj
HOgn0EEKADkGAH62cRFRAAARCBAAQCIQIACABCBAAUAIAIBCgBABAIUAIAIBCgAABEIUAAAIhCgAABEIEABAIhAgAIAEIEABQAgAgEKAEAEAhQAgAgEKAAAEZp1P1Azq5a0Infl5EwfSZvzXUQztceaJepua9TdtmLrPsLdC+gW32gNHZrZfkV7vCmsmS1ub3W3x5ol6m5r1N222mvdyA0O4QIAEIEABQAgQnMD9L9yUkXutce622PNEnW3NepuW+21buRAsy4iAgAACQ7hAgAQgQAFACBCVgFqZl8zsxVmttrMvpfrolrCzNaY2TIzW2pmi8O0XmY2z8xWhZ8HFkCdd5nZRjNbnjatwTotcUsY/1fN7IsFVvcNZrY+jPlSM5uQNu/7oe4VZvZ3eaq5n5ktNLPXzew1M/t2mF7Q452h7kIf785m9pKZvRLq/tcw/UgzezHU95CZdQzTO4XXq8P88gKr+24zezttvIeH6QXxOUEeuXvGh6RiSW9JOkpSR0mvSKpoql++HpLWSOpTb9q/S/peeP49SdMLoM4vS/qipOVN1SlpgqT/lmSSTpD0YoHVfYOkaxpoWxE+L50kHRk+R8V5qPlQSV8Mz7tJWhlqK+jxzlB3oY+3SeoanpdIejGM468lnR+m3y7pivD8f0q6PTw/X9JDeRrvxuq+W9KkBtoXxOeER/4e2eyBfknSanf/m7t/LGm2pLOz6FdIzpZ0T3h+j6Rz8ldKwt2fkfRBvcmN1Xm2pHs98YKknmZ2aJsUWk8jdTfmbEmz3X23u78tabWSz1ObcvcN7v7X8Lxa0huSDlOBj3eGuhtTKOPt7l4TXpaEh0saJ2lOmF5/vFPvwxxJXzUza5tq98tQd2MK4nOC/MkmQA+T9G7a63XK/I8431zSH81siZn9U5h2sLtvCM+rJB2cn9Ka1Fid7eE9+FY4jHVX2iHygqs7HB4coWTvot2Md726pQIfbzMrNrOlkjZKmqdkb3ibu9c2UNu+usP87ZJ6t2nBQf263T013jeG8f6pmXUK0wpmvJEfn8WLiEa7+xcljZf0z2b25fSZ7u7K/FdlQWgvdQa3STpa0nBJGyT9JK/VNMLMukqaK+kqd/8wfV4hj3cDdRf8eLv7XncfLulwJXvBX8hvRdmpX7eZDZX0fSX1Hy+pl6Tr8lchCkk2AbpeUr+014eHaQXJ3deHnxslParkH+/7qUMr4efG/FWYUWN1FvR74O7vh188dZLu0P7DhgVTt5mVKAmh+939kTC54Me7obrbw3inuPs2SQsljVJyiDP1/dvpte2rO8zvIWlL21b6SWl1fy0cSnd33y1ppgp4vNG2sgnQv0g6JlxB11HJSf7f5rasOGZ2gJl1Sz2XdJqk5UrqvTg0u1jSY/mpsEmN1flbSZPDVX8nSNqedugx7+qd9zlXyZhLSd3nh6ssj5R0jKSX8lCfSZoh6Q13vzltVkGPd2N1t4PxLjOznuF5qaRTlZy/XShpUmhWf7xT78MkSQvCEYE21Ujdb6b9kWVKztumj3fePyfIo2yuNFJytdlKJecxfpDvK58y1HmUkqsQX5H0WqpWJedTnpK0StJ8Sb0KoNYHlRx+26Pk3MnUxupUcpXfL8P4L5M0ssDqnhXqelXJL5VD09r/INS9QtL4PNU8Wsnh2VclLQ2PCYU+3hnqLvTxHibp5VDfcknXh+lHKQn01ZIeltQpTO8cXq8O848qsLoXhPFeLuk+7b9StyA+Jzzy9+Cr/AAAiPBZvIgIAICcI0ABAIhAgAIAEIEABQAgAgEKAEAEAhRtItxB5JocLr8s3MnjZTMbk6v1AEBKh6abAO3CVyUtc/fL2mJlZtbB93+vK4DPIfZAkTNm9gMzW2lmz0oalDb9m2b2l3Dfxblm1sXMuoV7LpaENt3TX6f1LTezBeGLvZ8ys/7h/oz/LunscL/G0rT248zsN2mvTzWzR8Pz08zseTP7q5k9HL5zVmZ2fahvuZn9V+rOIGb2tJn9pyX3mf12rsYNQPtAgCInzKxSydc+Dlfy7TnHp81+xN2Pd/fjlHzF21RPbtf1tKTTQ5vzQ7s99Rb9c0n3uPswSfdLusXdl0q6Xsl9JIe7+0dp7RdK+oKZlYXXl0i6y8z6SPqhpFM8ufnAYknfCW1+EeobKqlU0hlpy+vo7iPdveC+wB1A2yJAkStjJD3q7js9uYNI+vcnDzWzRWa2TNKFkoaE6XcqCTiFnzMbWO4oSQ+E57OUfN1dozz5qq1Zkr4Rvud0lJKbIJ+g5AbUfw63r7pY0hGh21fC+dRlSu5hOSRtkQ9lWh+Azw/OgSIf7pZ0jru/YmZTJI2VJHf/czhEO1ZSsbsvb2wBzTRT0uOSdkl62N1rw2HZee5+QXpDM+ss6VYl32v6rpndoOS7WlN2tFJNANo59kCRK89IOsfMSsMdcs5Mm9dN0oZwfvPCev3uVbKH2dDepyQ9p+TwrkLfRU0V4u7vSXpPySHb1HJfkHSSmQ2Q9t3JZ6D2h+XmcE50Uv3lAYDEHihyxN3/amYPKbkzzkYlt8VL+RdJL0raFH52S5t3v6QfK7nrS0OulDTTzL4b+l/SSLv67pdU5u5vhPo2hb3fB82sU2jzQ3dfaWZ3KLnzRlW9ugFgH+7GgoJiZpMkne3uF7Xycn8h6WV3n9GaywXw+cUeKAqGmf1c0nglV+225nKXKDl3eXVrLhfA5xt7oAAAROAiIgAAIhCgAABEIEABAIhAgAIAEIEABQAgwv8HyV+YQ4PBI/UAAAAASUVORK5CYII=\n", 152 | "text/plain": [ 153 | "
" 154 | ] 155 | }, 156 | "metadata": { 157 | "needs_background": "light" 158 | }, 159 | "output_type": "display_data" 160 | } 161 | ], 162 | "source": [ 163 | "plt.title(\"%d products found for %s\" % \n", 164 | " (prodinfo['doy'].shape[0], \",\".join(prodinfo['year'].drop_duplicates().tolist())))\n", 165 | "plt.scatter(prodinfo['doy'].values, np.zeros(prodinfo.shape[0]), s=20, c='b')\n", 166 | "#plt.scatter(uniform_doy_list, np.zeros(selected_doys.shape[0]), s=20, c='r')\n", 167 | "plt.vlines(prodinfo['doy'].values, 0, 1, color='b', label='downloaded')\n", 168 | "# plt.vlines(uniform_doy_list, 0, -1, color='r', label='uniform')\n", 169 | "plt.hlines(0, 1, 365, color='k', alpha=0.3)\n", 170 | "plt.ylim(-0.1, 1)\n", 171 | "plt.xlim(0, 365)\n", 172 | "plt.yticks([], [])\n", 173 | "plt.xlabel('day of year')\n", 174 | "plt.legend(bbox_to_anchor=(1.35, 1))" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "concrete-wildlife", 180 | "metadata": {}, 181 | "source": [ 182 | "### Save downloaded products to disk" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 19, 188 | "id": "thermal-examination", 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "saving products info to /media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/PSETAE_repl/2017/cloud_0_30/downloaded_as_of_20211117_170444.csv\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "savename = '%s/downloaded_as_of_%s.csv' % \\\n", 201 | " (products_dir, datetime.now().strftime('%Y%m%d_%H%M%S'))\n", 202 | "\n", 203 | "if not os.path.exists(os.path.dirname(savename)):\n", 204 | " print(\"making new directory %s\" % os.path.dirname(savename))\n", 205 | " os.makedirs(os.path.dirname(savename))\n", 206 | "\n", 207 | "print(\"saving products info to %s\" % savename)\n", 208 | "prodinfo.to_csv(savename, index=False)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "id": "happy-defense", 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [] 218 | } 219 | ], 220 | "metadata": { 221 | "kernelspec": { 222 | "display_name": "Python 3", 223 | "language": "python", 224 | "name": "python3" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.8.2" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 5 241 | } 242 | -------------------------------------------------------------------------------- /download/sentinelsat_download_tileid.py: -------------------------------------------------------------------------------- 1 | # spatial data processing pipelines 2 | import argparse 3 | import pandas as pd 4 | from sentinelsat import SentinelAPI # , read_geojson, geojson_to_wkt 5 | import os 6 | from glob import glob 7 | from collections import OrderedDict 8 | 9 | 10 | # USER INPUT ----------------------------------------------------------------------------------------------------------- 11 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 12 | parser.add_argument('--products_file', metavar='PRODUCTS FILE', default='', 13 | help='path to file containing all products to be downloaded') 14 | 15 | args = parser.parse_args() 16 | products_file = args.products_file 17 | 18 | # CODE 
----------------------------------------------------------------------------------------------------------------- 19 | # authentication 20 | cred = pd.read_csv("download/pw.csv", header=None) 21 | api = SentinelAPI(cred[0][0], cred[0][1], 'https://apihub.copernicus.eu/apihub') # 'https://scihub.copernicus.eu/dhus') # 22 | 23 | # read products to download from file 24 | if ',' in products_file: 25 | products_file = products_file.split(',') 26 | savedir = os.path.dirname(products_file[0]) 27 | products = pd.concat([pd.read_csv(products_file_) for products_file_ in products_file]) 28 | else: 29 | savedir = os.path.dirname(products_file) 30 | products = pd.read_csv(products_file) 31 | 32 | # make products into ordered dict 33 | products2download = OrderedDict() 34 | for i in range(products.shape[0]): # enumerate(list(products.keys())): 35 | products2download[products['index'].iloc[i]] = products.iloc[i].to_dict() 36 | 37 | # find number of remaining products 38 | down_filenames = [os.path.basename(p).split(".")[0] for p in glob(os.path.join(savedir, "*.zip"))] 39 | N = 0 40 | for key in products2download: 41 | if products2download[key]['identifier'] in down_filenames: 42 | N += 1 43 | print("%d of %d new products already downloaded, %d remaining" % (N, len(products2download), len(products2download)-N)) 44 | 45 | # download 46 | # try: 47 | api.download_all(products2download, directory_path=savedir, n_concurrent_dl=1) 48 | # except: 49 | # p 50 | print("waiting 30min...") 51 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib~=3.3.0 2 | numpy~=1.19.1 3 | pandas~=1.1.0 4 | rasterio~=1.1.5 5 | shapely~=1.7.0 6 | sentinelsat~=0.14 7 | scikit-learn 8 | simplification 9 | pyproj -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaeltrs/DeepSatData/c2774597dc57af46f777ba2212fe026305d2fd1e/utils/__init__.py -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import zipfile 4 | 5 | 6 | def unzip_all(dir_name, extension=".zip"): 7 | for item in os.listdir(dir_name): 8 | if item.endswith(extension): 9 | file_name = os.path.join(dir_name, item) 10 | zip_ref = zipfile.ZipFile(file_name) 11 | zip_ref.extractall(dir_name) 12 | zip_ref.close() 13 | os.remove(file_name) 14 | 15 | 16 | def find_number(text, c, single=True): 17 | val = re.findall(r'%s(\d+)' % c, text) 18 | if single: 19 | val = val[0] 20 | return val 21 | -------------------------------------------------------------------------------- /utils/date_utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | from glob import glob 4 | import pandas as pd 5 | 6 | 7 | def get_doy(date): 8 | Y = date[:4] 9 | m = date[4:6] 10 | d = date[6:] 11 | date = "%s.%s.%s" % (Y, m, d) 12 | dt = datetime.datetime.strptime(date, '%Y.%m.%d') 13 | return dt.timetuple().tm_yday 14 | 15 | 16 | def get_date(day): 17 | """ 18 | :param day: day of the year [0, 365] 19 | :return: sting day_of_month-month, ie. 
"3-Jul" 20 | """ 21 | if day < 31: 22 | m = "Jan" 23 | d = day 24 | elif day < 59: 25 | m = "Feb" 26 | d = day - 31 27 | elif day < 90: 28 | m = "Mar" 29 | d = day - 59 30 | elif day < 120: 31 | m = "Apr" 32 | d = day - 90 33 | elif day < 151: 34 | m = "May" 35 | d = day - 120 36 | elif day < 181: 37 | m = "Jun" 38 | d = day - 151 39 | elif day < 212: 40 | m = "Jul" 41 | d = day - 181 42 | elif day < 243: 43 | m = "Aug" 44 | d = day - 212 45 | elif day < 273: 46 | m = "Sep" 47 | d = day - 243 48 | elif day < 304: 49 | m = "Oct" 50 | d = day - 273 51 | elif day < 334: 52 | m = "Nov" 53 | d = day - 304 54 | else: 55 | m = "Dec" 56 | d = day - 334 57 | return "%d-%s" % (d, m) 58 | 59 | 60 | def get_paths(root_dir, pattern, save_name=None, relative=True): 61 | files = glob(os.path.join(root_dir, pattern)) 62 | N = len(root_dir.split("/")) 63 | if relative: 64 | # base = "/".join(pattern.split("/")[:-1]) 65 | files = ["/".join(x.split("/")[N:]) for x in files] 66 | print("%d files found matching %s" % (len(files), pattern)) 67 | if save_name: 68 | # check if abs path 69 | if not os.path.exists(save_name): 70 | save_name = os.path.join(root_dir, save_name) 71 | pd.DataFrame(files).to_csv(save_name, header=None, index=False) 72 | else: 73 | return files 74 | 75 | 76 | def get_unique_vals(path, col, header=None, name_fn=None): 77 | data = pd.read_csv(path, header=header) 78 | data = data[col] 79 | if name_fn: 80 | data = data.apply(name_fn) 81 | return data.value_counts() 82 | 83 | 84 | def get_lat_lon(loc, loc_type="meters"): 85 | if loc_type == "meters": 86 | lat = 111319.488 87 | lon = 111120.0 88 | return loc[0] / lat, loc[1] / lon 89 | -------------------------------------------------------------------------------- /utils/geospatial_data_utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from shapely import geometry 4 | from shapely.geometry import Polygon 5 | from pyproj import Proj, transform 6 | import re 7 | from simplification.cutil import simplify_coords 8 | from sentinelsat import geojson_to_wkt 9 | 10 | 11 | class GeoTransform: 12 | def __init__(self, intr, outtr, loc2loc=False): 13 | """ 14 | - loc2loc: from local to local coord system. 
In this case tranform remains x, y -> x, y, 15 | otherwise x, y -> y, x 16 | """ 17 | intr = str(intr) 18 | outtr = str(outtr) 19 | if not intr.isnumeric(): intr = get_epsg_code(intr) 20 | if not outtr.isnumeric(): outtr = get_epsg_code(outtr) 21 | self.inProj = Proj(init='epsg:%s' % intr) # %d' % get_epsg_code(country)) 22 | self.outProj = Proj(init='epsg:%s' % outtr) # 2154') 23 | self.loc2loc = loc2loc 24 | 25 | def __call__(self, x, y): 26 | yout, xout = transform(self.inProj, self.outProj, x, y) 27 | if self.loc2loc: 28 | return yout, xout 29 | return xout, yout 30 | 31 | 32 | def make_AOI(coords, transform): 33 | if type(coords) == str: 34 | x = [float(x) for x in re.findall("[+-]?\d+(?:\.\d+)?", coords)] 35 | x = np.array(x).reshape(-1, 2) 36 | 37 | points = [] 38 | for point in x: 39 | points.append(transform(point[0], point[1])) 40 | points.append(transform(x[0][0], x[0][1])) 41 | 42 | poly = make_poly(points[:-1]) 43 | footprint = coords 44 | AOI = Polygon(points) 45 | 46 | elif type(coords) in [list, tuple]: 47 | if len(coords) == 2: # assume NW, SE boxx coords 48 | NW, SE = coords 49 | NW_glob = transform(NW[1], NW[0]) 50 | SE_glob = transform(SE[1], SE[0]) 51 | 52 | poly = make_rect_poly(NW_glob, SE_glob) 53 | footprint = geojson_to_wkt(poly) 54 | AOI = Polygon([[NW_glob[1], NW_glob[0]], 55 | [NW_glob[1], SE_glob[0]], 56 | [SE_glob[1], SE_glob[0]], 57 | [SE_glob[1], NW_glob[0]], 58 | [NW_glob[1], NW_glob[0]]]) 59 | 60 | else: 61 | points = [] 62 | for point in coords: 63 | points.append(transform(point[0], point[1])) 64 | points.append(transform(x[0][0], x[0][1])) 65 | 66 | poly = make_poly(points[:-1]) 67 | footprint = geojson_to_wkt(poly) 68 | AOI = Polygon(points) 69 | 70 | return poly, footprint, AOI 71 | 72 | 73 | def make_poly(points, ret_points=False): 74 | points.append(points[0]) 75 | poly = {"type": "FeatureCollection", 76 | "features": [{"type": "Feature", "properties": {}, "geometry": { 77 | "type": "Polygon", 78 | "coordinates": [[points]]} }]} 79 | if ret_points: 80 | return poly['features'][0]['geometry']['coordinates'] 81 | return poly 82 | 83 | 84 | def make_rect_poly(nw, se, ret_points=False): 85 | # W, N 86 | poly = {"type": "FeatureCollection", 87 | "features": [{"type": "Feature", "properties": {}, "geometry": { 88 | "type": "Polygon", 89 | "coordinates": [[[nw[1], nw[0]], 90 | [nw[1], se[0]], 91 | [se[1], se[0]], 92 | [se[1], nw[0]], 93 | [nw[1], nw[0]]]]} }]} 94 | if ret_points: 95 | return poly['features'][0]['geometry']['coordinates'] 96 | return poly 97 | 98 | 99 | def get_epsg_code(country): 100 | epsg_code = {'germany': 32632, 'senegal': 32628, 'france': 32631} 101 | return epsg_code[country] 102 | 103 | 104 | def plot_poly(points, c=None, newfig=True): 105 | if type(points) in [list, tuple]: 106 | points = np.array(points) 107 | if c is None: 108 | c = "r" 109 | if newfig: 110 | plt.figure() 111 | for i in range(points.shape[0] - 1): 112 | plt.plot(points[i:i + 2, 0], points[i:i + 2, 1], c=c) 113 | plt.scatter(points[i, 0], points[i, 1], c=c) 114 | 115 | 116 | def get_points_from_str_poly(str_poly): 117 | return np.array([[float(j) for j in i.split(" ") if j != ''] for i in str_poly.split("(")[-1].split(")")[0].split(",")]) 118 | 119 | 120 | # eometry 121 | def get_line_eq(p1, p2, h=1e-7): 122 | ''' 123 | P: (x, y) 124 | ''' 125 | denom = p2[0] - p1[0] 126 | if denom == 0: 127 | denom = h 128 | a = (p2[1] - p1[1]) / denom 129 | b = (p1[1] * p2[0] - p2[1] * p1[0]) / denom 130 | return a, b 131 | 132 | 133 | def get_perp_line(p1, p2, p3, h=1e-7): 134 
| a, b = get_line_eq(p1, p2) 135 | if a == 0: 136 | a = h 137 | a_ = - 1. / a 138 | b_ = p3[1] + 1. / a * p3[0] 139 | return a_, b_ 140 | 141 | 142 | def get_perp_bisect(p1, p2, p3, h=1e-7): 143 | ''' 144 | p1, p2: line segment end points 145 | p3: point outside line 146 | ''' 147 | a, b = get_line_eq(p1, p2) 148 | if a == 0: 149 | a = h 150 | a_ = - 1. / a 151 | b_ = p3[1] + 1. / a * p3[0] 152 | x = (b_ - b) / (a - a_) 153 | y = a * x + b 154 | return x, y 155 | 156 | 157 | def dist(p1, p2): 158 | return np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2) 159 | 160 | 161 | def is_between(p1, p2, p3): 162 | xmax = max((p1[0]), p2[0]) 163 | xmin = min((p1[0]), p2[0]) 164 | if (p3[0] < xmax) & (p3[0] > xmin): 165 | return True 166 | else: 167 | return False 168 | 169 | 170 | def min_dist(p1, p2, p3): 171 | p4 = get_perp_bisect(p1, p2, p3) 172 | if is_between(p1, p2, p4): 173 | return p4, dist(p3, p4) 174 | d1 = dist(p3, p1) 175 | d2 = dist(p3, p2) 176 | return [p1, p2][np.argmin([d1, d2])], min((d1, d2)) 177 | 178 | 179 | def closest_point_to_poly(poly, point, return_dist=False): 180 | D = [] 181 | P = [] 182 | for i in range(len(poly)): 183 | if i == (len(poly) - 1): 184 | p, d = min_dist(poly[i], poly[0], point) 185 | else: 186 | p, d = min_dist(poly[i], poly[i + 1], point) 187 | D.append(d) 188 | P.append(p) 189 | idx = np.argmin(D) 190 | if return_dist: 191 | return D[idx] 192 | return P[idx] 193 | 194 | 195 | def distance_pix_to_poly(poly, point): 196 | poly_point = closest_point_to_poly(poly, point) 197 | return dist(poly_point, point) 198 | 199 | 200 | 201 | def add_points(poly, numpoints=100): 202 | # increase number of points by splitting largest line segments in half 203 | while poly.shape[0] < numpoints: 204 | idx = np.argmax([dist(poly[i], poly[i+1]) for i in range(poly.shape[0]-1)]) 205 | new_point = (poly[idx] + poly[idx+1]) / 2. 
206 | poly = np.insert(poly, idx+1, new_point, 0) 207 | return poly 208 | 209 | 210 | def interp1d(N, Nmax, Nmin, tmax, tmin): 211 | return (N - Nmin) / (Nmax - Nmin) * (tmax - tmin) + tmin 212 | 213 | 214 | def simplify_poly_points(poly, numpoints=20, iter_max=20): 215 | numpoints_init = poly.shape[0] 216 | 217 | if numpoints_init == numpoints: 218 | return poly 219 | elif numpoints_init < numpoints: 220 | return add_points(poly, numpoints) 221 | else: # get initial values of t that lead to larger and smaller polygons 222 | Nmax = numpoints_init 223 | t = 5 224 | while simplify_coords(poly, t).shape[0] >= numpoints: 225 | t *= 2 226 | Nmin = simplify_coords(poly, t).shape[0] 227 | Tmax, Tmin = 0, t 228 | 229 | iter = 0 230 | while True: 231 | t = interp1d(numpoints, Nmax, Nmin, Tmax, Tmin) 232 | poly_ = simplify_coords(poly, t) 233 | N = poly_.shape[0] 234 | # print(N, t) 235 | if N == numpoints: 236 | break 237 | elif N > numpoints: 238 | Nmax, Tmax = N, t 239 | elif N < numpoints: 240 | Nmin, Tmin = N, t 241 | iter += 1 242 | if iter > iter_max: 243 | poly_ = simplify_coords(poly, Tmin) 244 | return add_points(poly_, numpoints) 245 | return poly_ 246 | 247 | 248 | def is_valid(parcel_poly, pxmin, pymax, res=10): 249 | """ 250 | checks if parcel_poly polygon has valid shape 251 | """ 252 | isvalid = True 253 | i = 0 254 | j = 0 255 | pix_points = [[pxmin + loc[0] * res, pymax - loc[1] * res] for loc in 256 | [[j, i], [j + 1, i], [j + 1, i + 1], [j, i + 1], [j, i]]] 257 | try: 258 | parcel_poly.intersection(geometry.Polygon(pix_points)).area 259 | except: 260 | isvalid = False 261 | return isvalid 262 | 263 | 264 | def str_line_eq(points, h=1e-1): 265 | assert points.shape == (2, 2), 'Two points must be used to derive straight line equation' 266 | x1, y1 = points[0] 267 | x2, y2 = points[1] 268 | denom = x2 - x1 269 | if denom == 0: 270 | denom = h 271 | a = (y2 - y1) / denom # (x2 - x1) 272 | b = (y1 * x2 - x1 * y2) / denom # (x2 - x1) 273 | return a, b 274 | 275 | 276 | # def find_samples_in_poly(N, W, h, w, T, data_df): 277 | # is_in_prod = (N >= data_df['north']) & (N - 10 * h <= (data_df['north'])) & \ 278 | # (W <= data_df['west']) & (W + 10 * w >= (data_df['west'])) 279 | # prod_doy = get_doy(T) 280 | # data_doy = (data_df[[c for c in data_df.columns if c.startswith("doy")]] * 365.0001).round(0).astype(np.int32) 281 | # doy_idx = (data_doy == prod_doy).any(axis=1) 282 | # return data_df[is_in_prod & doy_idx] 283 | -------------------------------------------------------------------------------- /utils/multiprocessing_utils.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | 3 | 4 | def flatten_list(l): 5 | return [item for sublist in l for item in sublist] 6 | 7 | 8 | def run_pool(x, f, num_cores, split=False): 9 | if not split: 10 | x = split_num_segments(x, num_cores) 11 | print(len(x)) 12 | # x = [[x_, i] for i, x_ in enumerate(x)] 13 | pool = Pool(num_cores) 14 | res = pool.map(f, x) 15 | return res 16 | 17 | 18 | def split_num_segments(inlist, num_segments): 19 | res = [[] for _ in range(num_segments)] 20 | i = 0 21 | while len(inlist) > 0: 22 | if i < num_segments: 23 | res[i].append(inlist.pop()) 24 | else: 25 | res[i % num_segments].append(inlist.pop()) 26 | i += 1 27 | return res 28 | 29 | 30 | def split_size_segments(inlist, seg_size): 31 | i = 0 32 | newlist = [] 33 | while len(inlist) - len(newlist) * seg_size > seg_size: 34 | newlist.append(inlist[i * seg_size: (i + 1) * seg_size]) 35 | i += 1 36 
| if len(inlist) - len(newlist) * seg_size > 0: 37 | newlist.append(inlist[i * seg_size:]) 38 | return newlist 39 | 40 | 41 | def split_df(df, num_segments): 42 | idx = df.index.to_list() 43 | idx_segments = split_num_segments(idx, num_segments) 44 | return [df.iloc[idx_seg].reset_index(drop=True) for idx_seg in idx_segments] 45 | -------------------------------------------------------------------------------- /utils/sentinel_products_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | import pandas as pd 4 | import rasterio 5 | 6 | 7 | def get_S2prod_info(imdirs): 8 | data = [] 9 | for imdir in imdirs: 10 | imname = "%s_%s" % (imdir.split("/")[-2].split("_")[-3], imdir.split("/")[-4].split("_")[-5]) 11 | f = rasterio.open("%s/%s_B02.jp2" % (imdir, imname)) 12 | tile_transform = list(f.meta['transform']) 13 | tile_wn = [tile_transform[2], tile_transform[5]] 14 | 15 | data.append([imdir, imdir.split("/")[-2], imname, tile_wn[0], tile_wn[1], f.meta['height'], 16 | f.meta['width'], imname.split("_")[1][:8], f.crs.to_dict()['init']]) 17 | df = pd.DataFrame(data=data, 18 | columns=["path", "prod_name1", "prod_name2", "West", "North", "height", "width", "Time", 19 | "crs"]) # , 20 | # dtype=[np.str, np.str, np.float32, np.float32, np.float32, np.float32, np.str, np.str]) 21 | return df 22 | 23 | 24 | def get_S2tile_coords(basedir): 25 | """ 26 | basedir: directory containing sentinel-2 products 27 | """ 28 | basedir = '/media/michaeltrs/0a8a5a48-ede5-47d0-8eff-10d11350bf98/Satellite_Data/Sentinel2/PSETAE_repl/2018/cloud_0_30' 29 | if basedir.split('.')[-1] == 'SAFE': 30 | imdir = basedir 31 | elif os.path.dir(basedir): 32 | files = glob('%s/*.SAFE' % basedir) 33 | tile = [s.split('/')[-1].split('_')[5] for s in files] 34 | assert all([t == tile[0] for t in tile]), "not all products in dir correspond to the same tile" 35 | imdir = files[0] 36 | imdir = glob("%s/GRANULE/**/IMG_DATA" % imdir)[0] 37 | # info = get_S2prod_info(filename) 38 | imname = "%s_%s" % (imdir.split("/")[-2].split("_")[-3], imdir.split("/")[-4].split("_")[-5]) 39 | f = rasterio.open("%s/%s_B02.jp2" % (imdir, imname)) 40 | tile_transform = list(f.meta['transform']) 41 | tile_wn = [tile_transform[2], tile_transform[5]] 42 | tile_es = [tile_wn[0] + 10 * f.meta['width'], tile_wn[1] - 10 * f.meta['height']] 43 | return tile_wn, tile_es 44 | 45 | 46 | --------------------------------------------------------------------------------