├── README.md └── python ├── 01-splitshp-shadow.py ├── 02-pathfinder.py ├── 02-safefinder.py ├── 03-arrayextractor.py ├── 04-flatten-temporal.py ├── 05-histogramize-shadow.py ├── 05-medianize.py ├── 06-histo2stack.py ├── 06-median2stack.py ├── 07-medianstack2ARD.py ├── 07-stack2ARD.py ├── 07C-doyFusion-median.py ├── 07C-doyFusion.py ├── 08-mergeTarget-parallel.py ├── 08A-removeDuplicates-parallel.py ├── 08B-mergeObservations-parallel.py ├── 09-runRF-article-iterate.py └── 09-runTCN-article-iterate.py /README.md: -------------------------------------------------------------------------------- 1 | # Scalable crop yield mapping with Sentinel-2 time series and temporal convolutional network (TCN) 2 | 3 | This repository includes code for preprocessing data from the Sentinel-2 L2A product into time series ready for the prediction models: a temporal convolutional network (TCN) and random forests (RF). 4 | 5 | In python/: 6 | 7 | - 01-splitshp-shadow.py: splits an ESRI shapefile of polygons (field parcels) into subsets (files) by Sentinel-2 granule boundaries. 8 | - 02-pathfinder.py: searches file paths to Sentinel-2 bands. Use this if no cloud masking is intended. 9 | - 02-safefinder.py: searches directory paths to Sentinel-2 SAFE directories. Use this if cloud masking is wanted. 10 | - 03-arrayextractor.py: extracts pixel values from bands by polygon. A cloud mask is applied if SAFE paths are given. 11 | - 04-flatten-temporal.py: flattens the observations into 11-day temporal composites. 12 | - 05-histogramize-shadow.py: calculates histograms for each observation (band). 13 | - 05-medianize.py: calculates the median for each observation (band). 14 | - 06-histo2stack.py: stacks histograms from separate files into one file. 15 | - 06-median2stack.py: stacks medians from separate files into one file. 16 | - 07-medianstack2ARD.py: makes analysis-ready data from medians. 17 | - 07-stack2ARD.py: makes analysis-ready data from histograms. 18 | - 07C-doyFusion-median.py: if there are duplicates at a day-of-year, merges all observations per day per farm into one (matrix addition). 19 | - 07C-doyFusion.py: if there are duplicates at a day-of-year, merges all observations per day per farm into one (matrix addition). 20 | - 08A-removeDuplicates-parallel.py: removes duplicates, if any, by computing matrix addition. 21 | - 08B-mergeObservations-parallel.py: merges farms by region. 22 | - 08-mergeTarget-parallel.py: merges values with the reference data to write target y files for training. 23 | - 09-runRF-article-iterate.py: runs RF, iterating 10 times for each data set (hard-coded). 24 | - 09-runTCN-article-iterate.py: runs TCN, iterating 10 times for each data set (hard-coded). 25 | 26 | 27 | -------------------------------------------------------------------------------- /python/01-splitshp-shadow.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 2020-06-01 MY 4 | 5 | Usage: 6 | python splitshp-shadow.py --s2tiles suomiTiles.shp \ 7 | --fullshapefile shapefile --outshpdir satotutkimus-shpPerTile/ --out_file farmIDtile.tsv 8 | 9 | Modified version of EODIE splitshp.py 10 | 11 | """ 12 | 13 | import os 14 | from osgeo import osr 15 | import subprocess 16 | import sys 17 | 18 | import pandas as pd 19 | import geopandas as gpd 20 | 21 | import argparse 22 | import textwrap 23 | import pathlib 24 | 25 | 26 | def main(args): 27 | try: 28 | if not args.fullshapefile or not args.s2tiles: 29 | raise Exception('Missing shapefile argument. 
Try --help .') 30 | 31 | print(f'\n\nsplitshp-shadow.py') 32 | print(f'\nSentinel2 tiles: {args.s2tiles}') 33 | print(f'ESRI shapefile parcels: {args.fullshapefile}') 34 | out_dir_path = pathlib.Path(os.path.expanduser(args.outshpdir)) 35 | out_dir_path.mkdir(parents=True, exist_ok=True) 36 | 37 | 38 | out_file = args.out_file 39 | 40 | print('Reading parcels...') 41 | 42 | def checkProjection(myshp): 43 | print('INFO: checking the projection of the inputfile now') 44 | head, tail = os.path.split(myshp) 45 | root, ext = os.path.splitext(tail) 46 | rootprj = root + '.prj' 47 | projectionfile = os.path.join(head, rootprj) 48 | prj_file = open(projectionfile , 'r') 49 | prj_text = prj_file.read() 50 | srs = osr.SpatialReference() 51 | srs.ImportFromESRI([prj_text]) 52 | srs.AutoIdentifyEPSG() 53 | epsgcode = srs.GetAuthorityCode(None) 54 | if epsgcode == '3067': 55 | print('INFO: input shapefile has EPSG 3067, that works!') 56 | return myshp 57 | else: 58 | reprojectedshape = os.path.join(head, root + '_reprojected_3067'+ ext) 59 | #if not os.path.exists(reprojectedshape): 60 | reprojectcommand = 'ogr2ogr -t_srs EPSG:3067 ' + reprojectedshape + ' ' + myshp 61 | subprocess.call(reprojectcommand, shell=True) 62 | print('INFO: input shapefile had other than EPSG 3067, but was reprojected and works now') 63 | return reprojectedshape 64 | 65 | 66 | # bringing all input shapefiles to EPSG 3067 67 | print('Parcel shapefile: ') 68 | fullshapefile2 = checkProjection(args.fullshapefile) 69 | print('Sentinel2 shapefile: ') 70 | s2tiles2 = checkProjection(args.s2tiles) 71 | 72 | 73 | # filename: 74 | originalname = os.path.splitext(os.path.split(fullshapefile2)[-1])[0] 75 | 76 | 77 | # Tehdään loput geopandalla: 78 | 79 | tiles = gpd.read_file(s2tiles2) 80 | parcelshp = gpd.read_file(fullshapefile2) 81 | 82 | print(f'There are ', len(parcelshp), ' parcels in the input shapefile.') 83 | 84 | # for bookkeeping 85 | df = pd.DataFrame(columns = ['farmID', 'Tile']) 86 | 87 | for index, row in tiles.iterrows(): # Looping over all tiles 88 | tilename = row['Name'] 89 | 90 | # is there any parcels on this tile's BBOX: 91 | xmin, ymin, xmax, ymax = row['geometry'].bounds 92 | parcels = parcelshp.cx[xmin:xmax, ymin:ymax] 93 | 94 | if not parcels.empty: 95 | 96 | res_intersection = parcels['geometry'].within(row['geometry']) 97 | if any(res_intersection): 98 | parcelsToFile = parcels[res_intersection] 99 | outshpname = os.path.join(args.outshpdir,originalname + '_' + str(tilename)+'.shp') 100 | parcelsToFile.crs = '+proj=utm +zone=35 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs' 101 | parcelsToFile.to_file(outshpname) 102 | 103 | writeParcels = pd.DataFrame(parcelsToFile['farmID']) 104 | writeParcels['Tile'] = tilename 105 | #print(writeParcels) 106 | writeParcels.to_csv(out_file, mode='a', header=False) 107 | 108 | df = df.append(writeParcels, ignore_index = True) 109 | 110 | print(f'Intersecting farmIDs and tiles saved to {out_file}.') 111 | #df.to_csv(out_file, sep = '\t', index = False) 112 | 113 | print(f'\nDone.') 114 | 115 | except Exception as e: 116 | print('\n\nUnable to read input or write out files. 
Check prerequisites and see exception output below.') 117 | parser.print_help() 118 | raise e 119 | 120 | 121 | if __name__ == '__main__': 122 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 123 | epilog=textwrap.dedent(__doc__)) 124 | parser.add_argument('-s', '--s2tiles', 125 | type=str, 126 | help='Sentinel-2 tiles.') 127 | parser.add_argument('-a', '--fullshapefile', 128 | type=str, 129 | help='ESRI shapefile containing a set of polygons (.shp with its auxiliary files)') 130 | parser.add_argument('-d', '--outshpdir', 131 | help='Directory for output shp files', 132 | type=str, 133 | default='.') 134 | parser.add_argument('-o', '--out_file', 135 | help='Output (e.g. .tsv) tab-separated file containing farmID and the tile it was found at.', 136 | type=str, 137 | default='farmIDtile.tsv') 138 | 139 | args = parser.parse_args() 140 | main(args) 141 | 142 | -------------------------------------------------------------------------------- /python/02-pathfinder.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | 5 | EODIE PathFinder creates paths to all files between start- and enddate 6 | 7 | """ 8 | import os 9 | import userinput 10 | 11 | def makefilepaths(userinput): 12 | 13 | tilepath = userinput.datadir 14 | filepaths = [] 15 | 16 | #for tilepath in tilepaths: 17 | for filename in os.listdir(tilepath): 18 | print(filename) 19 | date = filename.split('_')[2].split('T')[0] 20 | if not userinput.enddate is None and not userinput.startdate is None: 21 | if date <= userinput.enddate and date >= userinput.startdate: 22 | filepath = os.path.join(tilepath,filename) 23 | filepaths.append(filepath) 24 | else: 25 | filepath = os.path.join(tilepath,filename) 26 | filepaths.append(filepath) 27 | 28 | return filepaths 29 | 30 | def makebandname(userinput): 31 | 32 | bandnames = [] 33 | 34 | for band in userinput.bandlist: 35 | if band > 10: 36 | if band == 13: 37 | bandname = 'B8A_20m' 38 | else: 39 | bandname = 'B'+str(band) + '_20m' 40 | elif band in [9,1]: 41 | bandname = 'B0'+str(band) +'_60m' 42 | elif band in [2,3,4,8]: 43 | bandname = 'B0'+str(band) +'_10m' 44 | else: 45 | bandname = 'B0'+str(band) + '_20m' 46 | 47 | bandnames.append(bandname) 48 | return bandnames 49 | 50 | def makebandpaths(): 51 | 52 | ui = userinput.UserInput() 53 | 54 | #from all filepaths, extend to matching band paths 55 | 56 | filepaths = makefilepaths(ui) 57 | bandpaths = [] 58 | 59 | for filepath in filepaths: 60 | granulepath = os.path.join(filepath,'GRANULE') 61 | betweenpath = os.path.join(granulepath,os.listdir(granulepath)[0]) 62 | imgpath = os.path.join(betweenpath,'IMG_DATA') 63 | r10 = os.path.join(imgpath,'R10m') 64 | r20 = os.path.join(imgpath,'R20m') 65 | r60 = os.path.join(imgpath,'R60m') 66 | bandlist = makebandname(ui) 67 | #print(bandlist) 68 | for rdir in [r10,r20,r60]: 69 | mylist = [os.path.join(rdir,bandfile) for bandfile in os.listdir(rdir) if bandfile.split('_')[-2] +'_' + bandfile.split('_')[-1][:3] in bandlist] 70 | bandpaths.extend(mylist) 71 | #print(mylist) 72 | 73 | to_txt(bandpaths) 74 | 75 | def to_txt(paths): 76 | 77 | with open('bandpaths.txt', 'w') as f: 78 | for item in paths: 79 | f.write("%s\n" % item) 80 | 81 | 82 | if __name__ == "__main__": 83 | makebandpaths() 84 | 85 | 86 | -------------------------------------------------------------------------------- /python/02-safefinder.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 
safefinder.py creates paths to all files between start- and enddate. 4 | 5 | 17.8.2021 MY modified from EODIE pathfinder.py to find safe dirs. 6 | 7 | RUN: 8 | python safefinder.py -s 20200501 -e 20200905 -d /scratch/project_2002694/safedirs 9 | 10 | """ 11 | import os 12 | import userinput 13 | 14 | import argparse 15 | import textwrap 16 | import pathlib 17 | 18 | 19 | 20 | def makesafepaths(datadir, startdate, enddate): 21 | 22 | tilepath = datadir 23 | filepaths = [] 24 | 25 | for filename in os.listdir(tilepath): 26 | date = filename.split('_')[2].split('T')[0] 27 | if not enddate is None and not startdate is None: 28 | if date <= enddate and date >= startdate: 29 | filepath = os.path.join(tilepath,filename) 30 | filepaths.append(filepath) 31 | else: 32 | filepath = os.path.join(tilepath,filename) 33 | filepaths.append(filepath) 34 | 35 | to_txt(filepaths) 36 | 37 | def to_txt(paths): 38 | 39 | with open('../bin/safepaths.txt', 'w') as f: 40 | for item in paths: 41 | f.write("%s\n" % item) 42 | 43 | 44 | def main(args): 45 | try: 46 | if not args.datapath: 47 | raise Exception('Missing input dir argument. Try --help .') 48 | 49 | print(f'\n\nsafefinder.py') 50 | print(f'\n\nLists all SAFE directories within the start and end date.\n Writes the list to ../bin/safepaths.txt.') 51 | 52 | makesafepaths(args.datapath, args.startdate, args.enddate) 53 | 54 | except Exception as e: 55 | print('\n\nUnable to read input or write out files. Check prerequisites and see exception output below.') 56 | parser.print_help() 57 | raise e 58 | 59 | 60 | 61 | 62 | if __name__ == '__main__': 63 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 64 | epilog=textwrap.dedent(__doc__)) 65 | parser.add_argument('-d', '--datapath', 66 | type=str, 67 | help='Directory path to safe directories') 68 | parser.add_argument('-s', '--startdate', 69 | type=str, 70 | help='Start date, e.g. 20200501') 71 | parser.add_argument('-e', '--enddate', 72 | help='End date, e.g. 20200901', 73 | type=str, 74 | default='.') 75 | 76 | args = parser.parse_args() 77 | main(args) 78 | 79 | -------------------------------------------------------------------------------- /python/03-arrayextractor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Originally Samantha Wittke in 2020 for EODIE. 3 | 4 | Modified by Maria Yli-Heikkila, Markku Luotamo 5 | 6 | 2020-11-12 Commented out meta data writer (extractmeta), useful only for testing purposes. 7 | 2021-08 ML added cloud masking 8 | 2021-09-06 MY added option to use tempdir 9 | 2021-09-24 MY changed to save csv in UNIX format, not DOS; not saving empty files anymore 10 | 11 | USAGE: 12 | python 03-arrayextractor.py -f $name -shp $shppath -p $projectpath -jn ${ID} -id $idname -r 10 -t $TEMPDIRPATH 13 | 14 | WHERE: 15 | -f: raster file path 16 | -shp: polygons shapefile path 17 | -p: output path 18 | -jn: job number ID 19 | -id: name of the identifier variable in shapefile (e.g. 'parcelID') 20 | -r: for multi-band operation you must specify a common target resolution (e.g. 
10) 21 | -t: temporary directory path 22 | 23 | 24 | """ 25 | 26 | import csv 27 | import os 28 | import re 29 | from datetime import datetime 30 | from glob import glob 31 | from shutil import copyfile 32 | from typing import Optional 33 | from rasterstats import zonal_stats 34 | import functools 35 | 36 | import numpy as np 37 | import rasterio 38 | import shapeobject 39 | import userinput 40 | import traceback 41 | 42 | BANDS = ['B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B11', 'B12'] 43 | 44 | FILTER_OUT_SEN2COR_CLASSES = [0, 1, 3, 8, 9, 10] # No data, Cloud Shadows + Clouds medium+high probability + Cirrus 45 | #[0, 1, 3, 8, 9, 10] 46 | CLOUD_MASK_RESO_M = 20 47 | NO_DATA = np.nan 48 | INVALID = 0 49 | 50 | #direct array extraction 51 | 52 | def main(): 53 | 54 | ui = userinput.UserInput() 55 | jobnumber = ui.jobnumber 56 | bandpathtxt = ui.bandpath 57 | tile = parse_tile_from_path(bandpathtxt) 58 | shapedir = ui.shapedir 59 | namelist = os.listdir(shapedir)[0].split('_') 60 | shapename = '_'.join(namelist[:-1]) 61 | tileshapename = shapename + '_' + tile 62 | shapefile = os.path.join(shapedir, tileshapename) 63 | projectname = ui.projectname 64 | if not os.path.exists(projectname): 65 | print(f'Creating output direcory {projectname}...') 66 | os.makedirs(projectname) 67 | if ui.tmpdir: 68 | tmpdir = ui.tmpdir 69 | else: 70 | tmpdir = projectname 71 | shpfile = None 72 | if not jobnumber is None: 73 | for ext in ['.shp','.shx','.prj','.dbf']: 74 | shp = shapefile + ext 75 | if os.path.isfile(shp): 76 | #print(shp) 77 | jobdir = os.path.join(tmpdir,'temp',jobnumber) 78 | dst = os.path.join(jobdir, tileshapename + ext) 79 | if not os.path.exists(jobdir): 80 | os.makedirs(jobdir) 81 | copyfile(shp, dst) 82 | if dst.endswith('.shp'): 83 | shpfile = dst 84 | else: 85 | shpfile = shapefile + '.shp' 86 | 87 | extractarray(bandpathtxt, shpfile, tile, projectname, ui) 88 | 89 | 90 | def extractarray(raster_path, 91 | shpfile, 92 | tile, 93 | projectname, 94 | ui): 95 | 96 | cloud_mask_path: Optional[str] 97 | band_paths: [str] 98 | 99 | cloud_mask_path, band_paths = expand_raster_paths(raster_path, ui) 100 | 101 | shapeobj: shapeobject.ShapeObject = shapeobject.ShapeObject(shpfile) 102 | 103 | if cloud_mask_path: 104 | shpfile: str = shapeobj.checkProjection(cloud_mask_path) 105 | else: 106 | shpfile: str = shapeobj.checkProjection(raster_path) 107 | 108 | if cloud_mask_path: 109 | assert ui.target_resolution_m, 'For cloud masking you must specify a common target resolution.' 
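        # The SCL scene classification raster is cropped per parcel here using nearest-neighbour
        # resampling; filter_band_using_mask() later drops pixels whose SCL class is listed in
        # FILTER_OUT_SEN2COR_CLASSES (no data, cloud shadows, clouds, cirrus).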
110 | try: 111 | parcel_cloud_masks: [dict] = crop_band_raster_per_parcel(cloud_mask_path, shpfile, ui.target_resolution_m, 112 | resampling=rasterio.enums.Resampling.nearest) 113 | except Exception as e: 114 | print(f'Error reading cloud mask for tile {tile} from "{cloud_mask_path}"') 115 | raise e 116 | 117 | for band_path in band_paths: 118 | 119 | date: str = parse_date_from_path(band_path) 120 | band: str = parse_band_from_path(band_path) 121 | 122 | if ui.target_resolution_m: 123 | target_resolution_m: int = ui.target_resolution_m 124 | else: 125 | target_resolution_m: int = parse_resolution_from_path(band_path) 126 | try: 127 | band_raster_per_parcel: [dict] = crop_band_raster_per_parcel(band_path, shpfile, target_resolution_m) 128 | except: 129 | print(f'Error reading band {band} for tile {tile} ') 130 | 131 | csv_rows = [] 132 | 133 | if not cloud_mask_path: 134 | parcel_cloud_masks = [None] * len(band_raster_per_parcel) 135 | 136 | for parcel_band_raster, parcel_cloud_mask in zip(band_raster_per_parcel, parcel_cloud_masks): 137 | filtered_band: np.ndarray = filter_band_using_mask(parcel_band_raster, parcel_cloud_mask) 138 | if np.count_nonzero(filtered_band) == 0: 139 | continue 140 | 141 | parcel_id: str = parcel_band_raster['properties'][ui.idname] 142 | row: [str] = [parcel_id] + filtered_band.flatten().tolist() 143 | csv_rows.append(row) 144 | 145 | if csv_rows: 146 | tocsv(date,band,csv_rows,tile,projectname) 147 | 148 | 149 | def maximal_resolution_band_paths(bands: [str], band_root_path: str, max_resolution_m: int): 150 | candidate_paths = list(filter(lambda p: parse_resolution_from_path(p) 151 | and parse_resolution_from_path(p) >= max_resolution_m 152 | and parse_band_from_path(p) in bands, 153 | glob(f'{band_root_path}/**/*.jp2', recursive=True))) 154 | candidate_path_resolutions = list(map(parse_resolution_from_path, candidate_paths)) 155 | candidate_path_bands = list(map(parse_band_from_path, candidate_paths)) 156 | candidate_paths_sorted = sorted(list(zip(candidate_path_resolutions, candidate_paths, candidate_path_bands)), 157 | key=lambda t: t[0]) 158 | max_reso_path_by_band = {} 159 | for candidate in candidate_paths_sorted: 160 | reso, path, band = candidate 161 | if band in max_reso_path_by_band: 162 | continue 163 | else: 164 | max_reso_path_by_band[band] = path 165 | 166 | return list(max_reso_path_by_band.values()) 167 | 168 | 169 | def filter_band_using_mask(parcel_band_raster: [dict], parcel_cloud_mask: [dict]): 170 | parcel_array = parcel_band_raster['properties']['mini_raster_array'].filled(NO_DATA) 171 | 172 | if parcel_cloud_mask: 173 | cloud_array = parcel_cloud_mask['properties']['mini_raster_array'].filled(INVALID) 174 | cloud_mask = np.logical_not(sen2cor_binary_transformer(cloud_array)) 175 | filtered_array = parcel_array[cloud_mask] 176 | else: 177 | filtered_array = parcel_array 178 | 179 | return filtered_array[np.isfinite(filtered_array)].astype(np.uint16) 180 | 181 | 182 | def crop_band_raster_per_parcel(band_path:str, shpfile: str, target_resolution_m: int, 183 | resampling=rasterio.enums.Resampling.bilinear) -> [dict]: 184 | band_data, tile_band_resample_transform = \ 185 | resampled_raster_dataset(band_path, 186 | parse_resolution_from_path(band_path) / target_resolution_m, 187 | resampling=resampling) 188 | try: 189 | bandwise_zstats = zonal_stats(shpfile, 190 | band_data, 191 | affine=tile_band_resample_transform, 192 | stats=['count', 'nodata'], 193 | band=1, 194 | nodata=-999, 195 | geojson_out=True, 196 | all_touched=False, 197 | 
raster_out=True) 198 | except Exception as e: 199 | print(f'Error extracting polygons from raster "{band_path}" and shp "{shpfile}":') 200 | traceback.print_exc() 201 | raise e 202 | 203 | return bandwise_zstats 204 | 205 | 206 | def expand_raster_paths(raster_path: str, ui: userinput.UserInput): 207 | if '.jp2' in raster_path: # single band 208 | raster_paths = [raster_path] 209 | cloud_mask_path = ui.cloud_mask_path 210 | else: # all eligible bands in a SAFE dir 211 | assert ui.target_resolution_m, 'For multi-band operation you must specify a common target resolution.' 212 | raster_paths = maximal_resolution_band_paths(BANDS, raster_path, ui.target_resolution_m) 213 | cloud_mask_path = safe_cloud_mask_path(raster_path) 214 | 215 | return cloud_mask_path, raster_paths 216 | 217 | 218 | def resampled_raster_dataset(raster_path, scaling_factor, resampling=rasterio.enums.Resampling.bilinear): 219 | try: 220 | with rasterio.open(raster_path) as dataset: 221 | raster_data = dataset.read(1, 222 | out_shape=( 223 | dataset.count, 224 | int(dataset.height * scaling_factor), 225 | int(dataset.width * scaling_factor) 226 | ), 227 | resampling=resampling) 228 | resample_transform = \ 229 | dataset.transform * dataset.transform.scale( 230 | (dataset.width / raster_data.shape[-1]), 231 | (dataset.height / raster_data.shape[-2]) 232 | ) 233 | except Exception as e: 234 | print(f'Error reading raster file "{raster_path}":') 235 | traceback.print_exc() 236 | raise e 237 | return raster_data, resample_transform 238 | 239 | 240 | def parse_resolution_from_path(p: str): 241 | groups = re.match('.*_([0-9]{2})m.*', p) 242 | return int(groups[1]) if groups else None 243 | 244 | 245 | def parse_band_from_path(p: str): 246 | groups = re.match('.*_(B[0-9].).*', p) 247 | return groups[1] if groups else None 248 | 249 | 250 | def parse_tile_from_path(p: str): 251 | groups = re.match('.*T([0-9]{2}[A-Z]{3}).*', p) 252 | return groups[1] if groups else None 253 | 254 | 255 | def parse_date_from_path(rasterpath: str): 256 | return os.path.split(rasterpath)[-1].split('_')[1][:8] 257 | 258 | 259 | def safe_cloud_mask_path(safe_root: str): 260 | return glob(f'{safe_root}/**/*_SCL_20m.jp2', recursive=True)[0] 261 | 262 | 263 | def array_value_in_one_of(arr: np.ndarray, vals: list): 264 | return functools.reduce(lambda acc, class_ix: np.logical_or(acc,arr == class_ix), vals, False) 265 | 266 | 267 | def sen2cor_binary_transformer(array_raw): 268 | return array_value_in_one_of(array_raw, FILTER_OUT_SEN2COR_CLASSES) 269 | 270 | 271 | def tocsv(date,band,myarray,tile,projectname): 272 | csvfile = os.path.join(projectname,'array_'+tile + '_' + date +'_'+ band+'.csv') 273 | with open(csvfile, "w", newline='') as f: 274 | writer = csv.writer(f, lineterminator=os.linesep) 275 | writer.writerows(myarray) 276 | 277 | 278 | def extractmeta(bandtif, parcelID, mydate, count, nodata, projectname, band, tile): 279 | 280 | #(parcel_ID, year, day-of-year, name of the file (tile), mission ID (SA|SB), count) 281 | 282 | #band and tile could be gotten from bandtif 283 | 284 | metadatacsv = os.path.join(projectname,'meta_'+tile + '_' + mydate +'_'+ band+'.csv') 285 | 286 | mycolumns = ['parcelID','year','DOY','tilefilename','missionID','count', 'nodata'] 287 | 288 | if not os.path.exists(metadatacsv): # write the header 289 | with open(metadatacsv,'w') as csvfile: 290 | writer = csv.writer(csvfile, delimiter=',') 291 | writer.writerow(mycolumns) 292 | 293 | year = mydate[0:4] 294 | 295 | dateobj = datetime.strptime(mydate, '%Y%m%d') 296 | doy 
= (dateobj - datetime(dateobj.year, 1, 1)).days + 1 297 | 298 | bandtif = bandtif.split('/')[-6] 299 | tilefilename = ('_').join(bandtif.split('_')[0:6]) 300 | 301 | missionID = bandtif.split('_')[0] 302 | 303 | onerow = [parcelID, year, doy, tilefilename, missionID, count, nodata] 304 | 305 | with open(metadatacsv,'a') as csvfile: 306 | writer = csv.writer(csvfile, delimiter=',') 307 | writer.writerow(onerow) 308 | 309 | 310 | if __name__ == "__main__": 311 | main() 312 | -------------------------------------------------------------------------------- /python/04-flatten-temporal.py: -------------------------------------------------------------------------------- 1 | """ 2 | MY 2022-03-19 3 | 4 | Flatten observations' temporal dimension: 11-days combined 5 | 6 | RUN: python 04-flatten-temporal.py -i cloudless/results_1110_2018 -o cloudless/results_1110_2018 \ 7 | -c 19 8 | 9 | WHERE: 10 | i: input dir 11 | o: output dir 12 | c: Number of cores to use 13 | 14 | """ 15 | 16 | import os 17 | 18 | import argparse 19 | import textwrap 20 | from pathlib import Path 21 | 22 | from itertools import repeat 23 | from multiprocessing import Pool 24 | 25 | 26 | def concatSeparateDatesIntoOne(arrayfile, datadir, out_dir_path): 27 | 28 | if arrayfile.endswith('.csv') and arrayfile.startswith('array_'): 29 | #print(arrayfile) 30 | tile = arrayfile.split('_')[1] 31 | date0 = arrayfile.split('_')[2][:-3] 32 | date = int(arrayfile.split('_')[2][-2:]) 33 | month = arrayfile.split('_')[2][-3:-2] # works only for Jan-Sept 34 | monthNext = str(int(month) + 1) 35 | tail = arrayfile.split('_')[3] 36 | 37 | if date < 11: 38 | newdate = month + '11' 39 | elif (date >= 11 and date < 21): 40 | newdate = month + '21' 41 | else: 42 | newdate = monthNext + '01' 43 | 44 | newarrayfile = 'array_' + tile + '_' + date0 + newdate + '_' + tail 45 | #print(newarrayfile) 46 | 47 | arraypath = os.path.join(datadir,arrayfile) 48 | outputpath = os.path.join(out_dir_path,newarrayfile) 49 | 50 | os.system('cat {} >> {}' .format(str(arraypath), str(outputpath))) 51 | 52 | # Done. 53 | 54 | 55 | def main(args): 56 | try: 57 | if not args.inputpath or not args.outdir: 58 | raise Exception('Missing input or output dir argument. Try --help .') 59 | 60 | print(f'\n\n04-flatten-temporal.py') 61 | print(f'\nInput files in {args.inputpath}') 62 | 63 | fp = args.inputpath 64 | 65 | print(f'\nSaving time flattened arrays into {args.outdir}...') 66 | 67 | datadir = args.inputpath 68 | 69 | list_of_files = os.listdir(datadir) 70 | p = Pool(args.ncores) 71 | p.starmap(concatSeparateDatesIntoOne, zip(list_of_files, repeat(datadir), repeat(args.outdir))) 72 | # wait for all tasks to finish 73 | p.close() 74 | 75 | 76 | except Exception as e: 77 | print('\n\nUnable to read input or write out results. 
Check prerequisites and see exception output below.') 78 | parser.print_help() 79 | 80 | raise e 81 | 82 | if __name__ == '__main__': 83 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 84 | epilog=textwrap.dedent(__doc__)) 85 | parser.add_argument('-i', '--inputpath', 86 | type=str, 87 | help='Path to the directory with reflectance values.') 88 | parser.add_argument('-o', '--outdir', 89 | type=str, 90 | help='Name of the output directory.') 91 | parser.add_argument('-c', '--ncores', 92 | type=int, 93 | help='Number of cores to use.', 94 | default = 1) 95 | 96 | 97 | args = parser.parse_args() 98 | main(args) 99 | -------------------------------------------------------------------------------- /python/05-histogramize-shadow.py: -------------------------------------------------------------------------------- 1 | ############################ 2 | """ 3 | 16.6.2020 modified by MY 4 | 27.8.2021 remove normalization of histograms, do normalization later. 5 | 24.9.2021 changed to_csv to save in UNIX forma (not DOS) 6 | 20.11.2021 removed if below (or over) bin range, set 1st (or last) value 1, others 0. 7 | 8 | Based on Samatha Wittke's code histogramize.py for EODIE: 9 | input array csvs from main program, each line representing one field, first number being ID of the field 10 | output similar csv with each line representing one field, first number being ID of field, followed by x = bins numbers representing histogram values 11 | 12 | 13 | RUN: python histogramize-shadow.py -i input -o output -b B8A -n nrbins -l 2 -h 2000 14 | 15 | WHERE: 16 | i: input dir 17 | o: output dir 18 | b: band ID 19 | n: number of bins in histogram 20 | l: lower limit of range in histogram 21 | h: upper limit of range in histogram 22 | 23 | """ 24 | ###################### 25 | 26 | 27 | import os 28 | import numpy as np 29 | import sys 30 | import csv 31 | import os 32 | 33 | import argparse 34 | import textwrap 35 | import pathlib 36 | 37 | 38 | def to_csv(csvfile, myarray): 39 | 40 | csvfile = csvfile.replace('array_','histogram_') 41 | #with open(csvfile, "w") as f: 42 | with open(csvfile, "w", newline='') as f: 43 | writer = csv.writer(f, lineterminator=os.linesep) 44 | writer.writerows(myarray) 45 | 46 | def make_histogram(inarray,bin_seq): 47 | 48 | histo1, _ = np.histogram(inarray, bin_seq, density=False) 49 | return histo1 50 | 51 | def main(args): 52 | try: 53 | if not args.inputpath or not args.band: 54 | raise Exception('Missing input or output dir argument or band number (e.g. B8A). 
Try --help .') 55 | 56 | print(f'\n\nhistogramize-shadow.py') 57 | print(f'\nInput files in {args.inputpath}') 58 | print(f'Band: {args.band}') 59 | out_dir_path = pathlib.Path(os.path.expanduser(args.outdir)) 60 | out_dir_path.mkdir(parents=True, exist_ok=True) 61 | 62 | datadir = args.inputpath 63 | band = args.band 64 | 65 | bin_seq = np.linspace(args.minimum,args.maximum,args.nrbins+1) 66 | 67 | #print('Reading arrayfiles...') 68 | 69 | for arrayfile in os.listdir(datadir): 70 | if arrayfile.endswith(band + '.csv') and arrayfile.startswith('array_'): 71 | #print(arrayfile) 72 | histlist = [] 73 | arraypath = os.path.join(datadir,arrayfile) 74 | outputpath = os.path.join(out_dir_path,arrayfile) 75 | with open(arraypath, "r") as f: 76 | reader = csv.reader(f) 77 | for line in reader: 78 | myid = [line[0]] 79 | #if myid: 80 | line = [int(elem) for elem in line if not '_' in elem] 81 | #print(line) 82 | #if min(line) >= args.maximum: 83 | # #hist = [float(0)]*(args.nrbins-1); hist.append(float(1)) 84 | # hist = [0]*(args.nrbins-1); hist.append(1) 85 | #elif max(line) <= args.minimum: 86 | # hist = [1]; hist.extend([0]*(args.nrbins-1)) 87 | #else: 88 | # hist = make_histogram(line, bin_seq) 89 | hist = make_histogram(line, bin_seq) 90 | #print(hist) 91 | myid.extend(hist) 92 | hist2 = myid 93 | #print(hist2) 94 | histlist.append(hist2) 95 | #print(histlist) 96 | 97 | to_csv(outputpath,histlist) 98 | 99 | except Exception as e: 100 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 101 | parser.print_help() 102 | raise e 103 | 104 | if __name__ == '__main__': 105 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 106 | epilog=textwrap.dedent(__doc__)) 107 | parser.add_argument('-i', '--inputpath', 108 | type=str, 109 | help='Path to the directory with array csv files.', 110 | default='.') 111 | parser.add_argument('-n', '--nrbins', 112 | type=int, 113 | default=16, 114 | help='Number of bins.') 115 | parser.add_argument('-b', '--band', 116 | help='Band number (e.g. B02)', 117 | type=str) 118 | parser.add_argument('-l', '--minimum', 119 | help='The lower range of the bins.', 120 | type=int) 121 | parser.add_argument('-u', '--maximum', 122 | help='The upper range of the bins.', 123 | type=int) 124 | parser.add_argument('-o', '--outdir', 125 | type=str, 126 | help='Name of the output directory.', 127 | default='.') 128 | 129 | args = parser.parse_args() 130 | main(args) 131 | -------------------------------------------------------------------------------- /python/05-medianize.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2021-12-01 3 | 4 | python 05-medianize.py -i input -o output 5 | 6 | """ 7 | 8 | 9 | import os 10 | import numpy as np 11 | import sys 12 | import csv 13 | import os 14 | 15 | import argparse 16 | import textwrap 17 | import pathlib 18 | 19 | 20 | def to_csv(csvfile, myarray): 21 | 22 | csvfile = csvfile.replace('array_','median_') 23 | #with open(csvfile, "w") as f: 24 | with open(csvfile, "w", newline='') as f: 25 | writer = csv.writer(f, lineterminator=os.linesep) 26 | writer.writerows(myarray) 27 | 28 | def main(args): 29 | try: 30 | if not args.inputpath: 31 | raise Exception('Missing input or output dir argument. 
Try --help .') 32 | 33 | print(f'\n\n05-medianize.py') 34 | print(f'\nInput files in {args.inputpath}') 35 | 36 | out_dir_path = pathlib.Path(os.path.expanduser(args.outdir)) 37 | out_dir_path.mkdir(parents=True, exist_ok=True) 38 | 39 | datadir = args.inputpath 40 | 41 | 42 | #print('Reading arrayfiles...') 43 | 44 | for arrayfile in os.listdir(datadir): 45 | if arrayfile.startswith('array_'): 46 | #print(arrayfile) 47 | lista = [] 48 | arraypath = os.path.join(datadir,arrayfile) 49 | outputpath = os.path.join(out_dir_path,arrayfile) 50 | with open(arraypath, "r") as f: 51 | reader = csv.reader(f) 52 | 53 | for line in reader: 54 | myid = [line[0]] 55 | #if myid: 56 | line = [int(elem) for elem in line if not '_' in elem] 57 | median = np.median(line) 58 | myid.extend([int(median)]) 59 | median2 = myid 60 | lista.append(median2) 61 | 62 | if lista: 63 | to_csv(outputpath,lista) 64 | 65 | except Exception as e: 66 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 67 | parser.print_help() 68 | raise e 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 72 | epilog=textwrap.dedent(__doc__)) 73 | parser.add_argument('-i', '--inputpath', 74 | type=str, 75 | help='Path to the directory with array csv files.', 76 | default='.') 77 | 78 | parser.add_argument('-o', '--outdir', 79 | type=str, 80 | help='Name of the output directory.', 81 | default='.') 82 | 83 | args = parser.parse_args() 84 | main(args) 85 | 86 | 87 | -------------------------------------------------------------------------------- /python/06-histo2stack.py: -------------------------------------------------------------------------------- 1 | """ 2 | 17.8.2020 MY 3 | 23.10.2020 no more features as tuples but as array. 4 | 19.8.2021 modified to save into dataStack_annuals (instead of dataStack_temp). 5 | 31.8.2021 added option to use tempdir 6 | 7 | Make histo-files into annual dataframes. Saves into outputdir_annuals. 8 | 9 | createMissingFiles() checks if all 10 bands exists per observation (farm). If not, makes a copy of any band from the same doy and sets all values to zero. 10 | 11 | getAttributesFromFilename() adds tile-, DOY ja band information from filename to data. 12 | 13 | mergeAllGetNumpyArrays() makes one big dataframe for one year. Save to outputdir_annuals. 14 | 15 | testing(outputfile) tests if output file is ok. 16 | 17 | RUN: 18 | 19 | python 06-histo2stack.py -i histo_test1110_2016 -n 32 -o dataStack -f test1110_2016.pkl -t TEMPDIRPATH 20 | 21 | After this into 07-stack2ARD.py. 
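
The annual dataframe is pickled into <outdir>_annual/<outfile>. A minimal sketch for
inspecting it afterwards (paths follow the RUN example above; adapt to your own setup):

    import pandas as pd

    df = pd.read_pickle('dataStack_annual/test1110_2016.pkl')
    # columns: farmID, band, doy, bin1 ... binN
    print(df[['farmID', 'band', 'doy']].head())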
22 | 23 | """ 24 | 25 | import os 26 | import pandas as pd 27 | import numpy as np 28 | import pickle 29 | 30 | from pathlib import Path 31 | 32 | import argparse 33 | import textwrap 34 | from datetime import datetime 35 | 36 | 37 | ###### FUNCTIONS: 38 | 39 | def load_intensities(filename): 40 | with open(filename, "rb") as f: 41 | data = pickle.load(f) 42 | return data 43 | 44 | def save_intensities(filename, arrayvalues): 45 | with open(filename, 'wb+') as outputfile: 46 | pickle.dump(arrayvalues, outputfile) 47 | 48 | def createMissingFiles(datadir): 49 | # List all files 50 | list_of_files = os.listdir(datadir) 51 | 52 | # histogram_35VNL_20200830_B8A.csv 53 | # This removes the .csv and splits the name to three parts 54 | list_of_filename_parts = [i.replace(".csv","").split("_") for i in list_of_files] 55 | 56 | # Makes a df of all filenames 57 | df = pd.DataFrame(list_of_filename_parts, columns=['histo','tile','date','band']) 58 | #print(df.head()) 59 | 60 | # Group and iterate by date, see if bands are missing 61 | grouped_df = df.groupby(['date', 'tile']) 62 | 63 | # Bands as text that should exist 64 | bands = ['B02','B03','B04','B05','B06','B07','B08','B8A','B11','B12'] 65 | 66 | # Iterate 67 | for name, date_group in grouped_df: 68 | #print(name[1]) 69 | existing_bands = list(date_group['band']) 70 | for band in bands: 71 | if band not in existing_bands: 72 | # Band is missing create a mockup dataframe and save 73 | print(f"For date {name} band {band} is missing!") 74 | 75 | ### Copy from existing band, same date, set all values to 0 (or np.nan) 76 | 77 | temp_filename = os.path.join(datadir,"histogram_" + name[1] + "_" + name[0] + "_" + existing_bands[0] + ".csv") 78 | #print(temp_filename) 79 | dftemp = pd.read_csv(temp_filename, encoding='utf-8', header = None) 80 | #print(dftemp.iloc[:, 1:]) 81 | dftemp.iloc[:,1:] = 0 82 | #print(dftemp) 83 | 84 | output_filename = os.path.join(datadir,"histogram_" + name[1] + "_" + name[0] + "_" + band + ".csv") 85 | print(f"Saving a new file named {output_filename}") 86 | dftemp.to_csv(output_filename,encoding='utf-8',index=False, header=False) 87 | 88 | def getAttributesFromFilename(datadir, data_folder2): 89 | ### Add date and band to every file as columns 90 | 91 | # Loop files in data_folder 92 | for filename in os.listdir(datadir): 93 | if filename.endswith('.csv') and filename.startswith('histogram_'): 94 | #print(filename) 95 | try: 96 | df = pd.read_csv(os.path.join(datadir,filename), encoding='utf-8', header = None) 97 | except pd.errors.EmptyDataError: 98 | print(f'{os.path.join(datadir,filename)} was empty. 
Skipping.') 99 | continue 100 | # Add tile, band and date from filename to columns 101 | df['tile'] = filename.split("_")[1] 102 | pvm = filename.split("_")[2] 103 | df['doy'] = datetime.strptime(pvm, '%Y%m%d').timetuple().tm_yday 104 | #print(doy) 105 | df['band'] = filename.split("_")[3].replace(".csv","") 106 | #print(band) 107 | 108 | ### Write to data_folder2 109 | df.to_csv(os.path.join(data_folder2,filename), encoding='utf-8',index=False, header=False) 110 | 111 | def mergeAllGetNumpyArrays(data_folder2, data_folder3, bins, outputfile): 112 | ### Merge all files to one big dataframe 113 | 114 | df_array = [] 115 | 116 | ### Read files to pandas, add the dataframes to the array 117 | for filename in os.listdir(data_folder2): 118 | df = pd.read_csv(os.path.join(data_folder2,filename), encoding="utf-8", header=None) 119 | df.rename(columns={(bins + 1): 'tile', (bins + 2): 'doy', (bins + 3): 'band'}, inplace=True) 120 | try: 121 | df['farmID'] = df[0] + '_' + df['tile'] 122 | except Exception as e: 123 | print(f'\n\nThere is something wrong with file {os.path.join(data_folder2,filename)}...') 124 | print('Check that you have set the right number of bins!') 125 | raise e 126 | old_names = df.columns.tolist()[1:bins+1] 127 | new_names = [] 128 | for bin in range(bins): 129 | new_names.append("bin" + str(bin+1)) 130 | 131 | df = df.rename(columns=dict(zip(old_names, new_names))) 132 | df = df.drop(0, axis = 1) 133 | df = df[['farmID', 'band','doy', *df.columns[df.columns.str.startswith("bin")]]] 134 | df_array.append(df) 135 | 136 | ### Make a big dataframe out of the list of dataframes 137 | all_files_df = pd.concat(df_array) 138 | ### And save to temp: 139 | save_intensities(os.path.join(data_folder3,outputfile), all_files_df) 140 | 141 | return all_files_df 142 | 143 | def addDOYrank(all_files_df, out_dir_path, outputfile): 144 | #print(all_files_df.head()) 145 | days = all_files_df.doy.sort_values().unique() 146 | days_dict = dict(zip(days, range(len(days)))) 147 | print(days_dict) 148 | all_files_df2 = all_files_df 149 | return all_files_df2 150 | 151 | def testing(all_files_df, out_dir_path, outputfile): 152 | print("Output written to file: ", outputfile) 153 | 154 | tmp2 = all_files_df.groupby(['doy', 'farmID']).count()#.unstack().fillna(0) 155 | 156 | if tmp2[tmp2.band != 10]['band'].any(): 157 | print("Some bands missing!") 158 | else: 159 | print("All farms have full 10 bands!") 160 | 161 | # kuinka monta tilaa mukana? 162 | print("How many farms are observed from one or several S2 granules?:", len(all_files_df[['farmID']].drop_duplicates())) 163 | 164 | # kuinka monta tilaa mukana oikeasti? 165 | farmIDs = all_files_df['farmID'].str.rsplit('_',1).str[0] 166 | print("How many farms we really have?: ", len(farmIDs.drop_duplicates())) 167 | 168 | # Kuinka monta havaintoa per tila koko kesältä, mediaani? 169 | print("How many observations per farm in one season (median)?: ", float(all_files_df[['farmID', 'doy']].drop_duplicates().groupby(['farmID']).count().median())) 170 | 171 | # kuinka monta havaintoa per päivä, mediaani? 172 | print("How many observations per day (median)?: ", float(all_files_df[['farmID', 'doy']].drop_duplicates().groupby(['doy']).count().median())) 173 | 174 | 175 | def main(args): 176 | 177 | try: 178 | if not args.inputpath or not args.outdir: 179 | raise Exception('Missing input or output dir argument or bin number (e.g. 32). 
Try --help .') 180 | 181 | print(f'\n\nhisto2stack.py') 182 | print(f'\nInput files in {args.inputpath}') 183 | print(f'Bins: {args.bins}') 184 | out_dir_path = Path(os.path.expanduser(args.outdir)) 185 | out_dir_path.mkdir(parents=True, exist_ok=True) 186 | 187 | datadir = args.inputpath 188 | bins = args.bins 189 | outputfile = args.outfile 190 | 191 | # temp directory for annual histograms: 192 | data_folder2 = args.tmpdir 193 | Path(data_folder2).mkdir(parents=True, exist_ok=True) 194 | 195 | # directory for annual dataframes: 196 | data_folder3 = args.outdir + "_annual" 197 | Path(data_folder3).mkdir(parents=True, exist_ok=True) 198 | 199 | 200 | createMissingFiles(datadir) 201 | getAttributesFromFilename(datadir, data_folder2) 202 | 203 | # tämä tekee jo varsinaisen osuuden: 204 | all_files_df = mergeAllGetNumpyArrays(data_folder2, data_folder3, bins, outputfile) 205 | 206 | # loput on testausta: 207 | all_files_df = load_intensities(os.path.join(data_folder3,outputfile)) 208 | all_files_df = addDOYrank(all_files_df, out_dir_path, outputfile) 209 | testing(all_files_df, out_dir_path, outputfile) 210 | 211 | 212 | except Exception as e: 213 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 214 | parser.print_help() 215 | raise e 216 | 217 | if __name__ == '__main__': 218 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 219 | epilog=textwrap.dedent(__doc__)) 220 | parser.add_argument('-i', '--inputpath', 221 | type=str, 222 | help='Path to the directory with histogram csv files.', 223 | default='.') 224 | parser.add_argument('-n', '--bins', 225 | type=int, 226 | default=16, 227 | help='Number of bins.') 228 | parser.add_argument('-o', '--outdir', 229 | type=str, 230 | help='Name of the output directory.', 231 | default='.') 232 | parser.add_argument('-f', '--outfile', 233 | type=str, 234 | help='Name of the output file.', 235 | default='.') 236 | parser.add_argument('-t', '--tmpdir', 237 | type=str, 238 | help='Name of the temp directory.', 239 | default='.') 240 | args = parser.parse_args() 241 | main(args) 242 | 243 | 244 | -------------------------------------------------------------------------------- /python/06-median2stack.py: -------------------------------------------------------------------------------- 1 | """ 2 | 1.12.2021 MY 3 | 4 | Make histo-files into annual dataframes. Saves into outputdir_annuals. 5 | 6 | createMissingFiles() checks if all 10 bands exists per observation (farm). If not, makes a copy of any band from the same doy and sets all values to zero. 7 | 8 | getAttributesFromFilename() adds tile-, DOY ja band information from filename to data. 9 | 10 | mergeAllGetNumpyArrays() makes one big dataframe for one year. Save to outputdir_annuals. 11 | 12 | testing(outputfile) tests if output file is ok. 13 | 14 | RUN: 15 | 16 | python 06-median2stack.py -i median_test1110_2016 -o medianStack -f test1110_2016.pkl -t TEMPDIRPATH 17 | 18 | After this into 07-stack2ARD.py. 
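
Input files are the per-date, per-band CSVs written by 05-medianize.py, one row per parcel:
the parcel ID followed by the band median, e.g. (hypothetical values):

    median_35VNL_20200830_B8A.csv:
        1234567,1834
        1234568,2011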
19 | 20 | """ 21 | 22 | import os 23 | import pandas as pd 24 | import numpy as np 25 | import pickle 26 | 27 | from pathlib import Path 28 | 29 | import argparse 30 | import textwrap 31 | from datetime import datetime 32 | 33 | 34 | ###### FUNCTIONS: 35 | 36 | def load_intensities(filename): 37 | with open(filename, "rb") as f: 38 | data = pickle.load(f) 39 | return data 40 | 41 | def save_intensities(filename, arrayvalues): 42 | with open(filename, 'wb+') as outputfile: 43 | pickle.dump(arrayvalues, outputfile) 44 | 45 | def createMissingFiles(datadir): 46 | # List all files 47 | list_of_files = os.listdir(datadir) 48 | 49 | # median_35VNL_20200830_B8A.csv 50 | # This removes the .csv and splits the name to three parts 51 | list_of_filename_parts = [i.replace(".csv","").split("_") for i in list_of_files] 52 | 53 | # Makes a df of all filenames 54 | df = pd.DataFrame(list_of_filename_parts, columns=['histo','tile','date','band']) 55 | #print(df.head()) 56 | 57 | # Group and iterate by date, see if bands are missing 58 | grouped_df = df.groupby(['date', 'tile']) 59 | 60 | # Bands as text that should exist 61 | bands = ['B02','B03','B04','B05','B06','B07','B08','B8A','B11','B12'] 62 | 63 | # Iterate 64 | for name, date_group in grouped_df: 65 | #print(name[1]) 66 | existing_bands = list(date_group['band']) 67 | for band in bands: 68 | if band not in existing_bands: 69 | # Band is missing create a mockup dataframe and save 70 | print(f"For date {name} band {band} is missing!") 71 | 72 | ### Copy from existing band, same date, set all values to 0 (or np.nan) 73 | 74 | temp_filename = os.path.join(datadir,"median_" + name[1] + "_" + name[0] + "_" + existing_bands[0] + ".csv") 75 | #print(temp_filename) 76 | dftemp = pd.read_csv(temp_filename, encoding='utf-8', header = None) 77 | #print(dftemp.iloc[:, 1:]) 78 | dftemp.iloc[:,1:] = 0 79 | #print(dftemp) 80 | 81 | output_filename = os.path.join(datadir,"median_" + name[1] + "_" + name[0] + "_" + band + ".csv") 82 | print(f"Saving a new file named {output_filename}") 83 | dftemp.to_csv(output_filename,encoding='utf-8',index=False, header=False) 84 | 85 | def getAttributesFromFilename(datadir, data_folder2): 86 | ### Add date and band to every file as columns 87 | 88 | # Loop files in data_folder 89 | for filename in os.listdir(datadir): 90 | if filename.endswith('.csv') and filename.startswith('median_'): 91 | #print(filename) 92 | try: 93 | df = pd.read_csv(os.path.join(datadir,filename), encoding='utf-8', header = None) 94 | except pd.errors.EmptyDataError: 95 | print(f'{os.path.join(datadir,filename)} was empty. 
Skipping.') 96 | continue 97 | # Add tile, band and date from filename to columns 98 | df['tile'] = filename.split("_")[1] 99 | pvm = filename.split("_")[2] 100 | df['doy'] = datetime.strptime(pvm, '%Y%m%d').timetuple().tm_yday 101 | #print(doy) 102 | df['band'] = filename.split("_")[3].replace(".csv","") 103 | #print(band) 104 | 105 | ### Write to data_folder2 106 | df.to_csv(os.path.join(data_folder2,filename), encoding='utf-8',index=False, header=False) 107 | 108 | def mergeAllGetNumpyArrays(data_folder2, data_folder3, bins, outputfile): 109 | ### Merge all files to one big dataframe 110 | 111 | df_array = [] 112 | 113 | ### Read files to pandas, add the dataframes to the array 114 | for filename in os.listdir(data_folder2): 115 | df = pd.read_csv(os.path.join(data_folder2,filename), encoding="utf-8", header=None) 116 | df.rename(columns={(bins + 1): 'tile', (bins + 2): 'doy', (bins + 3): 'band'}, inplace=True) 117 | try: 118 | df['farmID'] = df[0] + '_' + df['tile'] 119 | except Exception as e: 120 | print(f'\n\nThere is something wrong with file {os.path.join(data_folder2,filename)}...') 121 | print('Check that you have set the right number of bins!') 122 | raise e 123 | old_names = df.columns.tolist()[1:bins+1] 124 | new_names = [] 125 | for bin in range(bins): 126 | new_names.append("bin" + str(bin+1)) 127 | 128 | df = df.rename(columns=dict(zip(old_names, new_names))) 129 | df = df.drop(0, axis = 1) 130 | df = df[['farmID', 'band','doy', *df.columns[df.columns.str.startswith("bin")]]] 131 | df_array.append(df) 132 | 133 | ### Make a big dataframe out of the list of dataframes 134 | all_files_df = pd.concat(df_array) 135 | ### And save to temp: 136 | save_intensities(os.path.join(data_folder3,outputfile), all_files_df) 137 | 138 | return all_files_df 139 | 140 | def addDOYrank(all_files_df, out_dir_path, outputfile): 141 | #print(all_files_df.head()) 142 | days = all_files_df.doy.sort_values().unique() 143 | days_dict = dict(zip(days, range(len(days)))) 144 | print(days_dict) 145 | all_files_df2 = all_files_df 146 | return all_files_df2 147 | 148 | def testing(all_files_df, out_dir_path, outputfile): 149 | print("Output written to file: ", outputfile) 150 | 151 | tmp2 = all_files_df.groupby(['doy', 'farmID']).count()#.unstack().fillna(0) 152 | 153 | if tmp2[tmp2.band != 10]['band'].any(): 154 | print("Some bands missing!") 155 | else: 156 | print("All farms have full 10 bands!") 157 | 158 | # kuinka monta tilaa mukana? 159 | print("How many farms are observed from one or several S2 granules?:", len(all_files_df[['farmID']].drop_duplicates())) 160 | 161 | # kuinka monta tilaa mukana oikeasti? 162 | farmIDs = all_files_df['farmID'].str.rsplit('_',1).str[0] 163 | print("How many farms we really have?: ", len(farmIDs.drop_duplicates())) 164 | 165 | # Kuinka monta havaintoa per tila koko kesältä, mediaani? 166 | print("How many observations per farm in one season (median)?: ", float(all_files_df[['farmID', 'doy']].drop_duplicates().groupby(['farmID']).count().median())) 167 | 168 | # kuinka monta havaintoa per päivä, mediaani? 169 | print("How many observations per day (median)?: ", float(all_files_df[['farmID', 'doy']].drop_duplicates().groupby(['doy']).count().median())) 170 | 171 | 172 | def main(args): 173 | 174 | try: 175 | if not args.inputpath or not args.outdir: 176 | raise Exception('Missing input or output dir argument. 
Try --help .') 177 | 178 | print(f'\n\n06-median2stack.py') 179 | print(f'\nInput files in {args.inputpath}') 180 | 181 | out_dir_path = Path(os.path.expanduser(args.outdir)) 182 | out_dir_path.mkdir(parents=True, exist_ok=True) 183 | 184 | datadir = args.inputpath 185 | outputfile = args.outfile 186 | 187 | bins = 1 # vain yksi feature eli mediaani 188 | 189 | # temp directory for annual medians: 190 | data_folder2 = args.tmpdir 191 | Path(data_folder2).mkdir(parents=True, exist_ok=True) 192 | 193 | # directory for annual dataframes: 194 | data_folder3 = args.outdir + "_annual" 195 | Path(data_folder3).mkdir(parents=True, exist_ok=True) 196 | 197 | 198 | createMissingFiles(datadir) 199 | getAttributesFromFilename(datadir, data_folder2) 200 | 201 | # tämä tekee jo varsinaisen osuuden: 202 | all_files_df = mergeAllGetNumpyArrays(data_folder2, data_folder3, bins, outputfile) 203 | 204 | # loput on testausta: 205 | all_files_df = load_intensities(os.path.join(data_folder3,outputfile)) 206 | all_files_df = addDOYrank(all_files_df, out_dir_path, outputfile) 207 | testing(all_files_df, out_dir_path, outputfile) 208 | 209 | 210 | except Exception as e: 211 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 212 | parser.print_help() 213 | raise e 214 | 215 | if __name__ == '__main__': 216 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 217 | epilog=textwrap.dedent(__doc__)) 218 | parser.add_argument('-i', '--inputpath', 219 | type=str, 220 | help='Path to the directory with median csv files.', 221 | default='.') 222 | 223 | parser.add_argument('-o', '--outdir', 224 | type=str, 225 | help='Name of the output directory.', 226 | default='.') 227 | parser.add_argument('-f', '--outfile', 228 | type=str, 229 | help='Name of the output file.', 230 | default='.') 231 | parser.add_argument('-t', '--tmpdir', 232 | type=str, 233 | help='Name of the temp directory.', 234 | default='.') 235 | args = parser.parse_args() 236 | main(args) 237 | 238 | 239 | 240 | -------------------------------------------------------------------------------- /python/07-medianstack2ARD.py: -------------------------------------------------------------------------------- 1 | """ 2 | 1.12.2021 3 | 4 | Combine annual stack-files into one array stack. 5 | 6 | combineAllYears() reads all annuals into one big dataframe. 7 | 8 | reshapeAndSave() pivots the dataframe by farmID and doy, converts to numpy array, fills with na (-> not ragged) and reshapes into 3D. Saves array and farmIDs into separate files. 9 | 10 | RUN: 11 | 12 | python 07-medianstack2ARD.py -i medianStack_annual -o medianStack/ -f 1400 -y 2018 2019 13 | 14 | After this into 08-mergeTarget.py. 
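
reshapeAndSave() writes a compressed NumPy array plus the matching farm IDs. A minimal sketch
for loading both outputs (file names follow the RUN example above; adapt to your own setup):

    import pickle
    import numpy as np

    X = np.load('medianStack/array_1400-2018-2019.npz')['arr_0']  # shape: (farms, doys, bands * bins)
    with open('medianStack/farmID_1400-2018-2019.pkl', 'rb') as f:
        farm_ids = pickle.load(f)
    assert X.shape[0] == len(farm_ids)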
15 | 16 | """ 17 | import glob 18 | import os 19 | import pandas as pd 20 | import numpy as np 21 | import pickle 22 | 23 | from pathlib import Path 24 | 25 | import argparse 26 | import textwrap 27 | from datetime import datetime 28 | 29 | 30 | ###### FUNCTIONS: 31 | 32 | def load_intensities(filename): 33 | with open(filename, "rb") as f: 34 | data = pickle.load(f) 35 | return data 36 | 37 | def save_intensities(filename, arrayvalues): 38 | with open(filename, 'wb+') as outputfile: 39 | pickle.dump(arrayvalues, outputfile) 40 | 41 | def combineAllYears(data_folder3, setti, years): 42 | # read files in inputdir: 43 | s = pd.Series(glob.glob(data_folder3 + '/*.pkl')) 44 | 45 | filepaths = [] 46 | 47 | for filename in s: 48 | for keyword1 in years: 49 | if keyword1 in filename: 50 | for keyword2 in setti: 51 | if keyword2 in filename: 52 | #print(filename) 53 | filepaths.append(filename) 54 | #print(filepaths) 55 | # open all chosen years into one dataframe: 56 | allyears = pd.concat(map(pd.read_pickle, filepaths), sort=False) 57 | return allyears 58 | 59 | def reshapeAndSave(full_array_stack, out_dir_path, outputfile, rank): 60 | # reshape and save data to 3D: 61 | print(f"\nLength of the data stack dataframe: {len(full_array_stack)}") 62 | 63 | if rank: 64 | dateVar = 'doyid' 65 | else: 66 | dateVar = 'doy' 67 | 68 | full_array_stack['doyid'] = full_array_stack.groupby(['farmID', 'band'])['doy'].rank(method="first", ascending=True).astype('int') 69 | 70 | #print(full_array_stack.sort_values(['farmID', 'doy']).tail(15)) 71 | 72 | # printtaa esimerkkitila: 73 | #tmp = full_array_stack[full_array_stack['farmID'] == '2019_12026885_35VMH'][['farmID', 'doy', 'band', 'doyid']] 74 | #print(tmp.sort_values(['doy', 'band'])) 75 | 76 | # printtaa sellaiset, joilla bin1 on 1: 77 | #print(len(full_array_stack[full_array_stack['bin1'] == 1])) 78 | # printtaa sellaiset, joilla bin32 on 1: 79 | #print(len(full_array_stack[full_array_stack['bin32'] == 1])) 80 | 81 | # printtaa sellaiset, joiden rivisumma ei ole 1: 82 | #print(full_array_stack[full_array_stack.drop(['farmID', 'doy', 'band', 'doyid'], axis = 1).sum(axis = 1) != 1]) 83 | 84 | # printtaa näiden rivisummat: 85 | #tmp = full_array_stack[full_array_stack.drop(['farmID', 'doy', 'band', 'doyid'], axis = 1).sum(axis = 1) < 1] 86 | #print(len(tmp)) # jotain pyöristysvirhettä ehkäpä vain 87 | #print(tmp.drop(['farmID', 'doy', 'band', 'doyid'], axis = 1).sum(axis = 1)) 88 | 89 | # Predictions to compare with forecasts: 15.6. eli DOY 166, that is pythonic 165. 90 | # and 15.7. eli DOY 196 91 | # and 15.8. eli DOY 227 92 | # and the last DOY 243 -> the final state 93 | 94 | #june = full_array_stack[full_array_stack['doy'] <= 165] 95 | #print(june.sort_values(['doy', 'band']).tail(20)) 96 | #print(june['doyid'].value_counts()) 97 | 98 | #july = full_array_stack[full_array_stack['doy'] <= 195] 99 | #august = full_array_stack[full_array_stack['doy'] <= 226] 100 | 101 | 102 | final = full_array_stack 103 | #print(final['doyid'].value_counts()) 104 | 105 | # Kuinka monta havaintoa per tila koko kesältä, mediaani? 106 | print("How many observations per farm in one season (median)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().median())) 107 | # Kuinka monta havaintoa per tila koko kesältä, max? 108 | print("How many observations per farm in one season (max)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().max())) 109 | # Kuinka monta havaintoa per tila koko kesältä, min? 
110 | print("How many observations per farm in one season (min)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().min())) 111 | 112 | # koko kausi: 113 | farms = final.farmID.nunique() 114 | doys = final[dateVar].nunique() 115 | bands = 10 116 | bins = 1 # nyt vain yksi feature eli median 117 | pivoted = final.pivot(index=['farmID', dateVar], columns='band', values=[*final.columns[final.columns.str.startswith('bin')]]) 118 | m = pd.MultiIndex.from_product([pivoted.index.get_level_values(0).unique(), pivoted.index.get_level_values(1).sort_values().unique()], names=pivoted.index.names) 119 | pt = pivoted.reindex(m, fill_value = 0) 120 | finalfinal = pt.to_numpy().reshape(farms, doys, bins, bands).swapaxes(2,3).reshape(farms,doys,bands*bins) 121 | 122 | outputfile2 = 'array_' + outputfile 123 | fp = os.path.join(out_dir_path, outputfile2) 124 | 125 | print(f"Shape of the 3D stack dataframe: {finalfinal.shape}") 126 | print(f"Output into file: {fp}") 127 | np.savez_compressed(fp, finalfinal) 128 | #save_intensities(fp, finalfinal) 129 | 130 | # save farmIDs for later merging with target y: 131 | farmIDs = pt.index.get_level_values(0).unique().str.rsplit('_',1).str[0].values 132 | print(f"\n\nNumber of farms: {len(farmIDs)}") 133 | outputfile2 = 'farmID_' + outputfile + '.pkl' 134 | fp = os.path.join(out_dir_path, outputfile2) 135 | print(f"Output farmIDs in file: {fp}") 136 | save_intensities(fp, farmIDs) 137 | 138 | 139 | 140 | def main(args): 141 | 142 | try: 143 | if not args.outdir or not args.setti: 144 | raise Exception('Missing output dir argument or dataset label (e.g. test1110). Try --help .') 145 | 146 | print(f'\n\nstack2ARD.py') 147 | print(f'\nInput files in {args.inputdir}') 148 | 149 | # directory for input, i.e. annual results: 150 | data_folder3 = args.inputdir 151 | 152 | # directory for outputs: 153 | out_dir_path = args.outdir 154 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 155 | 156 | # years: 157 | years = args.ylist 158 | setti = args.setti 159 | 160 | # outputfilename: 161 | #outputfile = '-'.join(setti) + '-' + '-'.join(years) + '.pkl' 162 | outputfile = '-'.join(setti) + '-' + '-'.join(years) 163 | 164 | 165 | 166 | print("\nPresuming preprocessing done earlier. If not done previously, please, run with histo2stack.py first!") 167 | 168 | print("\nCombining the years and data sets...") 169 | allyears = combineAllYears(data_folder3, setti, years) 170 | reshapeAndSave(allyears, out_dir_path, outputfile, args.rank) 171 | 172 | 173 | except Exception as e: 174 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 175 | parser.print_help() 176 | raise e 177 | 178 | if __name__ == '__main__': 179 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 180 | epilog=textwrap.dedent(__doc__)) 181 | 182 | parser.add_argument('-i', '--inputdir', 183 | type=str, 184 | help='Name of the input directory (where annual histogram dataframes are).', 185 | default='.') 186 | parser.add_argument('-o', '--outdir', 187 | type=str, 188 | help='Name of the output directory.', 189 | default='.') 190 | # is not true: cannot combine multiple data sets (crops), because farmID does not hold crop information -> duplicated farmIDs 191 | parser.add_argument('-f', '--setti', action='store', dest='setti', 192 | type=str, nargs='*', default=['1400'], 193 | help='Name of the data set. Can be also multiple. E.g. 
-f 1310 1320.') 194 | #parser.add_argument('-f', '--setti', 195 | # type=str, 196 | # default=['1400'], 197 | # help='Name of the data set. E.g. -f 1310.') 198 | parser.add_argument('-y', '--years', action='store', dest='ylist', 199 | type=str, nargs='*', default=['2018', '2019', '2020', '2021'], 200 | help="Optionally e.g. -y 2018 2019, default all") 201 | 202 | parser.add_argument('-r', '--rank', 203 | help='If saving time series by rank of days.', 204 | default=False, 205 | action='store_true') 206 | 207 | args = parser.parse_args() 208 | main(args) 209 | 210 | 211 | 212 | -------------------------------------------------------------------------------- /python/07-stack2ARD.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | MY 23.10.2020 4 | 5 | Combine annual stack-files into one array stack. 6 | 7 | combineAllYears() reads all annuals into one big dataframe. 8 | 9 | reshapeAndSave() pivots the dataframe by farmID and doy, converts to numpy array, fills with na (-> not ragged) and reshapes into 3D. Saves array and farmIDs into separate files. 10 | 11 | RUN: 12 | 13 | python 07-stack2ARD.py -i dataStack_annual -o dataStack/ -f 1400 -y 2018 2019 14 | 15 | After this into 08-mergeTarget.py. 16 | 17 | """ 18 | import glob 19 | import os 20 | import pandas as pd 21 | import numpy as np 22 | import pickle 23 | 24 | from pathlib import Path 25 | 26 | import argparse 27 | import textwrap 28 | from datetime import datetime 29 | 30 | 31 | ###### FUNCTIONS: 32 | 33 | def load_intensities(filename): 34 | with open(filename, "rb") as f: 35 | data = pickle.load(f) 36 | return data 37 | 38 | def save_intensities(filename, arrayvalues): 39 | with open(filename, 'wb+') as outputfile: 40 | pickle.dump(arrayvalues, outputfile) 41 | 42 | def combineAllYears(data_folder3, setti, years): 43 | # read files in inputdir: 44 | s = pd.Series(glob.glob(data_folder3 + '/*.pkl')) 45 | 46 | filepaths = [] 47 | 48 | for filename in s: 49 | for keyword1 in years: 50 | if keyword1 in filename: 51 | for keyword2 in setti: 52 | if keyword2 in filename: 53 | #print(filename) 54 | filepaths.append(filename) 55 | #print(filepaths) 56 | # open all chosen years into one dataframe: 57 | allyears = pd.concat(map(pd.read_pickle, filepaths), sort=False) 58 | return allyears 59 | 60 | def reshapeAndSave(full_array_stack, out_dir_path, outputfile, rank): 61 | # reshape and save data to 3D: 62 | print(f"\nLength of the data stack dataframe: {len(full_array_stack)}") 63 | 64 | if rank: 65 | dateVar = 'doyid' 66 | else: 67 | dateVar = 'doy' 68 | 69 | full_array_stack['doyid'] = full_array_stack.groupby(['farmID', 'band'])['doy'].rank(method="first", ascending=True).astype('int') 70 | 71 | #print(full_array_stack.sort_values(['farmID', 'doy']).tail(15)) 72 | 73 | # printtaa esimerkkitila: 74 | #tmp = full_array_stack[full_array_stack['farmID'] == '2019_12026885_35VMH'][['farmID', 'doy', 'band', 'doyid']] 75 | #print(tmp.sort_values(['doy', 'band'])) 76 | 77 | # printtaa sellaiset, joilla bin1 on 1: 78 | #print(len(full_array_stack[full_array_stack['bin1'] == 1])) 79 | # printtaa sellaiset, joilla bin32 on 1: 80 | #print(len(full_array_stack[full_array_stack['bin32'] == 1])) 81 | 82 | # printtaa sellaiset, joiden rivisumma ei ole 1: 83 | #print(full_array_stack[full_array_stack.drop(['farmID', 'doy', 'band', 'doyid'], axis = 1).sum(axis = 1) != 1]) 84 | 85 | # printtaa näiden rivisummat: 86 | #tmp = full_array_stack[full_array_stack.drop(['farmID', 'doy', 'band', 'doyid'], axis = 
1).sum(axis = 1) < 1] 87 | #print(len(tmp)) # jotain pyöristysvirhettä ehkäpä vain 88 | #print(tmp.drop(['farmID', 'doy', 'band', 'doyid'], axis = 1).sum(axis = 1)) 89 | 90 | # Predictions to compare with forecasts: 15.6. eli DOY 166, that is pythonic 165. 91 | # and 15.7. eli DOY 196 92 | # and 15.8. eli DOY 227 93 | # and the last DOY 243 -> the final state 94 | 95 | #june = full_array_stack[full_array_stack['doy'] <= 165] 96 | #print(june.sort_values(['doy', 'band']).tail(20)) 97 | #print(june['doyid'].value_counts()) 98 | 99 | #july = full_array_stack[full_array_stack['doy'] <= 195] 100 | #august = full_array_stack[full_array_stack['doy'] <= 226] 101 | 102 | 103 | final = full_array_stack 104 | #print(final['doyid'].value_counts()) 105 | 106 | # Kuinka monta havaintoa per tila koko kesältä, mediaani? 107 | print("How many observations per farm in one season (median)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().median())) 108 | # Kuinka monta havaintoa per tila koko kesältä, max? 109 | print("How many observations per farm in one season (max)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().max())) 110 | # Kuinka monta havaintoa per tila koko kesältä, min? 111 | print("How many observations per farm in one season (min)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().min())) 112 | 113 | # koko kausi: 114 | farms = final.farmID.nunique() 115 | doys = final[dateVar].nunique() 116 | bands = 10 117 | bins = 32 118 | pivoted = final.pivot(index=['farmID', dateVar], columns='band', values=[*final.columns[final.columns.str.startswith('bin')]]) 119 | m = pd.MultiIndex.from_product([pivoted.index.get_level_values(0).unique(), pivoted.index.get_level_values(1).sort_values().unique()], names=pivoted.index.names) 120 | pt = pivoted.reindex(m, fill_value = 0) 121 | finalfinal = pt.to_numpy().reshape(farms, doys, bins, bands).swapaxes(2,3).reshape(farms,doys,bands*bins) 122 | 123 | outputfile2 = 'array_' + outputfile 124 | fp = os.path.join(out_dir_path, outputfile2) 125 | 126 | print(f"Shape of the 3D stack dataframe: {finalfinal.shape}") 127 | print(f"Output into file: {fp}") 128 | np.savez_compressed(fp, finalfinal) 129 | #save_intensities(fp, finalfinal) 130 | 131 | # save farmIDs for later merging with target y: 132 | farmIDs = pt.index.get_level_values(0).unique().str.rsplit('_',1).str[0].values 133 | print(f"\n\nNumber of farms: {len(farmIDs)}") 134 | outputfile2 = 'farmID_' + outputfile + '.pkl' 135 | fp = os.path.join(out_dir_path, outputfile2) 136 | print(f"Output farmIDs in file: {fp}") 137 | save_intensities(fp, farmIDs) 138 | 139 | 140 | 141 | def main(args): 142 | 143 | try: 144 | if not args.outdir or not args.setti: 145 | raise Exception('Missing output dir argument or dataset label (e.g. test1110). Try --help .') 146 | 147 | print(f'\n\nstack2ARD.py') 148 | print(f'\nInput files in {args.inputdir}') 149 | 150 | # directory for input, i.e. annual results: 151 | data_folder3 = args.inputdir 152 | 153 | # directory for outputs: 154 | out_dir_path = args.outdir 155 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 156 | 157 | # years: 158 | years = args.ylist 159 | setti = args.setti 160 | 161 | # outputfilename: 162 | #outputfile = '-'.join(setti) + '-' + '-'.join(years) + '.pkl' 163 | outputfile = '-'.join(setti) + '-' + '-'.join(years) 164 | 165 | 166 | 167 | print("\nPresuming preprocessing done earlier. 
If not done previously, please, run with histo2stack.py first!") 168 | 169 | print("\nCombining the years and data sets...") 170 | allyears = combineAllYears(data_folder3, setti, years) 171 | reshapeAndSave(allyears, out_dir_path, outputfile, args.rank) 172 | 173 | 174 | except Exception as e: 175 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 176 | parser.print_help() 177 | raise e 178 | 179 | if __name__ == '__main__': 180 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 181 | epilog=textwrap.dedent(__doc__)) 182 | 183 | parser.add_argument('-i', '--inputdir', 184 | type=str, 185 | help='Name of the input directory (where annual histogram dataframes are).', 186 | default='.') 187 | parser.add_argument('-o', '--outdir', 188 | type=str, 189 | help='Name of the output directory.', 190 | default='.') 191 | # is not true: cannot combine multiple data sets (crops), because farmID does not hold crop information -> duplicated farmIDs 192 | parser.add_argument('-f', '--setti', action='store', dest='setti', 193 | type=str, nargs='*', default=['1400'], 194 | help='Name of the data set. Can be also multiple. E.g. -f 1310 1320.') 195 | #parser.add_argument('-f', '--setti', 196 | # type=str, 197 | # default=['1400'], 198 | # help='Name of the data set. E.g. -f 1310.') 199 | parser.add_argument('-y', '--years', action='store', dest='ylist', 200 | type=str, nargs='*', default=['2018', '2019', '2020', '2021'], 201 | help="Optionally e.g. -y 2018 2019, default all") 202 | 203 | parser.add_argument('-r', '--rank', 204 | help='If saving time series by rank of days.', 205 | default=False, 206 | action='store_true') 207 | 208 | args = parser.parse_args() 209 | main(args) 210 | 211 | 212 | 213 | -------------------------------------------------------------------------------- /python/07C-doyFusion-median.py: -------------------------------------------------------------------------------- 1 | """ 2 | MY 2022-03-24 3 | 4 | Apply to all annual stack-files: add/sum of duplicates per doy, i.e. merge all observations per day per farm into one. 5 | 6 | 7 | RUN: 8 | 9 | python 07C-doyFusion-median.py -i cloudless/medianStack_annual -o cloudless/medianStack_annualFused 10 | 11 | """ 12 | import glob 13 | import os 14 | import pandas as pd 15 | import numpy as np 16 | import pickle 17 | import utils 18 | 19 | from pathlib import Path 20 | 21 | import argparse 22 | import textwrap 23 | 24 | 25 | 26 | ###### FUNCTIONS: 27 | 28 | def combineAllDOYs(data_folder, out_dir_path): 29 | # read files in inputdir: 30 | s = pd.Series(glob.glob(data_folder + '/*.pkl')) 31 | 32 | for filename in s: 33 | df = utils._load_intensities(filename) 34 | df2 = df.replace(0, np.nan) 35 | df3 = df2.groupby(['farmID', 'band', 'doy']).mean().reset_index() 36 | 37 | 38 | filename2 = os.path.join(out_dir_path, filename.split('/')[-1]) 39 | 40 | print(f"Saving {filename} to file: {filename2}") 41 | utils.save_intensities(filename2, df3) 42 | 43 | 44 | 45 | def main(args): 46 | 47 | try: 48 | if not args.inputdir or not args.outdir: 49 | raise Exception('Missing input or output dir. Try --help .') 50 | 51 | print(f'\n\n07C-doyFusion-median.py') 52 | print(f'\nInput files in {args.inputdir}') 53 | 54 | # directory for input, i.e. 
annual results: 55 | data_folder = args.inputdir 56 | 57 | # directory for outputs: 58 | out_dir_path = args.outdir 59 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 60 | 61 | print("\nCombining the doys within fused time window (11-days)...") 62 | combineAllDOYs(data_folder, out_dir_path) 63 | 64 | 65 | except Exception as e: 66 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 67 | parser.print_help() 68 | raise e 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 72 | epilog=textwrap.dedent(__doc__)) 73 | 74 | parser.add_argument('-i', '--inputdir', 75 | type=str, 76 | help='Name of the input directory (where annual histogram dataframes are).', 77 | default='.') 78 | parser.add_argument('-o', '--outdir', 79 | type=str, 80 | help='Name of the output directory.', 81 | default='.') 82 | args = parser.parse_args() 83 | main(args) 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /python/07C-doyFusion.py: -------------------------------------------------------------------------------- 1 | """ 2 | MY 2022-03-24 3 | 4 | Apply to all annual stack-files: add/sum of duplicates per doy, i.e. merge all observations per day per farm into one. 5 | 6 | 7 | RUN: 8 | 9 | python 07C-doyFusion.py -i dataStack_annual -o dataStack_annualFused 10 | 11 | Before this 06-histo2stack.py, after this 07-stack2ARD.py. 12 | 13 | """ 14 | import glob 15 | import os 16 | import pandas as pd 17 | import numpy as np 18 | import pickle 19 | import utils 20 | 21 | from pathlib import Path 22 | 23 | import argparse 24 | import textwrap 25 | 26 | 27 | 28 | ###### FUNCTIONS: 29 | 30 | def combineAllDOYs(data_folder, out_dir_path): 31 | # read files in inputdir: 32 | s = pd.Series(glob.glob(data_folder + '/*.pkl')) 33 | 34 | for filename in s: 35 | df = utils._load_intensities(filename) 36 | df2 = df.groupby(['farmID', 'band', 'doy']).aggregate(np.sum).reset_index() 37 | 38 | filename2 = os.path.join(out_dir_path, filename.split('/')[-1]) 39 | 40 | print(f"Saving {filename} to file: {filename2}") 41 | utils.save_intensities(filename2, df2) 42 | 43 | 44 | 45 | def main(args): 46 | 47 | try: 48 | if not args.inputdir or not args.outdir: 49 | raise Exception('Missing input or output dir. Try --help .') 50 | 51 | print(f'\n\n07C-doyFusion.py') 52 | print(f'\nInput files in {args.inputdir}') 53 | 54 | # directory for input, i.e. annual results: 55 | data_folder = args.inputdir 56 | 57 | # directory for outputs: 58 | out_dir_path = args.outdir 59 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 60 | 61 | print("\nCombining the doys within fused time window (11-days)...") 62 | combineAllDOYs(data_folder, out_dir_path) 63 | 64 | 65 | except Exception as e: 66 | print('\n\nUnable to read input or write out results. 
Check prerequisites and see exception output below.') 67 | parser.print_help() 68 | raise e 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 72 | epilog=textwrap.dedent(__doc__)) 73 | 74 | parser.add_argument('-i', '--inputdir', 75 | type=str, 76 | help='Name of the input directory (where annual histogram dataframes are).', 77 | default='.') 78 | parser.add_argument('-o', '--outdir', 79 | type=str, 80 | help='Name of the output directory.', 81 | default='.') 82 | args = parser.parse_args() 83 | main(args) 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /python/08-mergeTarget-parallel.py: -------------------------------------------------------------------------------- 1 | """ 2 | 20.8.2021 MY 3 | 4 | Merge farmID with target y. 5 | 6 | RUN: 7 | 8 | python 08-mergeTarget-parallel.py -i dataStack/ -k references-all.csv 9 | 10 | """ 11 | import pandas as pd 12 | import numpy as np 13 | import pickle 14 | import os.path 15 | from pathlib import Path 16 | import argparse 17 | import textwrap 18 | import re 19 | import glob 20 | import utils 21 | from itertools import repeat 22 | from multiprocessing import Pool 23 | 24 | maxcores = 18 25 | 26 | # FUNCTIONS: 27 | 28 | def makeTarget(inputfile, refefile, out_dir_path): 29 | # read array: 30 | arrayfile = utils.load_npintensities(inputfile) 31 | # read farmIDs: 32 | farmid = utils.readTargetID(inputfile) 33 | setti = utils.parse_xpath(inputfile) 34 | print(setti) 35 | fp1 = os.path.join(out_dir_path, 'y_' + setti + '.pkl') 36 | fp2 = os.path.join(out_dir_path, 'farmID_' + setti + '.pkl') 37 | fp3 = os.path.join(out_dir_path, inputfile.split('/')[-1]) 38 | 39 | idsdf = pd.DataFrame(farmid) 40 | idsdf.columns = ['farmID'] 41 | #print(idsdf.tail()) 42 | # read crop yields (target): 43 | targets = pd.read_csv(refefile) 44 | # merge: 45 | df = idsdf.merge(targets, how = 'left') 46 | if len(idsdf) == len(df): 47 | print(f'Length of farmIDs before and after merge match ({len(df)}).') 48 | if df['y'].isna().any(): 49 | print(f'There are NAs!') 50 | # this means, some y not found. Let's filter also array and farmID. 51 | print(f"There are {df['y'].isna().sum()} NAs.") 52 | #print(arrayfile.shape, farmid.shape, len(targets)) 53 | rowmaskNAs = np.array(df['y'].isna()) 54 | 55 | arrayfileClear = arrayfile[~rowmaskNAs, :, :] 56 | farmidClear = df['farmID'][~rowmaskNAs] 57 | yClear = df['y'][~rowmaskNAs] 58 | 59 | print(f'Saving filtered data.') 60 | 61 | print(f'Saving target y to {fp1}.') 62 | utils.save_intensities(fp1, yClear) 63 | 64 | print(f'Saving farmID to {fp2}.') 65 | utils.save_intensities(fp2, farmidClear) 66 | 67 | print(f'Saving arrayfiles into {fp3}.') 68 | np.savez_compressed(fp3, arrayfileClear) 69 | 70 | print(len(yClear), len(farmidClear), arrayfileClear.shape) 71 | 72 | else: 73 | print(f'Saving without the need to filter out NA data.') 74 | # Saving: 75 | 76 | print(f'Saving target y to {fp1}.') 77 | utils.save_intensities(fp1, df['y']) 78 | 79 | print(f'Saving farmID to {fp2}.') 80 | utils.save_intensities(fp2, df['farmID']) 81 | 82 | print(f'Saving arrayfiles into {fp3}.') 83 | np.savez_compressed(fp3, arrayfile) 84 | 85 | # HERE STARTS MAIN: 86 | 87 | def main(args): 88 | try: 89 | if not args.inputpath or not args.refefile: 90 | raise Exception('Missing farmID or target filepath argument. 
Try --help .') 91 | 92 | print(f'\n08-mergeTarget-parallel.py') 93 | print(f'\nStacked data in: {args.inputpath}') 94 | 95 | # directory for results: 96 | if 'median' in args.inputpath: 97 | out_dir_path = os.path.join(str(Path(args.inputpath).parents[0]), 'medianStack_ard') 98 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 99 | else: 100 | out_dir_path = os.path.join(str(Path(args.inputpath).parents[0]), 'dataStack_ard') 101 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 102 | 103 | 104 | print("\nMerging farmID and crop yields to make target set y...") 105 | filenames = glob.glob(args.inputpath + 'array*.npz') 106 | #print(filenames) 107 | if filenames: 108 | p = Pool(maxcores) 109 | p.starmap(makeTarget, zip(filenames, repeat(args.refefile), repeat(out_dir_path))) 110 | # wait for all tasks to finish 111 | p.close() 112 | 113 | 114 | #for fp in filenames: 115 | # print(fp) 116 | # makeTarget(fp, args.refefile, out_dir_path) 117 | 118 | 119 | 120 | print(f'\nDone.') 121 | 122 | except Exception as e: 123 | print('\n\nUnable to read input or write out statistics. Check prerequisites and see exception output below.') 124 | parser.print_help() 125 | raise e 126 | 127 | 128 | if __name__ == '__main__': 129 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 130 | epilog=textwrap.dedent(__doc__)) 131 | 132 | parser.add_argument('-i', '--inputpath', 133 | help='Path to data directory (dataStack_duplicatesRemoved).', 134 | type=str) 135 | 136 | parser.add_argument('-k', '--refefile', 137 | help='Filename of crop yields data. Remove bad data beforehand (like suspiciously low yields).', 138 | type=str) 139 | parser.add_argument('--debug', 140 | help='Verbose output for debugging.', 141 | action='store_true') 142 | 143 | args = parser.parse_args() 144 | main(args) 145 | -------------------------------------------------------------------------------- /python/08A-removeDuplicates-parallel.py: -------------------------------------------------------------------------------- 1 | """ 2 | MY 3 | 13.2.2022 Remove duplicates, compute marix addition. 4 | 5 | Reads all files in input path. Handles duplicates and corrects farmID file. Saves into *_duplicatesRemoved 6 | 7 | Do this before 08-mergeTarget.py. 
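Rows that share a farmID are merged by element-wise (matrix) addition of their
3D histogram slices, i.e. np.add.reduce(arrayfile[farm == farmid, :, :]) as in
theworks() below, so every farm ends up with exactly one row in the array and
in the farmID list.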
8 | 
9 | RUN: 
10 | 
11 | python 08A-removeDuplicates-parallel.py -i dataStack/ 
12 | 
13 | 
14 | 
15 | """ 
16 | 
17 | import pandas as pd 
18 | import numpy as np 
19 | import pickle 
20 | import os.path 
21 | from pathlib import Path 
22 | import argparse 
23 | import textwrap 
24 | import re 
25 | import glob 
26 | import utils 
27 | from iteration_utilities import duplicates, unique_everseen 
28 | from itertools import repeat 
29 | from multiprocessing import Pool 
30 | 
31 | maxcores = 18 
32 | 
33 | # FUNCTIONS: 
34 | 
35 | def theworks(fp, inputpath, out_dir_path): 
36 | 
37 |     arrayfile = utils.load_npintensities(fp) 
38 |     farmid = utils.readTargetID(fp) 
39 |     #print(list(duplicates(farmid))) 
40 |     rowmaskDuplicated = np.array([True if x in list(duplicates(farmid)) else False for x in farmid]) 
41 | 
42 | 
43 |     # if there are duplicates: 
44 |     if any(rowmaskDuplicated): 
45 |         # save the unique cases first: 
46 |         arrayfileClear = arrayfile[~rowmaskDuplicated, :, :] 
47 |         farmidClear = farmid[~rowmaskDuplicated] 
48 | 
49 |         for farm in list(unique_everseen(duplicates(farmid))): 
50 |             alist = arrayfile[farm == farmid, :, :] 
51 |             # matrix addition of multiple arrays: 
52 |             uusi = np.add.reduce(alist) 
53 | 
54 |             arrayfileClear = np.concatenate([arrayfileClear, uusi[np.newaxis,:,:]]) 
55 |             farmidClear = np.append(farmidClear, farm) 
56 | 
57 | 
58 |     else: # if there are no duplicates at all 
59 |         arrayfileClear = arrayfile 
60 |         farmidClear = farmid 
61 |         print('There are no duplicates at all.') 
62 | 
63 |     # last check: 
64 |     if arrayfileClear.shape[0] != farmidClear.shape[0]: 
65 |         print(f'List lengths not matching! Check {fp}') 
66 | 
67 |     print(f'There was {sum([True if x in list(duplicates(farmid)) else False for x in farmid])} duplicates.') 
68 |     print(f'Old array shape: {arrayfile.shape}') 
69 |     print(f'Old farm list shape: {farmid.shape}') 
70 |     print(f'New array shape: {arrayfileClear.shape}') 
71 |     print(f'New farm list shape: {farmidClear.shape}') 
72 | 
73 |     # Saving: 
74 |     tail = utils.parse_xpath(fp) 
75 |     print(tail) 
76 |     fp2 = 'farmID_' + tail + '.pkl' 
77 |     print(f'Saving farmID files into {os.path.join(out_dir_path, fp2)}.') 
78 |     utils.save_intensities(os.path.join(out_dir_path, fp2), farmidClear) 
79 |     fp3 = fp.split('/')[-1] 
80 |     print(f'Saving arrayfiles into {os.path.join(out_dir_path, fp3)}.') 
81 |     np.savez_compressed(os.path.join(out_dir_path, fp3), arrayfileClear) 
82 | 
83 | 
84 | 
85 | # HERE STARTS MAIN: 
86 | 
87 | def main(args): 
88 |     try: 
89 |         if not args.inputpath: 
90 |             raise Exception('Missing input dir argument. Try --help .') 
91 | 
92 |         print(f'\n\n08A-removeDuplicates-parallel.py') 
93 |         print(f'\nInput files in {args.inputpath}') 
94 | 
95 |         datadir = args.inputpath 
96 | 
97 |         # directory for results: 
98 |         out_dir_path = os.path.dirname(datadir) + "_duplicatesRemoved" 
99 |         Path(out_dir_path).mkdir(parents=True, exist_ok=True) 
100 | 
101 |         list_of_files = glob.glob(datadir + 'array*.npz') 
102 |         if list_of_files: 
103 |             p = Pool(maxcores) 
104 |             p.starmap(theworks, zip(list_of_files, repeat(datadir), repeat(out_dir_path))) 
105 |             # wait for all tasks to finish 
106 |             p.close() 
107 | 
108 |         #theworks(datadir, out_dir_path) 
109 | 
110 |         print('Done.') 
111 | 
112 |     except Exception as e: 
113 |         print('\n\nUnable to read input or write out results. 
Check prerequisites and see exception output below.') 114 | parser.print_help() 115 | raise e 116 | 117 | if __name__ == '__main__': 118 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 119 | epilog=textwrap.dedent(__doc__)) 120 | parser.add_argument('-i', '--inputpath', 121 | type=str, 122 | help='Path to the directory with stacked array files.', 123 | default='.') 124 | 125 | args = parser.parse_args() 126 | main(args) 127 | 128 | 129 | -------------------------------------------------------------------------------- /python/08B-mergeObservations-parallel.py: -------------------------------------------------------------------------------- 1 | """ 2 | MY 3 | 9.3.2022 Merge selected observations (e.g. by region), compute marix addition. 4 | 5 | Reads all files in input path. Handles duplicates and corrects farmID file. Saves into *_merged 6 | 7 | Run only for single years (regions are not meant to be used for training). 8 | 9 | After this run 08-mergeTarget.py. 10 | 11 | RUN: 12 | 13 | python 08B-mergeObservations-parallel.py -i dataStack/ -o dataStack/ \ 14 | -k satotilalistaJaKunta.csv -c 8 15 | 16 | """ 17 | 18 | import pandas as pd 19 | import numpy as np 20 | import pickle 21 | import os.path 22 | from pathlib import Path 23 | import argparse 24 | import textwrap 25 | import re 26 | import glob 27 | import utils 28 | from iteration_utilities import duplicates, unique_everseen 29 | from itertools import repeat 30 | from multiprocessing import Pool 31 | 32 | maxcores = 18 33 | 34 | # FUNCTIONS: 35 | 36 | def theworks(fp, inputpath, out_dir_path, chosenFarms): 37 | 38 | tail = utils.parse_xpath(fp) 39 | print(f'Starting processing {tail}') 40 | 41 | arrayfile = utils.load_npintensities(fp) 42 | farmid = utils.readTargetID(fp) 43 | 44 | rowmask = np.array([True if x in list(chosenFarms['farmID'].tolist()) else False for x in farmid]) 45 | 46 | 47 | # if there are any farms: 48 | if any(rowmask): 49 | 50 | newfarmid = farmid[rowmask] 51 | newarray = arrayfile[rowmask, :, :] 52 | 53 | newdf = pd.DataFrame(newfarmid, columns = ['farmID']) 54 | newdf2 = newdf.merge(chosenFarms) 55 | newdf2[['Year', 'farm_ID', 'Crop']] = newdf2['farmID'].str.split('_', expand = True) 56 | newfarmid = newdf2['Year'] + '_' + newdf2['KUNTA_KNRO_VUOSI'].astype('str') + '_' + newdf2['Crop'] 57 | 58 | # are there cases (regions) with only one observation (tila)? 59 | if set(newfarmid) - set(duplicates(newfarmid)): 60 | print(f'There are cases (regions) with only one observation (tila): {len(set(newfarmid) - set(duplicates(newfarmid)))}') 61 | print(f'Namely, these: {len(set(newfarmid) - set(duplicates(newfarmid)))}') 62 | 63 | l = [] 64 | lfarmid = [] 65 | 66 | print(f'There are {len(list(unique_everseen(duplicates(newfarmid))))} duplicated regions.') 67 | 68 | for farm in list(unique_everseen(duplicates(newfarmid))): 69 | print(farm) 70 | alist = newarray[[i in farm for i in newfarmid], :, :] 71 | # matrix addition of multiple arrays: 72 | uusi = np.add.reduce(alist) 73 | #l.append(uusi[np.newaxis,:,:]) 74 | l.append(uusi) 75 | lfarmid.append(farm) 76 | 77 | newarrayMerged = np.asarray(l) 78 | 79 | 80 | # last check: 81 | if newarray.shape[0] != newfarmid.shape[0]: 82 | print(f'List lengths not matching! 
Check {fp}') 83 | 84 | print(f'There were {rowmask.sum()} chosen farms.') 85 | print(f'Old array shape: {arrayfile.shape}') 86 | print(f'Old farm list shape: {farmid.shape}') 87 | print(f'New array shape: {newarrayMerged.shape}') 88 | print(f'New farm list shape: {len(lfarmid)}') 89 | 90 | # Saving: 91 | 92 | fp2 = 'farmID_' + tail + '.pkl' 93 | print(f'Saving farmID files into {os.path.join(out_dir_path, fp2)}.') 94 | utils.save_intensities(os.path.join(out_dir_path, fp2), lfarmid) 95 | fp3 = fp.split('/')[-1] 96 | print(f'Saving arrayfiles into {os.path.join(out_dir_path, fp3)}.') 97 | np.savez_compressed(os.path.join(out_dir_path, fp3), newarrayMerged) 98 | 99 | else: # if there are no duplicates at all 100 | print(f'There are no selected farms in {fp}.') 101 | 102 | 103 | 104 | # HERE STARTS MAIN: 105 | 106 | def main(args): 107 | try: 108 | if not args.inputpath: 109 | raise Exception('Missing input dir argument. Try --help .') 110 | 111 | print(f'\n\n08B-mergeObservations-parallel.py') 112 | print(f'\nInput files in {args.inputpath}') 113 | 114 | datadir = args.inputpath 115 | 116 | # directory for results: 117 | out_dir_path = os.path.dirname(args.outputpath) + "_merged" 118 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 119 | 120 | 121 | chosenFarms = pd.read_csv(args.kunnat) 122 | 123 | # only annual data sets: 124 | list_of_files = glob.glob(datadir + 'array_1' + ('[0-9]' * 3) + '-20' + ('[0-9]' * 2) + '.npz') 125 | if list_of_files: 126 | p = Pool(maxcores) 127 | p.starmap(theworks, zip(list_of_files, repeat(datadir), repeat(out_dir_path), repeat(chosenFarms))) 128 | # wait for all tasks to finish 129 | p.close() 130 | 131 | #theworks(datadir, out_dir_path) 132 | 133 | print('Done.') 134 | 135 | except Exception as e: 136 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 137 | parser.print_help() 138 | raise e 139 | 140 | if __name__ == '__main__': 141 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 142 | epilog=textwrap.dedent(__doc__)) 143 | parser.add_argument('-i', '--inputpath', 144 | type=str, 145 | help='Path to the directory with stacked array files.', 146 | default='.') 147 | parser.add_argument('-o', '--outputpath', 148 | type=str, 149 | help='Path to the output directory with merged array files.', 150 | default='.') 151 | parser.add_argument('-k', '--kunnat', 152 | type=str, 153 | help='Path to the file with kunnat, tilat.') 154 | parser.add_argument('-c', '--ncores', 155 | type=int, 156 | help='Number of cores to use.', 157 | default = 1) 158 | 159 | args = parser.parse_args() 160 | main(args) 161 | 162 | 163 | -------------------------------------------------------------------------------- /python/09-runRF-article-iterate.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2021-11-30 RF / iterable 2.1.2022 3 | 4 | RUN: 5 | 6 | Without testing set (makes train/validation split automatically): 7 | python 09-runRF-article-iterate.py -i dataStack/array_1110-2020.npz 8 | 9 | With testing set (kunta or separate year): 10 | python 09-runRF-article-iterate.py -i dataStack/array_1110-2018-2019.npz \ 11 | -j dataStack/array_1110-2020.npz 12 | 13 | 14 | NOTE: if you test with a separate year, be sure that training set excludes that year! 
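Each data set is trained and evaluated ntimes (10) times. Residuals are collected
for the full season ('farmfinal') and for in-season cut-offs at time-step indices
43, 73 and 104 ('farm43', 'farm73', 'farm104', roughly mid-June, mid-July and
mid-August). Per-iteration residuals are pickled, and the pooled RMSEs are appended
to predictions/<date>-iterative/iteratedRMSE.csv (created alongside the input data
directory) as rows of [setID, model, RMSE, set].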
15 | 16 | 17 | """ 18 | import glob 19 | import pandas as pd 20 | import numpy as np 21 | import os.path 22 | from pathlib import Path 23 | import argparse 24 | import textwrap 25 | import math 26 | import time 27 | import csv 28 | from scipy import stats 29 | import seaborn as sns 30 | import utils 31 | 32 | from sklearn.ensemble import RandomForestRegressor 33 | from sklearn import metrics 34 | import matplotlib.pyplot as plt 35 | 36 | # EDIT: 37 | # How many times to iterate each data set? 38 | ntimes = 10 39 | 40 | 41 | t = time.localtime() 42 | timeString = time.strftime("%Y-%m-%d", t) 43 | 44 | # FUNCTIONS: 45 | 46 | def runModel(model, Xtrain, ytrain, Xtest): 47 | model.fit(Xtrain, ytrain) 48 | test_predictions = model.predict(Xtest) 49 | return test_predictions 50 | 51 | def doRMSE(residuals): 52 | return np.sqrt(np.square(residuals).mean()) 53 | 54 | # HERE STARTS MAIN: 55 | 56 | def main(args): 57 | try: 58 | if not args.inputfile : 59 | raise Exception('Missing input filepath argument. Try --help .') 60 | 61 | print(f'\n09-runRF-article-iterate.py') 62 | print(f'\nARD data set in: {args.inputfile}') 63 | 64 | 65 | 66 | if 'median' in args.inputfile: 67 | print('Median as a sole feature') 68 | normalizer = 'median' 69 | else: 70 | # EDIT: 71 | #normalizer = "linear" # or "L1" 72 | normalizer = "L1" 73 | 74 | # read in array data: 75 | xtrain0 = utils.load_npintensities(args.inputfile) 76 | # normalize: 77 | xtrain = utils.normalise3D(xtrain0, normalizer) 78 | # read in target y: 79 | ytrain = utils.readTarget(args.inputfile) 80 | # jos ei anneta test set, niin tehdään split: 81 | if not args.testfile: 82 | print(f"\nSplitting {args.inputfile} into validation and training set:") 83 | xtrain, ytrain, xval, yval = utils.split_data(xtrain, ytrain) 84 | setID = utils.parse_xpath(args.inputfile) 85 | else: 86 | xval0 = utils.load_npintensities(args.testfile) 87 | # normalize: 88 | xval = utils.normalise3D(xval0, normalizer) 89 | yval = utils.readTarget(args.testfile) 90 | setID = utils.parse_xpath(args.testfile) 91 | 92 | # this needs 3D: 93 | m,n = xtrain.shape[:2] 94 | xtrain3d = xtrain.reshape(m,n,-1) 95 | m,n = xval.shape[:2] 96 | xval3d = xval.reshape(m,n,-1) 97 | 98 | if xval3d.shape[1] < xtrain3d.shape[1]: 99 | doysToAdd = xtrain3d.shape[1] - xval3d.shape[1] 100 | print(f"Shape of testing set differs from training set. We need to pad it with {doysToAdd} DOYs.") 101 | b = np.zeros( (xval3d.shape[0],doysToAdd,xval3d.shape[2]) ) 102 | xval3d = np.column_stack((xval3d,b)) 103 | print(f'New shape of padded xval3d is {xval3d.shape}.') 104 | 105 | if xtrain3d.shape[1] < xval3d.shape[1]: 106 | doysToAdd = xval3d.shape[1] - xtrain3d.shape[1] 107 | print(f"Shape of training set differs from testing set. 
We need to pad it with {doysToAdd} DOYs.") 108 | b = np.zeros( (xtrain3d.shape[0],doysToAdd,xtrain3d.shape[2]) ) 109 | xtrain3d = np.column_stack((xtrain3d,b)) 110 | print(f'New shape of padded xtrain3d is {xtrain3d.shape}.') 111 | 112 | # 2D: 113 | # make 2D: 114 | m = xval3d.shape[0] 115 | xval2d = xval3d.reshape(m,-1) 116 | m = xtrain3d.shape[0] 117 | xtrain2d = xtrain3d.reshape(m,-1) 118 | 119 | #pitää tehdä se in-season ennen kuin 2D: 120 | june = 43 121 | july = 73 122 | august = 104 123 | # June: 124 | xtrain3dnew = xtrain3d[:,:june,:] 125 | xval3dnew = xval3d[:,:june,:] 126 | 127 | # make 2D: 128 | m = xval3dnew.shape[0] 129 | XtestJune= xval3dnew.reshape(m,-1) 130 | m = xtrain3dnew.shape[0] 131 | XtrainJune = xtrain3dnew.reshape(m,-1) 132 | 133 | # July: 134 | xtrain3dnew = xtrain3d[:,:july,:] 135 | xval3dnew = xval3d[:,:july,:] 136 | 137 | # make 2D: 138 | m = xval3dnew.shape[0] 139 | XtestJuly = xval3dnew.reshape(m,-1) 140 | m = xtrain3dnew.shape[0] 141 | XtrainJuly = xtrain3dnew.reshape(m,-1) 142 | 143 | # August: 144 | xtrain3dnew = xtrain3d[:,:august,:] 145 | xval3dnew = xval3d[:,:august,:] 146 | 147 | # make 2D: 148 | m = xval3dnew.shape[0] 149 | XtestAugust = xval3dnew.reshape(m,-1) 150 | m = xtrain3dnew.shape[0] 151 | XtrainAugust = xtrain3dnew.reshape(m,-1) 152 | 153 | 154 | # MODEL: 155 | model = RandomForestRegressor(max_features = 8, n_jobs = -1, n_estimators = 500) 156 | 157 | if normalizer == 'median': 158 | modelname = 'RFmedian' 159 | else: 160 | if not args.testfile: 161 | modelname = 'RF' 162 | else: 163 | if 'ely' in args.testfile: 164 | modelname = 'RFely' 165 | if 'Rank' in args.testfile: 166 | modelname = 'RFrank' 167 | else: 168 | modelname = 'RFtest' 169 | 170 | df = [] 171 | 172 | # iterate predictions: 173 | for i in range(ntimes): 174 | print(f'Iteration {i+1}...') 175 | test_predictions = runModel(model, xtrain2d, ytrain, xval2d) 176 | dfResiduals = pd.DataFrame(np.subtract(test_predictions, yval)) 177 | dfResiduals.columns = ['farmfinal'] 178 | 179 | # June: 180 | test_predictions = runModel(model, XtrainJune, ytrain, XtestJune) 181 | dfResiduals['farm43'] = np.subtract(test_predictions, yval) 182 | 183 | # July: 184 | test_predictions = runModel(model, XtrainJuly, ytrain, XtestJuly) 185 | dfResiduals['farm73'] = np.subtract(test_predictions, yval) 186 | 187 | # August: 188 | test_predictions = runModel(model, XtrainAugust, ytrain, XtestAugust) 189 | dfResiduals['farm104'] = np.subtract(test_predictions, yval) 190 | 191 | df.append(dfResiduals) 192 | 193 | if not args.testfile: 194 | basepath = args.inputfile.split('/')[:-2] 195 | out_dir_results = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative') 196 | Path(out_dir_results).mkdir(parents=True, exist_ok=True) 197 | else: 198 | basepath = args.testfile.split('/')[:-2] 199 | out_dir_results = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative') 200 | Path(out_dir_results).mkdir(parents=True, exist_ok=True) 201 | 202 | t = time.localtime() 203 | timeString2 = time.strftime("%Y-%m-%d-%H:%M:%S", t) 204 | 205 | pklfile = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative', timeString2 + '-allIteratedRMSE-' + modelname + '-' + setID + '.pkl') 206 | print(f"\nWriting results to file {pklfile}.") 207 | utils.save_intensities(pklfile, df) 208 | 209 | csvfile = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative', 'iteratedRMSE.csv') 210 | print(f"\nWriting results to file {csvfile}.") 211 | 212 | for setti 
in ['farmfinal', 'farm43', 'farm73', 'farm104']: 213 | residuals = [] 214 | for i in range(ntimes): 215 | residuals.extend(df[i][setti]) 216 | rmse = doRMSE(residuals) 217 | 218 | with open(csvfile, "a+") as f: 219 | writer = csv.writer(f) 220 | writer.writerow([setID, modelname, round(rmse, 3), setti]) 221 | 222 | 223 | print(f'\nDone.') 224 | 225 | except Exception as e: 226 | print('\n\nUnable to read input or write out statistics. Check prerequisites and see exception output below.') 227 | parser.print_help() 228 | raise e 229 | 230 | 231 | if __name__ == '__main__': 232 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 233 | epilog=textwrap.dedent(__doc__)) 234 | 235 | parser.add_argument('-i', '--inputfile', 236 | help='Filepath of array intensities (training set).', 237 | type=str) 238 | parser.add_argument('-j', '--testfile', 239 | help='Filepath of the testing set (optional).', 240 | type=str) 241 | 242 | parser.add_argument('--debug', 243 | help='Verbose output for debugging.', 244 | action='store_true') 245 | 246 | args = parser.parse_args() 247 | main(args) 248 | 249 | -------------------------------------------------------------------------------- /python/09-runTCN-article-iterate.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2021-09-01 MY added normalization 3 | 2022-01-02 iterable, returns mean RMSE of ntimes iterated trainings. 4 | 2022-03-05 return also all RMSEs 5 | 6 | RUN: 7 | 8 | Without testing set (makes train/validation split automatically): 9 | python 09-runTCN-article-iterate.py -i dataStack/array_1110-2020.npz \ 10 | --epochs 200 --batchsize 128 --learningrate 0.001 --epsilon 0.1 11 | 12 | With testing set (region or separate year): 13 | python 09-runTCN-article-iterate.py -i dataStack/array_1110-2018-2019.npz \ 14 | -j dataStack/array_1110-2020.npz \ 15 | --epochs 200 --batchsize 128 --learningrate 0.001 --epsilon 0.1 16 | 17 | 18 | NOTE: if you test with a separate year, be sure that training set excludes that year! 19 | 20 | 21 | """ 22 | import glob 23 | import pandas as pd 24 | import numpy as np 25 | import os.path 26 | from pathlib import Path 27 | import argparse 28 | import textwrap 29 | import math 30 | import time 31 | import csv 32 | from scipy import stats 33 | import utils 34 | 35 | #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 36 | from tensorflow.keras.models import Sequential, save_model, load_model 37 | from tensorflow.keras.layers import Dense, Dropout, SimpleRNN, LSTM 38 | from tensorflow.keras.callbacks import EarlyStopping 39 | from tensorflow.keras.utils import plot_model 40 | from tensorflow.keras.optimizers import Adam 41 | 42 | from tcn import TCN, tcn_full_summary 43 | 44 | # pip install keras-tcn --user 45 | 46 | t = time.localtime() 47 | timeString = time.strftime("%Y-%m-%d", t) 48 | 49 | # EDIT: 50 | # How many times to iterate each data set? 51 | ntimes = 10 52 | 53 | # FUNCTIONS: 54 | 55 | def doRMSE(residuals): 56 | return np.sqrt(np.square(residuals).mean()) 57 | 58 | def temporalConvolutionalNetworks(shape1, shape2): 59 | print("\nTraining TCN...") 60 | 61 | tcn_layer = TCN(input_shape=(None, shape2), nb_filters = 32, padding = 'causal', kernel_size = 2, 62 | nb_stacks=1, dilations = [1, 2, 4, 8, 16], 63 | return_sequences=True 64 | ) 65 | 66 | # The receptive field tells you how far the model can see in terms of timesteps. 
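    # With the settings above (kernel_size=2, nb_stacks=1, dilations 1..16 and
    # two dilated convolutions per residual block) the receptive field should be
    # roughly 1 + 2*(2-1)*(1+2+4+8+16) = 63 time steps; the exact value printed
    # below depends on the installed keras-tcn version.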
67 | print('Receptive field size =', tcn_layer.receptive_field) 68 | 69 | model = Sequential([ 70 | tcn_layer, 71 | Dense(1) 72 | ]) 73 | 74 | 75 | # Model summary: 76 | print('\nNetwork architecture:') 77 | print(model.summary()) 78 | #print(tcn_full_summary(model)) 79 | 80 | return model 81 | 82 | 83 | 84 | def runModel(model, modelname, Xtrain, ytrain, Xtest, ytest, outputdir, epochs, batchsize, optimizeri, lera, epsiloni, setID, normalizer): 85 | 86 | # monitor validation progress 87 | early = EarlyStopping(monitor = "val_loss", mode = "min", patience = 10) 88 | callbacks_list = [early] 89 | 90 | if optimizeri == 'adam': 91 | model.compile(loss = 'mean_squared_error', 92 | optimizer = Adam(learning_rate=lera, epsilon = epsiloni), 93 | metrics = ['mse']) 94 | df = [] 95 | 96 | # iterate training: 97 | for i in range(ntimes): 98 | print(f'Iteration {i+1}...') 99 | history = model.fit(Xtrain, ytrain, 100 | epochs=epochs, batch_size=batchsize, verbose=0, 101 | validation_split = 0.20, 102 | callbacks = callbacks_list) 103 | 104 | test_predictions = model.predict(Xtest) 105 | 106 | dfResiduals = pd.DataFrame(np.subtract(test_predictions[:, -1, 0], ytest)) 107 | dfResiduals.columns = ['farmfinal'] 108 | 109 | # in this case using doys (130-243) (43, 73, 104) with zero-padding: 110 | june = 43 111 | july = 73 112 | august = 104 113 | 114 | #June: 115 | dfResiduals['farm43'] = np.subtract(test_predictions[:, june, 0], ytest) 116 | 117 | #July: 118 | dfResiduals['farm73'] = np.subtract(test_predictions[:, july, 0], ytest) 119 | 120 | #August: 121 | dfResiduals['farm104'] = np.subtract(test_predictions[:, august, 0], ytest) 122 | 123 | df.append(dfResiduals) 124 | 125 | return df 126 | 127 | 128 | # HERE STARTS MAIN: 129 | 130 | def main(args): 131 | try: 132 | if not args.inputfile : 133 | raise Exception('Missing input filepath argument. 
Try --help .') 134 | 135 | print(f'\n09-runTCN-article-iterate.py') 136 | print(f'\nARD data set in: {args.inputfile}') 137 | 138 | if 'median' in args.inputfile: 139 | print('Median as a sole feature...') 140 | normalizer = 'median' 141 | else: 142 | # EDIT: 143 | #normalizer = "linear" # or "L1" 144 | normalizer = "L1" 145 | 146 | ############################# Preprocessing: 147 | # read in array data: 148 | xtrain0 = utils.load_npintensities(args.inputfile) 149 | # normalize: 150 | xtrain = utils.normalise3D(xtrain0, normalizer) 151 | # read in target y: 152 | ytrain = utils.readTarget(args.inputfile) 153 | # jos ei anneta test set, niin tehdään split: 154 | if not args.testfile: 155 | print(f"\nSplitting {args.inputfile} into validation and training set:") 156 | xtrain, ytrain, xval, yval = utils.split_data(xtrain, ytrain) 157 | setID = utils.parse_xpath(args.inputfile) 158 | else: 159 | xval0 = utils.load_npintensities(args.testfile) 160 | # normalize: 161 | xval = utils.normalise3D(xval0, normalizer) 162 | yval = utils.readTarget(args.testfile) 163 | setID = utils.parse_xpath(args.testfile) 164 | 165 | 166 | if not args.testfile: 167 | basepath = args.inputfile.split('/')[:-2] 168 | out_dir_results = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative') 169 | Path(out_dir_results).mkdir(parents=True, exist_ok=True) 170 | else: 171 | basepath = args.testfile.split('/')[:-2] 172 | out_dir_results = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative') 173 | Path(out_dir_results).mkdir(parents=True, exist_ok=True) 174 | 175 | 176 | # this needs 3D: 177 | m,n = xtrain.shape[:2] 178 | xtrain3d = xtrain.reshape(m,n,-1) 179 | m,n = xval.shape[:2] 180 | xval3d = xval.reshape(m,n,-1) 181 | 182 | # forget zero-padding: 183 | #if xval3d.shape[1] < xtrain3d.shape[1]: 184 | # doysToAdd = xtrain3d.shape[1] - xval3d.shape[1] 185 | # print(f"Shape of testing set differs from training set. We need to pad it with {doysToAdd} DOYs.") 186 | # b = np.zeros( (xval3d.shape[0],doysToAdd,xval3d.shape[2]) ) 187 | # xval3d = np.column_stack((xval3d,b)) 188 | # print(f'New shape of padded xval3d is {xval3d.shape}.') 189 | 190 | #if xtrain3d.shape[1] < xval3d.shape[1]: 191 | # doysToAdd = xval3d.shape[1] - xtrain3d.shape[1] 192 | # print(f"Shape of training set differs from testing set. 
We need to pad it with {doysToAdd} DOYs.") 193 | # b = np.zeros( (xtrain3d.shape[0],doysToAdd,xtrain3d.shape[2]) ) 194 | # xtrain3d = np.column_stack((xtrain3d,b)) 195 | # print(f'New shape of padded xtrain3d is {xtrain3d.shape}.') 196 | 197 | ##################################### Models: 198 | # model topology: 199 | model = temporalConvolutionalNetworks(xtrain3d.shape[1], xtrain3d.shape[2]) 200 | if normalizer == 'median': 201 | modelname = 'TCNmedian' 202 | else: 203 | if not args.testfile: 204 | modelname = 'TCN' 205 | else: 206 | modelname = 'TCNtest' 207 | 208 | df = runModel(model, modelname, xtrain3d, ytrain, xval3d, yval, out_dir_results, args.epochs, args.batchsize, args.optimizer, args.learningrate, args.epsilon, setID, normalizer) 209 | 210 | basepath = args.inputfile.split('/')[:-2] 211 | out_dir_results = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative') 212 | Path(out_dir_results).mkdir(parents=True, exist_ok=True) 213 | 214 | t = time.localtime() 215 | timeString2 = time.strftime("%Y-%m-%d-%H:%M:%S", t) 216 | 217 | pklfile = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative', timeString2 + '-allIteratedRMSE-' + modelname + '-' + setID + '.pkl') 218 | 219 | print(f"\nWriting results to file {pklfile}.") 220 | utils.save_intensities(pklfile, df) 221 | 222 | csvfile = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative', 'iteratedRMSE.csv') 223 | print(f"\nWriting results to file {csvfile}.") 224 | 225 | 226 | for setti in ['farmfinal', 'farm43', 'farm73', 'farm104']: 227 | residuals = [] 228 | for i in range(ntimes): 229 | residuals.extend(df[i][setti]) 230 | rmse = doRMSE(residuals) 231 | 232 | with open(csvfile, "a+") as f: 233 | writer = csv.writer(f) 234 | writer.writerow([setID, modelname, round(rmse, 3), setti]) 235 | 236 | 237 | print(f'\nDone.') 238 | 239 | except Exception as e: 240 | print('\n\nUnable to read input or write out statistics. Check prerequisites and see exception output below.') 241 | parser.print_help() 242 | raise e 243 | 244 | 245 | if __name__ == '__main__': 246 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 247 | epilog=textwrap.dedent(__doc__)) 248 | 249 | parser.add_argument('-i', '--inputfile', 250 | help='Filepath of array intensities (training set).', 251 | type=str) 252 | parser.add_argument('-j', '--testfile', 253 | help='Filepath of the testing set (optional).', 254 | type=str) 255 | parser.add_argument('-e', '--epochs', 256 | help='An epoch is an iteration over the entire x and y data provided (default 20).', 257 | type=int, default = 20) 258 | parser.add_argument('-b', '--batchsize', 259 | help='Number of samples per gradient update (default 32).', 260 | type=int, default = 32) 261 | parser.add_argument('-o', '--optimizer', 262 | help='Optimizer (default adam).', 263 | type=str, default = 'adam') 264 | parser.add_argument('-l', '--learningrate', 265 | help='Learning rate (defaults to 0.001).', 266 | type=float, default = '0.001') 267 | parser.add_argument('-p', '--epsilon', 268 | help='A small constant for numerical stability (defaults to 1e-07).', 269 | type=float, default = '0.0000001') 270 | parser.add_argument('--debug', 271 | help='Verbose output for debugging.', 272 | action='store_true') 273 | 274 | args = parser.parse_args() 275 | main(args) 276 | --------------------------------------------------------------------------------
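Note on dependencies: every script above imports a local `utils` module that is not included in this listing. The sketch below is a minimal reconstruction based only on how the helpers are called in these scripts: save_intensities and _load_intensities mirror the pickle helpers defined locally in 07-stack2ARD.py, while the bodies of load_npintensities, parse_xpath, readTargetID and readTarget, and the assumption that array_<set>.npz, farmID_<set>.pkl and y_<set>.pkl share a directory and suffix, are inferred from usage rather than taken from the original implementation. normalise3D and split_data are omitted because their behaviour cannot be recovered from this listing.

import os
import pickle

import numpy as np


def save_intensities(filename, arrayvalues):
    # pickle any object (farmID lists, residual dataframes, target vectors, ...)
    with open(filename, 'wb+') as outputfile:
        pickle.dump(arrayvalues, outputfile)


def _load_intensities(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)


def load_npintensities(filename):
    # arrays are written with np.savez_compressed(fp, arr), which stores the
    # single positional array under the default key 'arr_0'
    return np.load(filename)['arr_0']


def parse_xpath(filename):
    # e.g. 'dataStack/array_1400-2018-2019.npz' -> '1400-2018-2019'
    tail = os.path.splitext(os.path.basename(filename))[0]
    return tail.replace('array_', '', 1)


def readTargetID(filename):
    # assumes farmID_<set>.pkl sits next to array_<set>.npz
    head = os.path.dirname(filename)
    return _load_intensities(os.path.join(head, 'farmID_' + parse_xpath(filename) + '.pkl'))


def readTarget(filename):
    # assumes y_<set>.pkl sits next to array_<set>.npz
    head = os.path.dirname(filename)
    return _load_intensities(os.path.join(head, 'y_' + parse_xpath(filename) + '.pkl'))

With this layout, e.g. readTarget('dataStack/array_1400-2018.npz') would load dataStack/y_1400-2018.pkl, matching how 08-mergeTarget-parallel.py names its outputs.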