├── README.md └── python ├── 01-splitshp-shadow.py ├── 02-pathfinder.py ├── 02-safefinder.py ├── 03-arrayextractor.py ├── 04-flatten-temporal.py ├── 05-histogramize-shadow.py ├── 05-medianize.py ├── 06-histo2stack.py ├── 06-median2stack.py ├── 07-medianstack2ARD.py ├── 07-stack2ARD.py ├── 07C-doyFusion-median.py ├── 07C-doyFusion.py ├── 08-mergeTarget-parallel.py ├── 08A-removeDuplicates-parallel.py ├── 08B-mergeObservations-parallel.py ├── 09-runRF-article-iterate.py └── 09-runTCN-article-iterate.py /README.md: -------------------------------------------------------------------------------- 1 | # Scalable crop yield mapping with Sentinel-2 time series and temporal convolutional network (TCN) 2 | 3 | This repository includes code for preprocessing data from the Sentinel-2 L2A product into time series ready for the prediction models: a temporal convolutional network (TCN) and random forests (RF). 4 | 5 | In python/: 6 | 7 | - 01-splitshp-shadow.py: splits an ESRI shapefile of polygons (field parcels) into subsets (files) by Sentinel-2 granule boundaries. 8 | - 02-pathfinder.py: searches file paths to Sentinel-2 bands. Use this if no cloud masking is intended. 9 | - 02-safefinder.py: searches directory paths to Sentinel-2 SAFE directories. Use this if cloud masking is wanted. 10 | - 03-arrayextractor.py: extracts pixel values from bands by polygon. A cloud mask is applied if SAFE paths are given. 11 | - 04-flatten-temporal.py: flattens the observations into 11-day temporal composites. 12 | - 05-histogramize-shadow.py: calculates histograms for each observation (band). 13 | - 05-medianize.py: calculates the median for each observation (band). 14 | - 06-histo2stack.py: stacks histograms from separate files into one file. 15 | - 06-median2stack.py: stacks medians from separate files into one file. 16 | - 07-medianstack2ARD.py: makes analysis-ready data from medians. 17 | - 07-stack2ARD.py: makes analysis-ready data from histograms. 18 | - 07C-doyFusion-median.py: if there are duplicates at a day-of-year, merges all observations per day per farm into one (matrix addition). 19 | - 07C-doyFusion.py: if there are duplicates at a day-of-year, merges all observations per day per farm into one (matrix addition). 20 | - 08A-removeDuplicates-parallel.py: removes duplicates, if any, by computing matrix addition. 21 | - 08B-mergeObservations-parallel.py: merges farms by region. 22 | - 08-mergeTarget-parallel.py: merges values with the reference data to write target y files for training. 23 | - 09-runRF-article-iterate.py: runs RF, iterating 10 times for each data set (hard-coded). 24 | - 09-runTCN-article-iterate.py: runs TCN, iterating 10 times for each data set (hard-coded). 25 | 26 | 27 | -------------------------------------------------------------------------------- /python/01-splitshp-shadow.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 2020-06-01 MY 4 | 5 | Usage: 6 | python splitshp-shadow.py --s2tiles suomiTiles.shp \ 7 | --fullshapefile shapefile --outshpdir satotutkimus-shpPerTile/ --out_file farmIDtile.tsv 8 | 9 | Modified version of EODIE splitshp.py 10 | 11 | """ 12 | 13 | import os 14 | from osgeo import osr 15 | import subprocess 16 | import sys 17 | 18 | import pandas as pd 19 | import geopandas as gpd 20 | 21 | import argparse 22 | import textwrap 23 | import pathlib 24 | 25 | 26 | def main(args): 27 | try: 28 | if not args.fullshapefile or not args.s2tiles: 29 | raise Exception('Missing shapefile argument. 
Try --help .') 30 | 31 | print(f'\n\nsplitshp-shadow.py') 32 | print(f'\nSentinel2 tiles: {args.s2tiles}') 33 | print(f'ESRI shapefile parcels: {args.fullshapefile}') 34 | out_dir_path = pathlib.Path(os.path.expanduser(args.outshpdir)) 35 | out_dir_path.mkdir(parents=True, exist_ok=True) 36 | 37 | 38 | out_file = args.out_file 39 | 40 | print('Reading parcels...') 41 | 42 | def checkProjection(myshp): 43 | print('INFO: checking the projection of the inputfile now') 44 | head, tail = os.path.split(myshp) 45 | root, ext = os.path.splitext(tail) 46 | rootprj = root + '.prj' 47 | projectionfile = os.path.join(head, rootprj) 48 | prj_file = open(projectionfile , 'r') 49 | prj_text = prj_file.read() 50 | srs = osr.SpatialReference() 51 | srs.ImportFromESRI([prj_text]) 52 | srs.AutoIdentifyEPSG() 53 | epsgcode = srs.GetAuthorityCode(None) 54 | if epsgcode == '3067': 55 | print('INFO: input shapefile has EPSG 3067, that works!') 56 | return myshp 57 | else: 58 | reprojectedshape = os.path.join(head, root + '_reprojected_3067'+ ext) 59 | #if not os.path.exists(reprojectedshape): 60 | reprojectcommand = 'ogr2ogr -t_srs EPSG:3067 ' + reprojectedshape + ' ' + myshp 61 | subprocess.call(reprojectcommand, shell=True) 62 | print('INFO: input shapefile had other than EPSG 3067, but was reprojected and works now') 63 | return reprojectedshape 64 | 65 | 66 | # bringing all input shapefiles to EPSG 3067 67 | print('Parcel shapefile: ') 68 | fullshapefile2 = checkProjection(args.fullshapefile) 69 | print('Sentinel2 shapefile: ') 70 | s2tiles2 = checkProjection(args.s2tiles) 71 | 72 | 73 | # filename: 74 | originalname = os.path.splitext(os.path.split(fullshapefile2)[-1])[0] 75 | 76 | 77 | # Tehdään loput geopandalla: 78 | 79 | tiles = gpd.read_file(s2tiles2) 80 | parcelshp = gpd.read_file(fullshapefile2) 81 | 82 | print(f'There are ', len(parcelshp), ' parcels in the input shapefile.') 83 | 84 | # for bookkeeping 85 | df = pd.DataFrame(columns = ['farmID', 'Tile']) 86 | 87 | for index, row in tiles.iterrows(): # Looping over all tiles 88 | tilename = row['Name'] 89 | 90 | # is there any parcels on this tile's BBOX: 91 | xmin, ymin, xmax, ymax = row['geometry'].bounds 92 | parcels = parcelshp.cx[xmin:xmax, ymin:ymax] 93 | 94 | if not parcels.empty: 95 | 96 | res_intersection = parcels['geometry'].within(row['geometry']) 97 | if any(res_intersection): 98 | parcelsToFile = parcels[res_intersection] 99 | outshpname = os.path.join(args.outshpdir,originalname + '_' + str(tilename)+'.shp') 100 | parcelsToFile.crs = '+proj=utm +zone=35 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs' 101 | parcelsToFile.to_file(outshpname) 102 | 103 | writeParcels = pd.DataFrame(parcelsToFile['farmID']) 104 | writeParcels['Tile'] = tilename 105 | #print(writeParcels) 106 | writeParcels.to_csv(out_file, mode='a', header=False) 107 | 108 | df = df.append(writeParcels, ignore_index = True) 109 | 110 | print(f'Intersecting farmIDs and tiles saved to {out_file}.') 111 | #df.to_csv(out_file, sep = '\t', index = False) 112 | 113 | print(f'\nDone.') 114 | 115 | except Exception as e: 116 | print('\n\nUnable to read input or write out files. 
Check prerequisites and see exception output below.') 117 | parser.print_help() 118 | raise e 119 | 120 | 121 | if __name__ == '__main__': 122 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 123 | epilog=textwrap.dedent(__doc__)) 124 | parser.add_argument('-s', '--s2tiles', 125 | type=str, 126 | help='Sentinel-2 tiles.') 127 | parser.add_argument('-a', '--fullshapefile', 128 | type=str, 129 | help='ESRI shapefile containing a set of polygons (.shp with its auxiliary files)') 130 | parser.add_argument('-d', '--outshpdir', 131 | help='Directory for output shp files', 132 | type=str, 133 | default='.') 134 | parser.add_argument('-o', '--out_file', 135 | help='Output (e.g. .tsv) tab-separated file containing farmID and the tile it was found at.', 136 | type=str, 137 | default='farmIDtile.tsv') 138 | 139 | args = parser.parse_args() 140 | main(args) 141 | 142 | -------------------------------------------------------------------------------- /python/02-pathfinder.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | 5 | EODIE PathFinder creates paths to all files between start- and enddate 6 | 7 | """ 8 | import os 9 | import userinput 10 | 11 | def makefilepaths(userinput): 12 | 13 | tilepath = userinput.datadir 14 | filepaths = [] 15 | 16 | #for tilepath in tilepaths: 17 | for filename in os.listdir(tilepath): 18 | print(filename) 19 | date = filename.split('_')[2].split('T')[0] 20 | if not userinput.enddate is None and not userinput.startdate is None: 21 | if date <= userinput.enddate and date >= userinput.startdate: 22 | filepath = os.path.join(tilepath,filename) 23 | filepaths.append(filepath) 24 | else: 25 | filepath = os.path.join(tilepath,filename) 26 | filepaths.append(filepath) 27 | 28 | return filepaths 29 | 30 | def makebandname(userinput): 31 | 32 | bandnames = [] 33 | 34 | for band in userinput.bandlist: 35 | if band > 10: 36 | if band == 13: 37 | bandname = 'B8A_20m' 38 | else: 39 | bandname = 'B'+str(band) + '_20m' 40 | elif band in [9,1]: 41 | bandname = 'B0'+str(band) +'_60m' 42 | elif band in [2,3,4,8]: 43 | bandname = 'B0'+str(band) +'_10m' 44 | else: 45 | bandname = 'B0'+str(band) + '_20m' 46 | 47 | bandnames.append(bandname) 48 | return bandnames 49 | 50 | def makebandpaths(): 51 | 52 | ui = userinput.UserInput() 53 | 54 | #from all filepaths, extend to matching band paths 55 | 56 | filepaths = makefilepaths(ui) 57 | bandpaths = [] 58 | 59 | for filepath in filepaths: 60 | granulepath = os.path.join(filepath,'GRANULE') 61 | betweenpath = os.path.join(granulepath,os.listdir(granulepath)[0]) 62 | imgpath = os.path.join(betweenpath,'IMG_DATA') 63 | r10 = os.path.join(imgpath,'R10m') 64 | r20 = os.path.join(imgpath,'R20m') 65 | r60 = os.path.join(imgpath,'R60m') 66 | bandlist = makebandname(ui) 67 | #print(bandlist) 68 | for rdir in [r10,r20,r60]: 69 | mylist = [os.path.join(rdir,bandfile) for bandfile in os.listdir(rdir) if bandfile.split('_')[-2] +'_' + bandfile.split('_')[-1][:3] in bandlist] 70 | bandpaths.extend(mylist) 71 | #print(mylist) 72 | 73 | to_txt(bandpaths) 74 | 75 | def to_txt(paths): 76 | 77 | with open('bandpaths.txt', 'w') as f: 78 | for item in paths: 79 | f.write("%s\n" % item) 80 | 81 | 82 | if __name__ == "__main__": 83 | makebandpaths() 84 | 85 | 86 | -------------------------------------------------------------------------------- /python/02-safefinder.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | 
safefinder.py creates paths to all files between start- and enddate. 4 | 5 | 17.8.2021 MY modified from EODIE pathfinder.py to find safe dirs. 6 | 7 | RUN: 8 | python safefinder.py -s 20200501 -e 20200905 -d /scratch/project_2002694/safedirs 9 | 10 | """ 11 | import os 12 | import userinput 13 | 14 | import argparse 15 | import textwrap 16 | import pathlib 17 | 18 | 19 | 20 | def makesafepaths(datadir, startdate, enddate): 21 | 22 | tilepath = datadir 23 | filepaths = [] 24 | 25 | for filename in os.listdir(tilepath): 26 | date = filename.split('_')[2].split('T')[0] 27 | if not enddate is None and not startdate is None: 28 | if date <= enddate and date >= startdate: 29 | filepath = os.path.join(tilepath,filename) 30 | filepaths.append(filepath) 31 | else: 32 | filepath = os.path.join(tilepath,filename) 33 | filepaths.append(filepath) 34 | 35 | to_txt(filepaths) 36 | 37 | def to_txt(paths): 38 | 39 | with open('../bin/safepaths.txt', 'w') as f: 40 | for item in paths: 41 | f.write("%s\n" % item) 42 | 43 | 44 | def main(args): 45 | try: 46 | if not args.datapath: 47 | raise Exception('Missing input dir argument. Try --help .') 48 | 49 | print(f'\n\nsafefinder.py') 50 | print(f'\n\nLists all SAFE directories within the start and end date.\n Writes the list to ../bin/safepaths.txt.') 51 | 52 | makesafepaths(args.datapath, args.startdate, args.enddate) 53 | 54 | except Exception as e: 55 | print('\n\nUnable to read input or write out files. Check prerequisites and see exception output below.') 56 | parser.print_help() 57 | raise e 58 | 59 | 60 | 61 | 62 | if __name__ == '__main__': 63 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 64 | epilog=textwrap.dedent(__doc__)) 65 | parser.add_argument('-d', '--datapath', 66 | type=str, 67 | help='Directory path to safe directories') 68 | parser.add_argument('-s', '--startdate', 69 | type=str, 70 | help='Start date, e.g. 20200501') 71 | parser.add_argument('-e', '--enddate', 72 | help='End date, e.g. 20200901', 73 | type=str, 74 | default='.') 75 | 76 | args = parser.parse_args() 77 | main(args) 78 | 79 | -------------------------------------------------------------------------------- /python/03-arrayextractor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Originally Samantha Wittke in 2020 for EODIE. 3 | 4 | Modified by Maria Yli-Heikkila, Markku Luotamo 5 | 6 | 2020-11-12 Commented out meta data writer (extractmeta), useful only for testing purposes. 7 | 2021-08 ML added cloud masking 8 | 2021-09-06 MY added option to use tempdir 9 | 2021-09-24 MY changed to save csv in UNIX format, not DOS; not saving empty files anymore 10 | 11 | USAGE: 12 | python 03-arrayextractor.py -f $name -shp $shppath -p $projectpath -jn ${ID} -id $idname -r 10 -t $TEMPDIRPATH 13 | 14 | WHERE: 15 | -f: raster file path 16 | -shp: polygons shapefile path 17 | -p: output path 18 | -jn: job number ID 19 | -id: name of the identifier variable in shapefile (e.g. 'parcelID') 20 | -r: for multi-band operation you must specify a common target resolution (e.g. 
10) 21 | -t: temporary directory path 22 | 23 | 24 | """ 25 | 26 | import csv 27 | import os 28 | import re 29 | from datetime import datetime 30 | from glob import glob 31 | from shutil import copyfile 32 | from typing import Optional 33 | from rasterstats import zonal_stats 34 | import functools 35 | 36 | import numpy as np 37 | import rasterio 38 | import shapeobject 39 | import userinput 40 | import traceback 41 | 42 | BANDS = ['B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B11', 'B12'] 43 | 44 | FILTER_OUT_SEN2COR_CLASSES = [0, 1, 3, 8, 9, 10] # No data, Cloud Shadows + Clouds medium+high probability + Cirrus 45 | #[0, 1, 3, 8, 9, 10] 46 | CLOUD_MASK_RESO_M = 20 47 | NO_DATA = np.nan 48 | INVALID = 0 49 | 50 | #direct array extraction 51 | 52 | def main(): 53 | 54 | ui = userinput.UserInput() 55 | jobnumber = ui.jobnumber 56 | bandpathtxt = ui.bandpath 57 | tile = parse_tile_from_path(bandpathtxt) 58 | shapedir = ui.shapedir 59 | namelist = os.listdir(shapedir)[0].split('_') 60 | shapename = '_'.join(namelist[:-1]) 61 | tileshapename = shapename + '_' + tile 62 | shapefile = os.path.join(shapedir, tileshapename) 63 | projectname = ui.projectname 64 | if not os.path.exists(projectname): 65 | print(f'Creating output direcory {projectname}...') 66 | os.makedirs(projectname) 67 | if ui.tmpdir: 68 | tmpdir = ui.tmpdir 69 | else: 70 | tmpdir = projectname 71 | shpfile = None 72 | if not jobnumber is None: 73 | for ext in ['.shp','.shx','.prj','.dbf']: 74 | shp = shapefile + ext 75 | if os.path.isfile(shp): 76 | #print(shp) 77 | jobdir = os.path.join(tmpdir,'temp',jobnumber) 78 | dst = os.path.join(jobdir, tileshapename + ext) 79 | if not os.path.exists(jobdir): 80 | os.makedirs(jobdir) 81 | copyfile(shp, dst) 82 | if dst.endswith('.shp'): 83 | shpfile = dst 84 | else: 85 | shpfile = shapefile + '.shp' 86 | 87 | extractarray(bandpathtxt, shpfile, tile, projectname, ui) 88 | 89 | 90 | def extractarray(raster_path, 91 | shpfile, 92 | tile, 93 | projectname, 94 | ui): 95 | 96 | cloud_mask_path: Optional[str] 97 | band_paths: [str] 98 | 99 | cloud_mask_path, band_paths = expand_raster_paths(raster_path, ui) 100 | 101 | shapeobj: shapeobject.ShapeObject = shapeobject.ShapeObject(shpfile) 102 | 103 | if cloud_mask_path: 104 | shpfile: str = shapeobj.checkProjection(cloud_mask_path) 105 | else: 106 | shpfile: str = shapeobj.checkProjection(raster_path) 107 | 108 | if cloud_mask_path: 109 | assert ui.target_resolution_m, 'For cloud masking you must specify a common target resolution.' 
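        # The SCL scene classification raster is cropped per parcel here using nearest-neighbour
        # resampling; filter_band_using_mask() later drops pixels whose SCL class is listed in
        # FILTER_OUT_SEN2COR_CLASSES (no data, cloud shadows, clouds, cirrus).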
110 | try: 111 | parcel_cloud_masks: [dict] = crop_band_raster_per_parcel(cloud_mask_path, shpfile, ui.target_resolution_m, 112 | resampling=rasterio.enums.Resampling.nearest) 113 | except Exception as e: 114 | print(f'Error reading cloud mask for tile {tile} from "{cloud_mask_path}"') 115 | raise e 116 | 117 | for band_path in band_paths: 118 | 119 | date: str = parse_date_from_path(band_path) 120 | band: str = parse_band_from_path(band_path) 121 | 122 | if ui.target_resolution_m: 123 | target_resolution_m: int = ui.target_resolution_m 124 | else: 125 | target_resolution_m: int = parse_resolution_from_path(band_path) 126 | try: 127 | band_raster_per_parcel: [dict] = crop_band_raster_per_parcel(band_path, shpfile, target_resolution_m) 128 | except: 129 | print(f'Error reading band {band} for tile {tile} ') 130 | 131 | csv_rows = [] 132 | 133 | if not cloud_mask_path: 134 | parcel_cloud_masks = [None] * len(band_raster_per_parcel) 135 | 136 | for parcel_band_raster, parcel_cloud_mask in zip(band_raster_per_parcel, parcel_cloud_masks): 137 | filtered_band: np.ndarray = filter_band_using_mask(parcel_band_raster, parcel_cloud_mask) 138 | if np.count_nonzero(filtered_band) == 0: 139 | continue 140 | 141 | parcel_id: str = parcel_band_raster['properties'][ui.idname] 142 | row: [str] = [parcel_id] + filtered_band.flatten().tolist() 143 | csv_rows.append(row) 144 | 145 | if csv_rows: 146 | tocsv(date,band,csv_rows,tile,projectname) 147 | 148 | 149 | def maximal_resolution_band_paths(bands: [str], band_root_path: str, max_resolution_m: int): 150 | candidate_paths = list(filter(lambda p: parse_resolution_from_path(p) 151 | and parse_resolution_from_path(p) >= max_resolution_m 152 | and parse_band_from_path(p) in bands, 153 | glob(f'{band_root_path}/**/*.jp2', recursive=True))) 154 | candidate_path_resolutions = list(map(parse_resolution_from_path, candidate_paths)) 155 | candidate_path_bands = list(map(parse_band_from_path, candidate_paths)) 156 | candidate_paths_sorted = sorted(list(zip(candidate_path_resolutions, candidate_paths, candidate_path_bands)), 157 | key=lambda t: t[0]) 158 | max_reso_path_by_band = {} 159 | for candidate in candidate_paths_sorted: 160 | reso, path, band = candidate 161 | if band in max_reso_path_by_band: 162 | continue 163 | else: 164 | max_reso_path_by_band[band] = path 165 | 166 | return list(max_reso_path_by_band.values()) 167 | 168 | 169 | def filter_band_using_mask(parcel_band_raster: [dict], parcel_cloud_mask: [dict]): 170 | parcel_array = parcel_band_raster['properties']['mini_raster_array'].filled(NO_DATA) 171 | 172 | if parcel_cloud_mask: 173 | cloud_array = parcel_cloud_mask['properties']['mini_raster_array'].filled(INVALID) 174 | cloud_mask = np.logical_not(sen2cor_binary_transformer(cloud_array)) 175 | filtered_array = parcel_array[cloud_mask] 176 | else: 177 | filtered_array = parcel_array 178 | 179 | return filtered_array[np.isfinite(filtered_array)].astype(np.uint16) 180 | 181 | 182 | def crop_band_raster_per_parcel(band_path:str, shpfile: str, target_resolution_m: int, 183 | resampling=rasterio.enums.Resampling.bilinear) -> [dict]: 184 | band_data, tile_band_resample_transform = \ 185 | resampled_raster_dataset(band_path, 186 | parse_resolution_from_path(band_path) / target_resolution_m, 187 | resampling=resampling) 188 | try: 189 | bandwise_zstats = zonal_stats(shpfile, 190 | band_data, 191 | affine=tile_band_resample_transform, 192 | stats=['count', 'nodata'], 193 | band=1, 194 | nodata=-999, 195 | geojson_out=True, 196 | all_touched=False, 197 | 
raster_out=True) 198 | except Exception as e: 199 | print(f'Error extracting polygons from raster "{band_path}" and shp "{shpfile}":') 200 | traceback.print_exc() 201 | raise e 202 | 203 | return bandwise_zstats 204 | 205 | 206 | def expand_raster_paths(raster_path: str, ui: userinput.UserInput): 207 | if '.jp2' in raster_path: # single band 208 | raster_paths = [raster_path] 209 | cloud_mask_path = ui.cloud_mask_path 210 | else: # all eligible bands in a SAFE dir 211 | assert ui.target_resolution_m, 'For multi-band operation you must specify a common target resolution.' 212 | raster_paths = maximal_resolution_band_paths(BANDS, raster_path, ui.target_resolution_m) 213 | cloud_mask_path = safe_cloud_mask_path(raster_path) 214 | 215 | return cloud_mask_path, raster_paths 216 | 217 | 218 | def resampled_raster_dataset(raster_path, scaling_factor, resampling=rasterio.enums.Resampling.bilinear): 219 | try: 220 | with rasterio.open(raster_path) as dataset: 221 | raster_data = dataset.read(1, 222 | out_shape=( 223 | dataset.count, 224 | int(dataset.height * scaling_factor), 225 | int(dataset.width * scaling_factor) 226 | ), 227 | resampling=resampling) 228 | resample_transform = \ 229 | dataset.transform * dataset.transform.scale( 230 | (dataset.width / raster_data.shape[-1]), 231 | (dataset.height / raster_data.shape[-2]) 232 | ) 233 | except Exception as e: 234 | print(f'Error reading raster file "{raster_path}":') 235 | traceback.print_exc() 236 | raise e 237 | return raster_data, resample_transform 238 | 239 | 240 | def parse_resolution_from_path(p: str): 241 | groups = re.match('.*_([0-9]{2})m.*', p) 242 | return int(groups[1]) if groups else None 243 | 244 | 245 | def parse_band_from_path(p: str): 246 | groups = re.match('.*_(B[0-9].).*', p) 247 | return groups[1] if groups else None 248 | 249 | 250 | def parse_tile_from_path(p: str): 251 | groups = re.match('.*T([0-9]{2}[A-Z]{3}).*', p) 252 | return groups[1] if groups else None 253 | 254 | 255 | def parse_date_from_path(rasterpath: str): 256 | return os.path.split(rasterpath)[-1].split('_')[1][:8] 257 | 258 | 259 | def safe_cloud_mask_path(safe_root: str): 260 | return glob(f'{safe_root}/**/*_SCL_20m.jp2', recursive=True)[0] 261 | 262 | 263 | def array_value_in_one_of(arr: np.ndarray, vals: list): 264 | return functools.reduce(lambda acc, class_ix: np.logical_or(acc,arr == class_ix), vals, False) 265 | 266 | 267 | def sen2cor_binary_transformer(array_raw): 268 | return array_value_in_one_of(array_raw, FILTER_OUT_SEN2COR_CLASSES) 269 | 270 | 271 | def tocsv(date,band,myarray,tile,projectname): 272 | csvfile = os.path.join(projectname,'array_'+tile + '_' + date +'_'+ band+'.csv') 273 | with open(csvfile, "w", newline='') as f: 274 | writer = csv.writer(f, lineterminator=os.linesep) 275 | writer.writerows(myarray) 276 | 277 | 278 | def extractmeta(bandtif, parcelID, mydate, count, nodata, projectname, band, tile): 279 | 280 | #(parcel_ID, year, day-of-year, name of the file (tile), mission ID (SA|SB), count) 281 | 282 | #band and tile could be gotten from bandtif 283 | 284 | metadatacsv = os.path.join(projectname,'meta_'+tile + '_' + mydate +'_'+ band+'.csv') 285 | 286 | mycolumns = ['parcelID','year','DOY','tilefilename','missionID','count', 'nodata'] 287 | 288 | if not os.path.exists(metadatacsv): # write the header 289 | with open(metadatacsv,'w') as csvfile: 290 | writer = csv.writer(csvfile, delimiter=',') 291 | writer.writerow(mycolumns) 292 | 293 | year = mydate[0:4] 294 | 295 | dateobj = datetime.strptime(mydate, '%Y%m%d') 296 | doy 
= (dateobj - datetime(dateobj.year, 1, 1)).days + 1 297 | 298 | bandtif = bandtif.split('/')[-6] 299 | tilefilename = ('_').join(bandtif.split('_')[0:6]) 300 | 301 | missionID = bandtif.split('_')[0] 302 | 303 | onerow = [parcelID, year, doy, tilefilename, missionID, count, nodata] 304 | 305 | with open(metadatacsv,'a') as csvfile: 306 | writer = csv.writer(csvfile, delimiter=',') 307 | writer.writerow(onerow) 308 | 309 | 310 | if __name__ == "__main__": 311 | main() 312 | -------------------------------------------------------------------------------- /python/04-flatten-temporal.py: -------------------------------------------------------------------------------- 1 | """ 2 | MY 2022-03-19 3 | 4 | Flatten observations' temporal dimension: 11-days combined 5 | 6 | RUN: python 04-flatten-temporal.py -i cloudless/results_1110_2018 -o cloudless/results_1110_2018 \ 7 | -c 19 8 | 9 | WHERE: 10 | i: input dir 11 | o: output dir 12 | c: Number of cores to use 13 | 14 | """ 15 | 16 | import os 17 | 18 | import argparse 19 | import textwrap 20 | from pathlib import Path 21 | 22 | from itertools import repeat 23 | from multiprocessing import Pool 24 | 25 | 26 | def concatSeparateDatesIntoOne(arrayfile, datadir, out_dir_path): 27 | 28 | if arrayfile.endswith('.csv') and arrayfile.startswith('array_'): 29 | #print(arrayfile) 30 | tile = arrayfile.split('_')[1] 31 | date0 = arrayfile.split('_')[2][:-3] 32 | date = int(arrayfile.split('_')[2][-2:]) 33 | month = arrayfile.split('_')[2][-3:-2] # works only for Jan-Sept 34 | monthNext = str(int(month) + 1) 35 | tail = arrayfile.split('_')[3] 36 | 37 | if date < 11: 38 | newdate = month + '11' 39 | elif (date >= 11 and date < 21): 40 | newdate = month + '21' 41 | else: 42 | newdate = monthNext + '01' 43 | 44 | newarrayfile = 'array_' + tile + '_' + date0 + newdate + '_' + tail 45 | #print(newarrayfile) 46 | 47 | arraypath = os.path.join(datadir,arrayfile) 48 | outputpath = os.path.join(out_dir_path,newarrayfile) 49 | 50 | os.system('cat {} >> {}' .format(str(arraypath), str(outputpath))) 51 | 52 | # Done. 53 | 54 | 55 | def main(args): 56 | try: 57 | if not args.inputpath or not args.outdir: 58 | raise Exception('Missing input or output dir argument. Try --help .') 59 | 60 | print(f'\n\n04-flatten-temporal.py') 61 | print(f'\nInput files in {args.inputpath}') 62 | 63 | fp = args.inputpath 64 | 65 | print(f'\nSaving time flattened arrays into {args.outdir}...') 66 | 67 | datadir = args.inputpath 68 | 69 | list_of_files = os.listdir(datadir) 70 | p = Pool(args.ncores) 71 | p.starmap(concatSeparateDatesIntoOne, zip(list_of_files, repeat(datadir), repeat(args.outdir))) 72 | # wait for all tasks to finish 73 | p.close() 74 | 75 | 76 | except Exception as e: 77 | print('\n\nUnable to read input or write out results. 
Check prerequisites and see exception output below.') 78 | parser.print_help() 79 | 80 | raise e 81 | 82 | if __name__ == '__main__': 83 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 84 | epilog=textwrap.dedent(__doc__)) 85 | parser.add_argument('-i', '--inputpath', 86 | type=str, 87 | help='Path to the directory with reflectance values.') 88 | parser.add_argument('-o', '--outdir', 89 | type=str, 90 | help='Name of the output directory.') 91 | parser.add_argument('-c', '--ncores', 92 | type=int, 93 | help='Number of cores to use.', 94 | default = 1) 95 | 96 | 97 | args = parser.parse_args() 98 | main(args) 99 | -------------------------------------------------------------------------------- /python/05-histogramize-shadow.py: -------------------------------------------------------------------------------- 1 | ############################ 2 | """ 3 | 16.6.2020 modified by MY 4 | 27.8.2021 remove normalization of histograms, do normalization later. 5 | 24.9.2021 changed to_csv to save in UNIX forma (not DOS) 6 | 20.11.2021 removed if below (or over) bin range, set 1st (or last) value 1, others 0. 7 | 8 | Based on Samatha Wittke's code histogramize.py for EODIE: 9 | input array csvs from main program, each line representing one field, first number being ID of the field 10 | output similar csv with each line representing one field, first number being ID of field, followed by x = bins numbers representing histogram values 11 | 12 | 13 | RUN: python histogramize-shadow.py -i input -o output -b B8A -n nrbins -l 2 -h 2000 14 | 15 | WHERE: 16 | i: input dir 17 | o: output dir 18 | b: band ID 19 | n: number of bins in histogram 20 | l: lower limit of range in histogram 21 | h: upper limit of range in histogram 22 | 23 | """ 24 | ###################### 25 | 26 | 27 | import os 28 | import numpy as np 29 | import sys 30 | import csv 31 | import os 32 | 33 | import argparse 34 | import textwrap 35 | import pathlib 36 | 37 | 38 | def to_csv(csvfile, myarray): 39 | 40 | csvfile = csvfile.replace('array_','histogram_') 41 | #with open(csvfile, "w") as f: 42 | with open(csvfile, "w", newline='') as f: 43 | writer = csv.writer(f, lineterminator=os.linesep) 44 | writer.writerows(myarray) 45 | 46 | def make_histogram(inarray,bin_seq): 47 | 48 | histo1, _ = np.histogram(inarray, bin_seq, density=False) 49 | return histo1 50 | 51 | def main(args): 52 | try: 53 | if not args.inputpath or not args.band: 54 | raise Exception('Missing input or output dir argument or band number (e.g. B8A). 
Try --help .') 55 | 56 | print(f'\n\nhistogramize-shadow.py') 57 | print(f'\nInput files in {args.inputpath}') 58 | print(f'Band: {args.band}') 59 | out_dir_path = pathlib.Path(os.path.expanduser(args.outdir)) 60 | out_dir_path.mkdir(parents=True, exist_ok=True) 61 | 62 | datadir = args.inputpath 63 | band = args.band 64 | 65 | bin_seq = np.linspace(args.minimum,args.maximum,args.nrbins+1) 66 | 67 | #print('Reading arrayfiles...') 68 | 69 | for arrayfile in os.listdir(datadir): 70 | if arrayfile.endswith(band + '.csv') and arrayfile.startswith('array_'): 71 | #print(arrayfile) 72 | histlist = [] 73 | arraypath = os.path.join(datadir,arrayfile) 74 | outputpath = os.path.join(out_dir_path,arrayfile) 75 | with open(arraypath, "r") as f: 76 | reader = csv.reader(f) 77 | for line in reader: 78 | myid = [line[0]] 79 | #if myid: 80 | line = [int(elem) for elem in line if not '_' in elem] 81 | #print(line) 82 | #if min(line) >= args.maximum: 83 | # #hist = [float(0)]*(args.nrbins-1); hist.append(float(1)) 84 | # hist = [0]*(args.nrbins-1); hist.append(1) 85 | #elif max(line) <= args.minimum: 86 | # hist = [1]; hist.extend([0]*(args.nrbins-1)) 87 | #else: 88 | # hist = make_histogram(line, bin_seq) 89 | hist = make_histogram(line, bin_seq) 90 | #print(hist) 91 | myid.extend(hist) 92 | hist2 = myid 93 | #print(hist2) 94 | histlist.append(hist2) 95 | #print(histlist) 96 | 97 | to_csv(outputpath,histlist) 98 | 99 | except Exception as e: 100 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 101 | parser.print_help() 102 | raise e 103 | 104 | if __name__ == '__main__': 105 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 106 | epilog=textwrap.dedent(__doc__)) 107 | parser.add_argument('-i', '--inputpath', 108 | type=str, 109 | help='Path to the directory with array csv files.', 110 | default='.') 111 | parser.add_argument('-n', '--nrbins', 112 | type=int, 113 | default=16, 114 | help='Number of bins.') 115 | parser.add_argument('-b', '--band', 116 | help='Band number (e.g. B02)', 117 | type=str) 118 | parser.add_argument('-l', '--minimum', 119 | help='The lower range of the bins.', 120 | type=int) 121 | parser.add_argument('-u', '--maximum', 122 | help='The upper range of the bins.', 123 | type=int) 124 | parser.add_argument('-o', '--outdir', 125 | type=str, 126 | help='Name of the output directory.', 127 | default='.') 128 | 129 | args = parser.parse_args() 130 | main(args) 131 | -------------------------------------------------------------------------------- /python/05-medianize.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2021-12-01 3 | 4 | python 05-medianize.py -i input -o output 5 | 6 | """ 7 | 8 | 9 | import os 10 | import numpy as np 11 | import sys 12 | import csv 13 | import os 14 | 15 | import argparse 16 | import textwrap 17 | import pathlib 18 | 19 | 20 | def to_csv(csvfile, myarray): 21 | 22 | csvfile = csvfile.replace('array_','median_') 23 | #with open(csvfile, "w") as f: 24 | with open(csvfile, "w", newline='') as f: 25 | writer = csv.writer(f, lineterminator=os.linesep) 26 | writer.writerows(myarray) 27 | 28 | def main(args): 29 | try: 30 | if not args.inputpath: 31 | raise Exception('Missing input or output dir argument. 
Try --help .') 32 | 33 | print(f'\n\n05-medianize.py') 34 | print(f'\nInput files in {args.inputpath}') 35 | 36 | out_dir_path = pathlib.Path(os.path.expanduser(args.outdir)) 37 | out_dir_path.mkdir(parents=True, exist_ok=True) 38 | 39 | datadir = args.inputpath 40 | 41 | 42 | #print('Reading arrayfiles...') 43 | 44 | for arrayfile in os.listdir(datadir): 45 | if arrayfile.startswith('array_'): 46 | #print(arrayfile) 47 | lista = [] 48 | arraypath = os.path.join(datadir,arrayfile) 49 | outputpath = os.path.join(out_dir_path,arrayfile) 50 | with open(arraypath, "r") as f: 51 | reader = csv.reader(f) 52 | 53 | for line in reader: 54 | myid = [line[0]] 55 | #if myid: 56 | line = [int(elem) for elem in line if not '_' in elem] 57 | median = np.median(line) 58 | myid.extend([int(median)]) 59 | median2 = myid 60 | lista.append(median2) 61 | 62 | if lista: 63 | to_csv(outputpath,lista) 64 | 65 | except Exception as e: 66 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 67 | parser.print_help() 68 | raise e 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 72 | epilog=textwrap.dedent(__doc__)) 73 | parser.add_argument('-i', '--inputpath', 74 | type=str, 75 | help='Path to the directory with array csv files.', 76 | default='.') 77 | 78 | parser.add_argument('-o', '--outdir', 79 | type=str, 80 | help='Name of the output directory.', 81 | default='.') 82 | 83 | args = parser.parse_args() 84 | main(args) 85 | 86 | 87 | -------------------------------------------------------------------------------- /python/06-histo2stack.py: -------------------------------------------------------------------------------- 1 | """ 2 | 17.8.2020 MY 3 | 23.10.2020 no more features as tuples but as array. 4 | 19.8.2021 modified to save into dataStack_annuals (instead of dataStack_temp). 5 | 31.8.2021 added option to use tempdir 6 | 7 | Make histo-files into annual dataframes. Saves into outputdir_annuals. 8 | 9 | createMissingFiles() checks if all 10 bands exists per observation (farm). If not, makes a copy of any band from the same doy and sets all values to zero. 10 | 11 | getAttributesFromFilename() adds tile-, DOY ja band information from filename to data. 12 | 13 | mergeAllGetNumpyArrays() makes one big dataframe for one year. Save to outputdir_annuals. 14 | 15 | testing(outputfile) tests if output file is ok. 16 | 17 | RUN: 18 | 19 | python 06-histo2stack.py -i histo_test1110_2016 -n 32 -o dataStack -f test1110_2016.pkl -t TEMPDIRPATH 20 | 21 | After this into 07-stack2ARD.py. 
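
The annual dataframe is pickled into <outdir>_annual/<outfile>. A minimal sketch for
inspecting it afterwards (paths follow the RUN example above; adapt to your own setup):

    import pandas as pd

    df = pd.read_pickle('dataStack_annual/test1110_2016.pkl')
    # columns: farmID, band, doy, bin1 ... binN
    print(df[['farmID', 'band', 'doy']].head())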
22 | 23 | """ 24 | 25 | import os 26 | import pandas as pd 27 | import numpy as np 28 | import pickle 29 | 30 | from pathlib import Path 31 | 32 | import argparse 33 | import textwrap 34 | from datetime import datetime 35 | 36 | 37 | ###### FUNCTIONS: 38 | 39 | def load_intensities(filename): 40 | with open(filename, "rb") as f: 41 | data = pickle.load(f) 42 | return data 43 | 44 | def save_intensities(filename, arrayvalues): 45 | with open(filename, 'wb+') as outputfile: 46 | pickle.dump(arrayvalues, outputfile) 47 | 48 | def createMissingFiles(datadir): 49 | # List all files 50 | list_of_files = os.listdir(datadir) 51 | 52 | # histogram_35VNL_20200830_B8A.csv 53 | # This removes the .csv and splits the name to three parts 54 | list_of_filename_parts = [i.replace(".csv","").split("_") for i in list_of_files] 55 | 56 | # Makes a df of all filenames 57 | df = pd.DataFrame(list_of_filename_parts, columns=['histo','tile','date','band']) 58 | #print(df.head()) 59 | 60 | # Group and iterate by date, see if bands are missing 61 | grouped_df = df.groupby(['date', 'tile']) 62 | 63 | # Bands as text that should exist 64 | bands = ['B02','B03','B04','B05','B06','B07','B08','B8A','B11','B12'] 65 | 66 | # Iterate 67 | for name, date_group in grouped_df: 68 | #print(name[1]) 69 | existing_bands = list(date_group['band']) 70 | for band in bands: 71 | if band not in existing_bands: 72 | # Band is missing create a mockup dataframe and save 73 | print(f"For date {name} band {band} is missing!") 74 | 75 | ### Copy from existing band, same date, set all values to 0 (or np.nan) 76 | 77 | temp_filename = os.path.join(datadir,"histogram_" + name[1] + "_" + name[0] + "_" + existing_bands[0] + ".csv") 78 | #print(temp_filename) 79 | dftemp = pd.read_csv(temp_filename, encoding='utf-8', header = None) 80 | #print(dftemp.iloc[:, 1:]) 81 | dftemp.iloc[:,1:] = 0 82 | #print(dftemp) 83 | 84 | output_filename = os.path.join(datadir,"histogram_" + name[1] + "_" + name[0] + "_" + band + ".csv") 85 | print(f"Saving a new file named {output_filename}") 86 | dftemp.to_csv(output_filename,encoding='utf-8',index=False, header=False) 87 | 88 | def getAttributesFromFilename(datadir, data_folder2): 89 | ### Add date and band to every file as columns 90 | 91 | # Loop files in data_folder 92 | for filename in os.listdir(datadir): 93 | if filename.endswith('.csv') and filename.startswith('histogram_'): 94 | #print(filename) 95 | try: 96 | df = pd.read_csv(os.path.join(datadir,filename), encoding='utf-8', header = None) 97 | except pd.errors.EmptyDataError: 98 | print(f'{os.path.join(datadir,filename)} was empty. 
Skipping.') 99 | continue 100 | # Add tile, band and date from filename to columns 101 | df['tile'] = filename.split("_")[1] 102 | pvm = filename.split("_")[2] 103 | df['doy'] = datetime.strptime(pvm, '%Y%m%d').timetuple().tm_yday 104 | #print(doy) 105 | df['band'] = filename.split("_")[3].replace(".csv","") 106 | #print(band) 107 | 108 | ### Write to data_folder2 109 | df.to_csv(os.path.join(data_folder2,filename), encoding='utf-8',index=False, header=False) 110 | 111 | def mergeAllGetNumpyArrays(data_folder2, data_folder3, bins, outputfile): 112 | ### Merge all files to one big dataframe 113 | 114 | df_array = [] 115 | 116 | ### Read files to pandas, add the dataframes to the array 117 | for filename in os.listdir(data_folder2): 118 | df = pd.read_csv(os.path.join(data_folder2,filename), encoding="utf-8", header=None) 119 | df.rename(columns={(bins + 1): 'tile', (bins + 2): 'doy', (bins + 3): 'band'}, inplace=True) 120 | try: 121 | df['farmID'] = df[0] + '_' + df['tile'] 122 | except Exception as e: 123 | print(f'\n\nThere is something wrong with file {os.path.join(data_folder2,filename)}...') 124 | print('Check that you have set the right number of bins!') 125 | raise e 126 | old_names = df.columns.tolist()[1:bins+1] 127 | new_names = [] 128 | for bin in range(bins): 129 | new_names.append("bin" + str(bin+1)) 130 | 131 | df = df.rename(columns=dict(zip(old_names, new_names))) 132 | df = df.drop(0, axis = 1) 133 | df = df[['farmID', 'band','doy', *df.columns[df.columns.str.startswith("bin")]]] 134 | df_array.append(df) 135 | 136 | ### Make a big dataframe out of the list of dataframes 137 | all_files_df = pd.concat(df_array) 138 | ### And save to temp: 139 | save_intensities(os.path.join(data_folder3,outputfile), all_files_df) 140 | 141 | return all_files_df 142 | 143 | def addDOYrank(all_files_df, out_dir_path, outputfile): 144 | #print(all_files_df.head()) 145 | days = all_files_df.doy.sort_values().unique() 146 | days_dict = dict(zip(days, range(len(days)))) 147 | print(days_dict) 148 | all_files_df2 = all_files_df 149 | return all_files_df2 150 | 151 | def testing(all_files_df, out_dir_path, outputfile): 152 | print("Output written to file: ", outputfile) 153 | 154 | tmp2 = all_files_df.groupby(['doy', 'farmID']).count()#.unstack().fillna(0) 155 | 156 | if tmp2[tmp2.band != 10]['band'].any(): 157 | print("Some bands missing!") 158 | else: 159 | print("All farms have full 10 bands!") 160 | 161 | # kuinka monta tilaa mukana? 162 | print("How many farms are observed from one or several S2 granules?:", len(all_files_df[['farmID']].drop_duplicates())) 163 | 164 | # kuinka monta tilaa mukana oikeasti? 165 | farmIDs = all_files_df['farmID'].str.rsplit('_',1).str[0] 166 | print("How many farms we really have?: ", len(farmIDs.drop_duplicates())) 167 | 168 | # Kuinka monta havaintoa per tila koko kesältä, mediaani? 169 | print("How many observations per farm in one season (median)?: ", float(all_files_df[['farmID', 'doy']].drop_duplicates().groupby(['farmID']).count().median())) 170 | 171 | # kuinka monta havaintoa per päivä, mediaani? 172 | print("How many observations per day (median)?: ", float(all_files_df[['farmID', 'doy']].drop_duplicates().groupby(['doy']).count().median())) 173 | 174 | 175 | def main(args): 176 | 177 | try: 178 | if not args.inputpath or not args.outdir: 179 | raise Exception('Missing input or output dir argument or bin number (e.g. 32). 
Try --help .') 180 | 181 | print(f'\n\nhisto2stack.py') 182 | print(f'\nInput files in {args.inputpath}') 183 | print(f'Bins: {args.bins}') 184 | out_dir_path = Path(os.path.expanduser(args.outdir)) 185 | out_dir_path.mkdir(parents=True, exist_ok=True) 186 | 187 | datadir = args.inputpath 188 | bins = args.bins 189 | outputfile = args.outfile 190 | 191 | # temp directory for annual histograms: 192 | data_folder2 = args.tmpdir 193 | Path(data_folder2).mkdir(parents=True, exist_ok=True) 194 | 195 | # directory for annual dataframes: 196 | data_folder3 = args.outdir + "_annual" 197 | Path(data_folder3).mkdir(parents=True, exist_ok=True) 198 | 199 | 200 | createMissingFiles(datadir) 201 | getAttributesFromFilename(datadir, data_folder2) 202 | 203 | # tämä tekee jo varsinaisen osuuden: 204 | all_files_df = mergeAllGetNumpyArrays(data_folder2, data_folder3, bins, outputfile) 205 | 206 | # loput on testausta: 207 | all_files_df = load_intensities(os.path.join(data_folder3,outputfile)) 208 | all_files_df = addDOYrank(all_files_df, out_dir_path, outputfile) 209 | testing(all_files_df, out_dir_path, outputfile) 210 | 211 | 212 | except Exception as e: 213 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 214 | parser.print_help() 215 | raise e 216 | 217 | if __name__ == '__main__': 218 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 219 | epilog=textwrap.dedent(__doc__)) 220 | parser.add_argument('-i', '--inputpath', 221 | type=str, 222 | help='Path to the directory with histogram csv files.', 223 | default='.') 224 | parser.add_argument('-n', '--bins', 225 | type=int, 226 | default=16, 227 | help='Number of bins.') 228 | parser.add_argument('-o', '--outdir', 229 | type=str, 230 | help='Name of the output directory.', 231 | default='.') 232 | parser.add_argument('-f', '--outfile', 233 | type=str, 234 | help='Name of the output file.', 235 | default='.') 236 | parser.add_argument('-t', '--tmpdir', 237 | type=str, 238 | help='Name of the temp directory.', 239 | default='.') 240 | args = parser.parse_args() 241 | main(args) 242 | 243 | 244 | -------------------------------------------------------------------------------- /python/06-median2stack.py: -------------------------------------------------------------------------------- 1 | """ 2 | 1.12.2021 MY 3 | 4 | Make histo-files into annual dataframes. Saves into outputdir_annuals. 5 | 6 | createMissingFiles() checks if all 10 bands exists per observation (farm). If not, makes a copy of any band from the same doy and sets all values to zero. 7 | 8 | getAttributesFromFilename() adds tile-, DOY ja band information from filename to data. 9 | 10 | mergeAllGetNumpyArrays() makes one big dataframe for one year. Save to outputdir_annuals. 11 | 12 | testing(outputfile) tests if output file is ok. 13 | 14 | RUN: 15 | 16 | python 06-median2stack.py -i median_test1110_2016 -o medianStack -f test1110_2016.pkl -t TEMPDIRPATH 17 | 18 | After this into 07-stack2ARD.py. 
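
Input files are the per-date, per-band CSVs written by 05-medianize.py, one row per parcel:
the parcel ID followed by the band median, e.g. (hypothetical values):

    median_35VNL_20200830_B8A.csv:
        1234567,1834
        1234568,2011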
19 | 20 | """ 21 | 22 | import os 23 | import pandas as pd 24 | import numpy as np 25 | import pickle 26 | 27 | from pathlib import Path 28 | 29 | import argparse 30 | import textwrap 31 | from datetime import datetime 32 | 33 | 34 | ###### FUNCTIONS: 35 | 36 | def load_intensities(filename): 37 | with open(filename, "rb") as f: 38 | data = pickle.load(f) 39 | return data 40 | 41 | def save_intensities(filename, arrayvalues): 42 | with open(filename, 'wb+') as outputfile: 43 | pickle.dump(arrayvalues, outputfile) 44 | 45 | def createMissingFiles(datadir): 46 | # List all files 47 | list_of_files = os.listdir(datadir) 48 | 49 | # median_35VNL_20200830_B8A.csv 50 | # This removes the .csv and splits the name to three parts 51 | list_of_filename_parts = [i.replace(".csv","").split("_") for i in list_of_files] 52 | 53 | # Makes a df of all filenames 54 | df = pd.DataFrame(list_of_filename_parts, columns=['histo','tile','date','band']) 55 | #print(df.head()) 56 | 57 | # Group and iterate by date, see if bands are missing 58 | grouped_df = df.groupby(['date', 'tile']) 59 | 60 | # Bands as text that should exist 61 | bands = ['B02','B03','B04','B05','B06','B07','B08','B8A','B11','B12'] 62 | 63 | # Iterate 64 | for name, date_group in grouped_df: 65 | #print(name[1]) 66 | existing_bands = list(date_group['band']) 67 | for band in bands: 68 | if band not in existing_bands: 69 | # Band is missing create a mockup dataframe and save 70 | print(f"For date {name} band {band} is missing!") 71 | 72 | ### Copy from existing band, same date, set all values to 0 (or np.nan) 73 | 74 | temp_filename = os.path.join(datadir,"median_" + name[1] + "_" + name[0] + "_" + existing_bands[0] + ".csv") 75 | #print(temp_filename) 76 | dftemp = pd.read_csv(temp_filename, encoding='utf-8', header = None) 77 | #print(dftemp.iloc[:, 1:]) 78 | dftemp.iloc[:,1:] = 0 79 | #print(dftemp) 80 | 81 | output_filename = os.path.join(datadir,"median_" + name[1] + "_" + name[0] + "_" + band + ".csv") 82 | print(f"Saving a new file named {output_filename}") 83 | dftemp.to_csv(output_filename,encoding='utf-8',index=False, header=False) 84 | 85 | def getAttributesFromFilename(datadir, data_folder2): 86 | ### Add date and band to every file as columns 87 | 88 | # Loop files in data_folder 89 | for filename in os.listdir(datadir): 90 | if filename.endswith('.csv') and filename.startswith('median_'): 91 | #print(filename) 92 | try: 93 | df = pd.read_csv(os.path.join(datadir,filename), encoding='utf-8', header = None) 94 | except pd.errors.EmptyDataError: 95 | print(f'{os.path.join(datadir,filename)} was empty. 
Skipping.') 96 | continue 97 | # Add tile, band and date from filename to columns 98 | df['tile'] = filename.split("_")[1] 99 | pvm = filename.split("_")[2] 100 | df['doy'] = datetime.strptime(pvm, '%Y%m%d').timetuple().tm_yday 101 | #print(doy) 102 | df['band'] = filename.split("_")[3].replace(".csv","") 103 | #print(band) 104 | 105 | ### Write to data_folder2 106 | df.to_csv(os.path.join(data_folder2,filename), encoding='utf-8',index=False, header=False) 107 | 108 | def mergeAllGetNumpyArrays(data_folder2, data_folder3, bins, outputfile): 109 | ### Merge all files to one big dataframe 110 | 111 | df_array = [] 112 | 113 | ### Read files to pandas, add the dataframes to the array 114 | for filename in os.listdir(data_folder2): 115 | df = pd.read_csv(os.path.join(data_folder2,filename), encoding="utf-8", header=None) 116 | df.rename(columns={(bins + 1): 'tile', (bins + 2): 'doy', (bins + 3): 'band'}, inplace=True) 117 | try: 118 | df['farmID'] = df[0] + '_' + df['tile'] 119 | except Exception as e: 120 | print(f'\n\nThere is something wrong with file {os.path.join(data_folder2,filename)}...') 121 | print('Check that you have set the right number of bins!') 122 | raise e 123 | old_names = df.columns.tolist()[1:bins+1] 124 | new_names = [] 125 | for bin in range(bins): 126 | new_names.append("bin" + str(bin+1)) 127 | 128 | df = df.rename(columns=dict(zip(old_names, new_names))) 129 | df = df.drop(0, axis = 1) 130 | df = df[['farmID', 'band','doy', *df.columns[df.columns.str.startswith("bin")]]] 131 | df_array.append(df) 132 | 133 | ### Make a big dataframe out of the list of dataframes 134 | all_files_df = pd.concat(df_array) 135 | ### And save to temp: 136 | save_intensities(os.path.join(data_folder3,outputfile), all_files_df) 137 | 138 | return all_files_df 139 | 140 | def addDOYrank(all_files_df, out_dir_path, outputfile): 141 | #print(all_files_df.head()) 142 | days = all_files_df.doy.sort_values().unique() 143 | days_dict = dict(zip(days, range(len(days)))) 144 | print(days_dict) 145 | all_files_df2 = all_files_df 146 | return all_files_df2 147 | 148 | def testing(all_files_df, out_dir_path, outputfile): 149 | print("Output written to file: ", outputfile) 150 | 151 | tmp2 = all_files_df.groupby(['doy', 'farmID']).count()#.unstack().fillna(0) 152 | 153 | if tmp2[tmp2.band != 10]['band'].any(): 154 | print("Some bands missing!") 155 | else: 156 | print("All farms have full 10 bands!") 157 | 158 | # kuinka monta tilaa mukana? 159 | print("How many farms are observed from one or several S2 granules?:", len(all_files_df[['farmID']].drop_duplicates())) 160 | 161 | # kuinka monta tilaa mukana oikeasti? 162 | farmIDs = all_files_df['farmID'].str.rsplit('_',1).str[0] 163 | print("How many farms we really have?: ", len(farmIDs.drop_duplicates())) 164 | 165 | # Kuinka monta havaintoa per tila koko kesältä, mediaani? 166 | print("How many observations per farm in one season (median)?: ", float(all_files_df[['farmID', 'doy']].drop_duplicates().groupby(['farmID']).count().median())) 167 | 168 | # kuinka monta havaintoa per päivä, mediaani? 169 | print("How many observations per day (median)?: ", float(all_files_df[['farmID', 'doy']].drop_duplicates().groupby(['doy']).count().median())) 170 | 171 | 172 | def main(args): 173 | 174 | try: 175 | if not args.inputpath or not args.outdir: 176 | raise Exception('Missing input or output dir argument. 
Try --help .') 177 | 178 | print(f'\n\n06-median2stack.py') 179 | print(f'\nInput files in {args.inputpath}') 180 | 181 | out_dir_path = Path(os.path.expanduser(args.outdir)) 182 | out_dir_path.mkdir(parents=True, exist_ok=True) 183 | 184 | datadir = args.inputpath 185 | outputfile = args.outfile 186 | 187 | bins = 1 # vain yksi feature eli mediaani 188 | 189 | # temp directory for annual medians: 190 | data_folder2 = args.tmpdir 191 | Path(data_folder2).mkdir(parents=True, exist_ok=True) 192 | 193 | # directory for annual dataframes: 194 | data_folder3 = args.outdir + "_annual" 195 | Path(data_folder3).mkdir(parents=True, exist_ok=True) 196 | 197 | 198 | createMissingFiles(datadir) 199 | getAttributesFromFilename(datadir, data_folder2) 200 | 201 | # tämä tekee jo varsinaisen osuuden: 202 | all_files_df = mergeAllGetNumpyArrays(data_folder2, data_folder3, bins, outputfile) 203 | 204 | # loput on testausta: 205 | all_files_df = load_intensities(os.path.join(data_folder3,outputfile)) 206 | all_files_df = addDOYrank(all_files_df, out_dir_path, outputfile) 207 | testing(all_files_df, out_dir_path, outputfile) 208 | 209 | 210 | except Exception as e: 211 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 212 | parser.print_help() 213 | raise e 214 | 215 | if __name__ == '__main__': 216 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 217 | epilog=textwrap.dedent(__doc__)) 218 | parser.add_argument('-i', '--inputpath', 219 | type=str, 220 | help='Path to the directory with median csv files.', 221 | default='.') 222 | 223 | parser.add_argument('-o', '--outdir', 224 | type=str, 225 | help='Name of the output directory.', 226 | default='.') 227 | parser.add_argument('-f', '--outfile', 228 | type=str, 229 | help='Name of the output file.', 230 | default='.') 231 | parser.add_argument('-t', '--tmpdir', 232 | type=str, 233 | help='Name of the temp directory.', 234 | default='.') 235 | args = parser.parse_args() 236 | main(args) 237 | 238 | 239 | 240 | -------------------------------------------------------------------------------- /python/07-medianstack2ARD.py: -------------------------------------------------------------------------------- 1 | """ 2 | 1.12.2021 3 | 4 | Combine annual stack-files into one array stack. 5 | 6 | combineAllYears() reads all annuals into one big dataframe. 7 | 8 | reshapeAndSave() pivots the dataframe by farmID and doy, converts to numpy array, fills with na (-> not ragged) and reshapes into 3D. Saves array and farmIDs into separate files. 9 | 10 | RUN: 11 | 12 | python 07-medianstack2ARD.py -i medianStack_annual -o medianStack/ -f 1400 -y 2018 2019 13 | 14 | After this into 08-mergeTarget.py. 
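
reshapeAndSave() writes a compressed NumPy array plus the matching farm IDs. A minimal sketch
for loading both outputs (file names follow the RUN example above; adapt to your own setup):

    import pickle
    import numpy as np

    X = np.load('medianStack/array_1400-2018-2019.npz')['arr_0']  # shape: (farms, doys, bands * bins)
    with open('medianStack/farmID_1400-2018-2019.pkl', 'rb') as f:
        farm_ids = pickle.load(f)
    assert X.shape[0] == len(farm_ids)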
15 | 16 | """ 17 | import glob 18 | import os 19 | import pandas as pd 20 | import numpy as np 21 | import pickle 22 | 23 | from pathlib import Path 24 | 25 | import argparse 26 | import textwrap 27 | from datetime import datetime 28 | 29 | 30 | ###### FUNCTIONS: 31 | 32 | def load_intensities(filename): 33 | with open(filename, "rb") as f: 34 | data = pickle.load(f) 35 | return data 36 | 37 | def save_intensities(filename, arrayvalues): 38 | with open(filename, 'wb+') as outputfile: 39 | pickle.dump(arrayvalues, outputfile) 40 | 41 | def combineAllYears(data_folder3, setti, years): 42 | # read files in inputdir: 43 | s = pd.Series(glob.glob(data_folder3 + '/*.pkl')) 44 | 45 | filepaths = [] 46 | 47 | for filename in s: 48 | for keyword1 in years: 49 | if keyword1 in filename: 50 | for keyword2 in setti: 51 | if keyword2 in filename: 52 | #print(filename) 53 | filepaths.append(filename) 54 | #print(filepaths) 55 | # open all chosen years into one dataframe: 56 | allyears = pd.concat(map(pd.read_pickle, filepaths), sort=False) 57 | return allyears 58 | 59 | def reshapeAndSave(full_array_stack, out_dir_path, outputfile, rank): 60 | # reshape and save data to 3D: 61 | print(f"\nLength of the data stack dataframe: {len(full_array_stack)}") 62 | 63 | if rank: 64 | dateVar = 'doyid' 65 | else: 66 | dateVar = 'doy' 67 | 68 | full_array_stack['doyid'] = full_array_stack.groupby(['farmID', 'band'])['doy'].rank(method="first", ascending=True).astype('int') 69 | 70 | #print(full_array_stack.sort_values(['farmID', 'doy']).tail(15)) 71 | 72 | # printtaa esimerkkitila: 73 | #tmp = full_array_stack[full_array_stack['farmID'] == '2019_12026885_35VMH'][['farmID', 'doy', 'band', 'doyid']] 74 | #print(tmp.sort_values(['doy', 'band'])) 75 | 76 | # printtaa sellaiset, joilla bin1 on 1: 77 | #print(len(full_array_stack[full_array_stack['bin1'] == 1])) 78 | # printtaa sellaiset, joilla bin32 on 1: 79 | #print(len(full_array_stack[full_array_stack['bin32'] == 1])) 80 | 81 | # printtaa sellaiset, joiden rivisumma ei ole 1: 82 | #print(full_array_stack[full_array_stack.drop(['farmID', 'doy', 'band', 'doyid'], axis = 1).sum(axis = 1) != 1]) 83 | 84 | # printtaa näiden rivisummat: 85 | #tmp = full_array_stack[full_array_stack.drop(['farmID', 'doy', 'band', 'doyid'], axis = 1).sum(axis = 1) < 1] 86 | #print(len(tmp)) # jotain pyöristysvirhettä ehkäpä vain 87 | #print(tmp.drop(['farmID', 'doy', 'band', 'doyid'], axis = 1).sum(axis = 1)) 88 | 89 | # Predictions to compare with forecasts: 15.6. eli DOY 166, that is pythonic 165. 90 | # and 15.7. eli DOY 196 91 | # and 15.8. eli DOY 227 92 | # and the last DOY 243 -> the final state 93 | 94 | #june = full_array_stack[full_array_stack['doy'] <= 165] 95 | #print(june.sort_values(['doy', 'band']).tail(20)) 96 | #print(june['doyid'].value_counts()) 97 | 98 | #july = full_array_stack[full_array_stack['doy'] <= 195] 99 | #august = full_array_stack[full_array_stack['doy'] <= 226] 100 | 101 | 102 | final = full_array_stack 103 | #print(final['doyid'].value_counts()) 104 | 105 | # Kuinka monta havaintoa per tila koko kesältä, mediaani? 106 | print("How many observations per farm in one season (median)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().median())) 107 | # Kuinka monta havaintoa per tila koko kesältä, max? 108 | print("How many observations per farm in one season (max)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().max())) 109 | # Kuinka monta havaintoa per tila koko kesältä, min? 
110 | print("How many observations per farm in one season (min)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().min())) 111 | 112 | # koko kausi: 113 | farms = final.farmID.nunique() 114 | doys = final[dateVar].nunique() 115 | bands = 10 116 | bins = 1 # nyt vain yksi feature eli median 117 | pivoted = final.pivot(index=['farmID', dateVar], columns='band', values=[*final.columns[final.columns.str.startswith('bin')]]) 118 | m = pd.MultiIndex.from_product([pivoted.index.get_level_values(0).unique(), pivoted.index.get_level_values(1).sort_values().unique()], names=pivoted.index.names) 119 | pt = pivoted.reindex(m, fill_value = 0) 120 | finalfinal = pt.to_numpy().reshape(farms, doys, bins, bands).swapaxes(2,3).reshape(farms,doys,bands*bins) 121 | 122 | outputfile2 = 'array_' + outputfile 123 | fp = os.path.join(out_dir_path, outputfile2) 124 | 125 | print(f"Shape of the 3D stack dataframe: {finalfinal.shape}") 126 | print(f"Output into file: {fp}") 127 | np.savez_compressed(fp, finalfinal) 128 | #save_intensities(fp, finalfinal) 129 | 130 | # save farmIDs for later merging with target y: 131 | farmIDs = pt.index.get_level_values(0).unique().str.rsplit('_',1).str[0].values 132 | print(f"\n\nNumber of farms: {len(farmIDs)}") 133 | outputfile2 = 'farmID_' + outputfile + '.pkl' 134 | fp = os.path.join(out_dir_path, outputfile2) 135 | print(f"Output farmIDs in file: {fp}") 136 | save_intensities(fp, farmIDs) 137 | 138 | 139 | 140 | def main(args): 141 | 142 | try: 143 | if not args.outdir or not args.setti: 144 | raise Exception('Missing output dir argument or dataset label (e.g. test1110). Try --help .') 145 | 146 | print(f'\n\nstack2ARD.py') 147 | print(f'\nInput files in {args.inputdir}') 148 | 149 | # directory for input, i.e. annual results: 150 | data_folder3 = args.inputdir 151 | 152 | # directory for outputs: 153 | out_dir_path = args.outdir 154 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 155 | 156 | # years: 157 | years = args.ylist 158 | setti = args.setti 159 | 160 | # outputfilename: 161 | #outputfile = '-'.join(setti) + '-' + '-'.join(years) + '.pkl' 162 | outputfile = '-'.join(setti) + '-' + '-'.join(years) 163 | 164 | 165 | 166 | print("\nPresuming preprocessing done earlier. If not done previously, please, run with histo2stack.py first!") 167 | 168 | print("\nCombining the years and data sets...") 169 | allyears = combineAllYears(data_folder3, setti, years) 170 | reshapeAndSave(allyears, out_dir_path, outputfile, args.rank) 171 | 172 | 173 | except Exception as e: 174 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 175 | parser.print_help() 176 | raise e 177 | 178 | if __name__ == '__main__': 179 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 180 | epilog=textwrap.dedent(__doc__)) 181 | 182 | parser.add_argument('-i', '--inputdir', 183 | type=str, 184 | help='Name of the input directory (where annual histogram dataframes are).', 185 | default='.') 186 | parser.add_argument('-o', '--outdir', 187 | type=str, 188 | help='Name of the output directory.', 189 | default='.') 190 | # is not true: cannot combine multiple data sets (crops), because farmID does not hold crop information -> duplicated farmIDs 191 | parser.add_argument('-f', '--setti', action='store', dest='setti', 192 | type=str, nargs='*', default=['1400'], 193 | help='Name of the data set. Can be also multiple. E.g. 
-f 1310 1320.') 194 | #parser.add_argument('-f', '--setti', 195 | # type=str, 196 | # default=['1400'], 197 | # help='Name of the data set. E.g. -f 1310.') 198 | parser.add_argument('-y', '--years', action='store', dest='ylist', 199 | type=str, nargs='*', default=['2018', '2019', '2020', '2021'], 200 | help="Optionally e.g. -y 2018 2019, default all") 201 | 202 | parser.add_argument('-r', '--rank', 203 | help='If saving time series by rank of days.', 204 | default=False, 205 | action='store_true') 206 | 207 | args = parser.parse_args() 208 | main(args) 209 | 210 | 211 | 212 | -------------------------------------------------------------------------------- /python/07-stack2ARD.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | MY 23.10.2020 4 | 5 | Combine annual stack-files into one array stack. 6 | 7 | combineAllYears() reads all annuals into one big dataframe. 8 | 9 | reshapeAndSave() pivots the dataframe by farmID and doy, converts to numpy array, fills with na (-> not ragged) and reshapes into 3D. Saves array and farmIDs into separate files. 10 | 11 | RUN: 12 | 13 | python 07-stack2ARD.py -i dataStack_annual -o dataStack/ -f 1400 -y 2018 2019 14 | 15 | After this into 08-mergeTarget.py. 16 | 17 | """ 18 | import glob 19 | import os 20 | import pandas as pd 21 | import numpy as np 22 | import pickle 23 | 24 | from pathlib import Path 25 | 26 | import argparse 27 | import textwrap 28 | from datetime import datetime 29 | 30 | 31 | ###### FUNCTIONS: 32 | 33 | def load_intensities(filename): 34 | with open(filename, "rb") as f: 35 | data = pickle.load(f) 36 | return data 37 | 38 | def save_intensities(filename, arrayvalues): 39 | with open(filename, 'wb+') as outputfile: 40 | pickle.dump(arrayvalues, outputfile) 41 | 42 | def combineAllYears(data_folder3, setti, years): 43 | # read files in inputdir: 44 | s = pd.Series(glob.glob(data_folder3 + '/*.pkl')) 45 | 46 | filepaths = [] 47 | 48 | for filename in s: 49 | for keyword1 in years: 50 | if keyword1 in filename: 51 | for keyword2 in setti: 52 | if keyword2 in filename: 53 | #print(filename) 54 | filepaths.append(filename) 55 | #print(filepaths) 56 | # open all chosen years into one dataframe: 57 | allyears = pd.concat(map(pd.read_pickle, filepaths), sort=False) 58 | return allyears 59 | 60 | def reshapeAndSave(full_array_stack, out_dir_path, outputfile, rank): 61 | # reshape and save data to 3D: 62 | print(f"\nLength of the data stack dataframe: {len(full_array_stack)}") 63 | 64 | if rank: 65 | dateVar = 'doyid' 66 | else: 67 | dateVar = 'doy' 68 | 69 | full_array_stack['doyid'] = full_array_stack.groupby(['farmID', 'band'])['doy'].rank(method="first", ascending=True).astype('int') 70 | 71 | #print(full_array_stack.sort_values(['farmID', 'doy']).tail(15)) 72 | 73 | # printtaa esimerkkitila: 74 | #tmp = full_array_stack[full_array_stack['farmID'] == '2019_12026885_35VMH'][['farmID', 'doy', 'band', 'doyid']] 75 | #print(tmp.sort_values(['doy', 'band'])) 76 | 77 | # printtaa sellaiset, joilla bin1 on 1: 78 | #print(len(full_array_stack[full_array_stack['bin1'] == 1])) 79 | # printtaa sellaiset, joilla bin32 on 1: 80 | #print(len(full_array_stack[full_array_stack['bin32'] == 1])) 81 | 82 | # printtaa sellaiset, joiden rivisumma ei ole 1: 83 | #print(full_array_stack[full_array_stack.drop(['farmID', 'doy', 'band', 'doyid'], axis = 1).sum(axis = 1) != 1]) 84 | 85 | # printtaa näiden rivisummat: 86 | #tmp = full_array_stack[full_array_stack.drop(['farmID', 'doy', 'band', 'doyid'], axis = 
1).sum(axis = 1) < 1] 87 | #print(len(tmp)) # jotain pyöristysvirhettä ehkäpä vain 88 | #print(tmp.drop(['farmID', 'doy', 'band', 'doyid'], axis = 1).sum(axis = 1)) 89 | 90 | # Predictions to compare with forecasts: 15.6. eli DOY 166, that is pythonic 165. 91 | # and 15.7. eli DOY 196 92 | # and 15.8. eli DOY 227 93 | # and the last DOY 243 -> the final state 94 | 95 | #june = full_array_stack[full_array_stack['doy'] <= 165] 96 | #print(june.sort_values(['doy', 'band']).tail(20)) 97 | #print(june['doyid'].value_counts()) 98 | 99 | #july = full_array_stack[full_array_stack['doy'] <= 195] 100 | #august = full_array_stack[full_array_stack['doy'] <= 226] 101 | 102 | 103 | final = full_array_stack 104 | #print(final['doyid'].value_counts()) 105 | 106 | # Kuinka monta havaintoa per tila koko kesältä, mediaani? 107 | print("How many observations per farm in one season (median)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().median())) 108 | # Kuinka monta havaintoa per tila koko kesältä, max? 109 | print("How many observations per farm in one season (max)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().max())) 110 | # Kuinka monta havaintoa per tila koko kesältä, min? 111 | print("How many observations per farm in one season (min)?: ", float(final[['farmID', dateVar]].drop_duplicates().groupby(['farmID']).count().min())) 112 | 113 | # koko kausi: 114 | farms = final.farmID.nunique() 115 | doys = final[dateVar].nunique() 116 | bands = 10 117 | bins = 32 118 | pivoted = final.pivot(index=['farmID', dateVar], columns='band', values=[*final.columns[final.columns.str.startswith('bin')]]) 119 | m = pd.MultiIndex.from_product([pivoted.index.get_level_values(0).unique(), pivoted.index.get_level_values(1).sort_values().unique()], names=pivoted.index.names) 120 | pt = pivoted.reindex(m, fill_value = 0) 121 | finalfinal = pt.to_numpy().reshape(farms, doys, bins, bands).swapaxes(2,3).reshape(farms,doys,bands*bins) 122 | 123 | outputfile2 = 'array_' + outputfile 124 | fp = os.path.join(out_dir_path, outputfile2) 125 | 126 | print(f"Shape of the 3D stack dataframe: {finalfinal.shape}") 127 | print(f"Output into file: {fp}") 128 | np.savez_compressed(fp, finalfinal) 129 | #save_intensities(fp, finalfinal) 130 | 131 | # save farmIDs for later merging with target y: 132 | farmIDs = pt.index.get_level_values(0).unique().str.rsplit('_',1).str[0].values 133 | print(f"\n\nNumber of farms: {len(farmIDs)}") 134 | outputfile2 = 'farmID_' + outputfile + '.pkl' 135 | fp = os.path.join(out_dir_path, outputfile2) 136 | print(f"Output farmIDs in file: {fp}") 137 | save_intensities(fp, farmIDs) 138 | 139 | 140 | 141 | def main(args): 142 | 143 | try: 144 | if not args.outdir or not args.setti: 145 | raise Exception('Missing output dir argument or dataset label (e.g. test1110). Try --help .') 146 | 147 | print(f'\n\nstack2ARD.py') 148 | print(f'\nInput files in {args.inputdir}') 149 | 150 | # directory for input, i.e. annual results: 151 | data_folder3 = args.inputdir 152 | 153 | # directory for outputs: 154 | out_dir_path = args.outdir 155 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 156 | 157 | # years: 158 | years = args.ylist 159 | setti = args.setti 160 | 161 | # outputfilename: 162 | #outputfile = '-'.join(setti) + '-' + '-'.join(years) + '.pkl' 163 | outputfile = '-'.join(setti) + '-' + '-'.join(years) 164 | 165 | 166 | 167 | print("\nPresuming preprocessing done earlier. 
If not done previously, please, run with histo2stack.py first!") 168 | 169 | print("\nCombining the years and data sets...") 170 | allyears = combineAllYears(data_folder3, setti, years) 171 | reshapeAndSave(allyears, out_dir_path, outputfile, args.rank) 172 | 173 | 174 | except Exception as e: 175 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 176 | parser.print_help() 177 | raise e 178 | 179 | if __name__ == '__main__': 180 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 181 | epilog=textwrap.dedent(__doc__)) 182 | 183 | parser.add_argument('-i', '--inputdir', 184 | type=str, 185 | help='Name of the input directory (where annual histogram dataframes are).', 186 | default='.') 187 | parser.add_argument('-o', '--outdir', 188 | type=str, 189 | help='Name of the output directory.', 190 | default='.') 191 | # is not true: cannot combine multiple data sets (crops), because farmID does not hold crop information -> duplicated farmIDs 192 | parser.add_argument('-f', '--setti', action='store', dest='setti', 193 | type=str, nargs='*', default=['1400'], 194 | help='Name of the data set. Can be also multiple. E.g. -f 1310 1320.') 195 | #parser.add_argument('-f', '--setti', 196 | # type=str, 197 | # default=['1400'], 198 | # help='Name of the data set. E.g. -f 1310.') 199 | parser.add_argument('-y', '--years', action='store', dest='ylist', 200 | type=str, nargs='*', default=['2018', '2019', '2020', '2021'], 201 | help="Optionally e.g. -y 2018 2019, default all") 202 | 203 | parser.add_argument('-r', '--rank', 204 | help='If saving time series by rank of days.', 205 | default=False, 206 | action='store_true') 207 | 208 | args = parser.parse_args() 209 | main(args) 210 | 211 | 212 | 213 | -------------------------------------------------------------------------------- /python/07C-doyFusion-median.py: -------------------------------------------------------------------------------- 1 | """ 2 | MY 2022-03-24 3 | 4 | Apply to all annual stack-files: add/sum of duplicates per doy, i.e. merge all observations per day per farm into one. 5 | 6 | 7 | RUN: 8 | 9 | python 07C-doyFusion-median.py -i cloudless/medianStack_annual -o cloudless/medianStack_annualFused 10 | 11 | """ 12 | import glob 13 | import os 14 | import pandas as pd 15 | import numpy as np 16 | import pickle 17 | import utils 18 | 19 | from pathlib import Path 20 | 21 | import argparse 22 | import textwrap 23 | 24 | 25 | 26 | ###### FUNCTIONS: 27 | 28 | def combineAllDOYs(data_folder, out_dir_path): 29 | # read files in inputdir: 30 | s = pd.Series(glob.glob(data_folder + '/*.pkl')) 31 | 32 | for filename in s: 33 | df = utils._load_intensities(filename) 34 | df2 = df.replace(0, np.nan) 35 | df3 = df2.groupby(['farmID', 'band', 'doy']).mean().reset_index() 36 | 37 | 38 | filename2 = os.path.join(out_dir_path, filename.split('/')[-1]) 39 | 40 | print(f"Saving {filename} to file: {filename2}") 41 | utils.save_intensities(filename2, df3) 42 | 43 | 44 | 45 | def main(args): 46 | 47 | try: 48 | if not args.inputdir or not args.outdir: 49 | raise Exception('Missing input or output dir. Try --help .') 50 | 51 | print(f'\n\n07C-doyFusion-median.py') 52 | print(f'\nInput files in {args.inputdir}') 53 | 54 | # directory for input, i.e. 
annual results: 55 | data_folder = args.inputdir 56 | 57 | # directory for outputs: 58 | out_dir_path = args.outdir 59 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 60 | 61 | print("\nCombining the doys within fused time window (11-days)...") 62 | combineAllDOYs(data_folder, out_dir_path) 63 | 64 | 65 | except Exception as e: 66 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 67 | parser.print_help() 68 | raise e 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 72 | epilog=textwrap.dedent(__doc__)) 73 | 74 | parser.add_argument('-i', '--inputdir', 75 | type=str, 76 | help='Name of the input directory (where annual histogram dataframes are).', 77 | default='.') 78 | parser.add_argument('-o', '--outdir', 79 | type=str, 80 | help='Name of the output directory.', 81 | default='.') 82 | args = parser.parse_args() 83 | main(args) 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /python/07C-doyFusion.py: -------------------------------------------------------------------------------- 1 | """ 2 | MY 2022-03-24 3 | 4 | Apply to all annual stack-files: add/sum of duplicates per doy, i.e. merge all observations per day per farm into one. 5 | 6 | 7 | RUN: 8 | 9 | python 07C-doyFusion.py -i dataStack_annual -o dataStack_annualFused 10 | 11 | Before this 06-histo2stack.py, after this 07-stack2ARD.py. 12 | 13 | """ 14 | import glob 15 | import os 16 | import pandas as pd 17 | import numpy as np 18 | import pickle 19 | import utils 20 | 21 | from pathlib import Path 22 | 23 | import argparse 24 | import textwrap 25 | 26 | 27 | 28 | ###### FUNCTIONS: 29 | 30 | def combineAllDOYs(data_folder, out_dir_path): 31 | # read files in inputdir: 32 | s = pd.Series(glob.glob(data_folder + '/*.pkl')) 33 | 34 | for filename in s: 35 | df = utils._load_intensities(filename) 36 | df2 = df.groupby(['farmID', 'band', 'doy']).aggregate(np.sum).reset_index() 37 | 38 | filename2 = os.path.join(out_dir_path, filename.split('/')[-1]) 39 | 40 | print(f"Saving {filename} to file: {filename2}") 41 | utils.save_intensities(filename2, df2) 42 | 43 | 44 | 45 | def main(args): 46 | 47 | try: 48 | if not args.inputdir or not args.outdir: 49 | raise Exception('Missing input or output dir. Try --help .') 50 | 51 | print(f'\n\n07C-doyFusion.py') 52 | print(f'\nInput files in {args.inputdir}') 53 | 54 | # directory for input, i.e. annual results: 55 | data_folder = args.inputdir 56 | 57 | # directory for outputs: 58 | out_dir_path = args.outdir 59 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 60 | 61 | print("\nCombining the doys within fused time window (11-days)...") 62 | combineAllDOYs(data_folder, out_dir_path) 63 | 64 | 65 | except Exception as e: 66 | print('\n\nUnable to read input or write out results. 
Check prerequisites and see exception output below.') 67 | parser.print_help() 68 | raise e 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 72 | epilog=textwrap.dedent(__doc__)) 73 | 74 | parser.add_argument('-i', '--inputdir', 75 | type=str, 76 | help='Name of the input directory (where annual histogram dataframes are).', 77 | default='.') 78 | parser.add_argument('-o', '--outdir', 79 | type=str, 80 | help='Name of the output directory.', 81 | default='.') 82 | args = parser.parse_args() 83 | main(args) 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /python/08-mergeTarget-parallel.py: -------------------------------------------------------------------------------- 1 | """ 2 | 20.8.2021 MY 3 | 4 | Merge farmID with target y. 5 | 6 | RUN: 7 | 8 | python 08-mergeTarget-parallel.py -i dataStack/ -k references-all.csv 9 | 10 | """ 11 | import pandas as pd 12 | import numpy as np 13 | import pickle 14 | import os.path 15 | from pathlib import Path 16 | import argparse 17 | import textwrap 18 | import re 19 | import glob 20 | import utils 21 | from itertools import repeat 22 | from multiprocessing import Pool 23 | 24 | maxcores = 18 25 | 26 | # FUNCTIONS: 27 | 28 | def makeTarget(inputfile, refefile, out_dir_path): 29 | # read array: 30 | arrayfile = utils.load_npintensities(inputfile) 31 | # read farmIDs: 32 | farmid = utils.readTargetID(inputfile) 33 | setti = utils.parse_xpath(inputfile) 34 | print(setti) 35 | fp1 = os.path.join(out_dir_path, 'y_' + setti + '.pkl') 36 | fp2 = os.path.join(out_dir_path, 'farmID_' + setti + '.pkl') 37 | fp3 = os.path.join(out_dir_path, inputfile.split('/')[-1]) 38 | 39 | idsdf = pd.DataFrame(farmid) 40 | idsdf.columns = ['farmID'] 41 | #print(idsdf.tail()) 42 | # read crop yields (target): 43 | targets = pd.read_csv(refefile) 44 | # merge: 45 | df = idsdf.merge(targets, how = 'left') 46 | if len(idsdf) == len(df): 47 | print(f'Length of farmIDs before and after merge match ({len(df)}).') 48 | if df['y'].isna().any(): 49 | print(f'There are NAs!') 50 | # this means, some y not found. Let's filter also array and farmID. 51 | print(f"There are {df['y'].isna().sum()} NAs.") 52 | #print(arrayfile.shape, farmid.shape, len(targets)) 53 | rowmaskNAs = np.array(df['y'].isna()) 54 | 55 | arrayfileClear = arrayfile[~rowmaskNAs, :, :] 56 | farmidClear = df['farmID'][~rowmaskNAs] 57 | yClear = df['y'][~rowmaskNAs] 58 | 59 | print(f'Saving filtered data.') 60 | 61 | print(f'Saving target y to {fp1}.') 62 | utils.save_intensities(fp1, yClear) 63 | 64 | print(f'Saving farmID to {fp2}.') 65 | utils.save_intensities(fp2, farmidClear) 66 | 67 | print(f'Saving arrayfiles into {fp3}.') 68 | np.savez_compressed(fp3, arrayfileClear) 69 | 70 | print(len(yClear), len(farmidClear), arrayfileClear.shape) 71 | 72 | else: 73 | print(f'Saving without the need to filter out NA data.') 74 | # Saving: 75 | 76 | print(f'Saving target y to {fp1}.') 77 | utils.save_intensities(fp1, df['y']) 78 | 79 | print(f'Saving farmID to {fp2}.') 80 | utils.save_intensities(fp2, df['farmID']) 81 | 82 | print(f'Saving arrayfiles into {fp3}.') 83 | np.savez_compressed(fp3, arrayfile) 84 | 85 | # HERE STARTS MAIN: 86 | 87 | def main(args): 88 | try: 89 | if not args.inputpath or not args.refefile: 90 | raise Exception('Missing farmID or target filepath argument. 
Try --help .') 91 | 92 | print(f'\n08-mergeTarget-parallel.py') 93 | print(f'\nStacked data in: {args.inputpath}') 94 | 95 | # directory for results: 96 | if 'median' in args.inputpath: 97 | out_dir_path = os.path.join(str(Path(args.inputpath).parents[0]), 'medianStack_ard') 98 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 99 | else: 100 | out_dir_path = os.path.join(str(Path(args.inputpath).parents[0]), 'dataStack_ard') 101 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 102 | 103 | 104 | print("\nMerging farmID and crop yields to make target set y...") 105 | filenames = glob.glob(args.inputpath + 'array*.npz') 106 | #print(filenames) 107 | if filenames: 108 | p = Pool(maxcores) 109 | p.starmap(makeTarget, zip(filenames, repeat(args.refefile), repeat(out_dir_path))) 110 | # wait for all tasks to finish 111 | p.close() 112 | 113 | 114 | #for fp in filenames: 115 | # print(fp) 116 | # makeTarget(fp, args.refefile, out_dir_path) 117 | 118 | 119 | 120 | print(f'\nDone.') 121 | 122 | except Exception as e: 123 | print('\n\nUnable to read input or write out statistics. Check prerequisites and see exception output below.') 124 | parser.print_help() 125 | raise e 126 | 127 | 128 | if __name__ == '__main__': 129 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 130 | epilog=textwrap.dedent(__doc__)) 131 | 132 | parser.add_argument('-i', '--inputpath', 133 | help='Path to data directory (dataStack_duplicatesRemoved).', 134 | type=str) 135 | 136 | parser.add_argument('-k', '--refefile', 137 | help='Filename of crop yields data. Remove bad data beforehand (like suspiciously low yields).', 138 | type=str) 139 | parser.add_argument('--debug', 140 | help='Verbose output for debugging.', 141 | action='store_true') 142 | 143 | args = parser.parse_args() 144 | main(args) 145 | -------------------------------------------------------------------------------- /python/08A-removeDuplicates-parallel.py: -------------------------------------------------------------------------------- 1 | """ 2 | MY 3 | 13.2.2022 Remove duplicates, compute marix addition. 4 | 5 | Reads all files in input path. Handles duplicates and corrects farmID file. Saves into *_duplicatesRemoved 6 | 7 | Do this before 08-mergeTarget.py. 
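Rows that share a farmID are merged by element-wise (matrix) addition of their
3D histogram slices, i.e. np.add.reduce(arrayfile[farm == farmid, :, :]) as in
theworks() below, so every farm ends up with exactly one row in the array and
in the farmID list.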
8 | 
9 | RUN: 
10 | 
11 | python 08A-removeDuplicates-parallel.py -i dataStack/ 
12 | 
13 | 
14 | 
15 | """ 
16 | 
17 | import pandas as pd 
18 | import numpy as np 
19 | import pickle 
20 | import os.path 
21 | from pathlib import Path 
22 | import argparse 
23 | import textwrap 
24 | import re 
25 | import glob 
26 | import utils 
27 | from iteration_utilities import duplicates, unique_everseen 
28 | from itertools import repeat 
29 | from multiprocessing import Pool 
30 | 
31 | maxcores = 18 
32 | 
33 | # FUNCTIONS: 
34 | 
35 | def theworks(fp, inputpath, out_dir_path): 
36 | 
37 |     arrayfile = utils.load_npintensities(fp) 
38 |     farmid = utils.readTargetID(fp) 
39 |     #print(list(duplicates(farmid))) 
40 |     rowmaskDuplicated = np.array([True if x in list(duplicates(farmid)) else False for x in farmid]) 
41 | 
42 | 
43 |     # if there are duplicates: 
44 |     if any(rowmaskDuplicated): 
45 |         # save the unique cases first: 
46 |         arrayfileClear = arrayfile[~rowmaskDuplicated, :, :] 
47 |         farmidClear = farmid[~rowmaskDuplicated] 
48 | 
49 |         for farm in list(unique_everseen(duplicates(farmid))): 
50 |             alist = arrayfile[farm == farmid, :, :] 
51 |             # matrix addition of multiple arrays: 
52 |             uusi = np.add.reduce(alist) 
53 | 
54 |             arrayfileClear = np.concatenate([arrayfileClear, uusi[np.newaxis,:,:]]) 
55 |             farmidClear = np.append(farmidClear, farm) 
56 | 
57 | 
58 |     else: # if there are no duplicates at all 
59 |         arrayfileClear = arrayfile 
60 |         farmidClear = farmid 
61 |         print('There are no duplicates at all.') 
62 | 
63 |     # last check: 
64 |     if arrayfileClear.shape[0] != farmidClear.shape[0]: 
65 |         print(f'List lengths not matching! Check {fp}') 
66 | 
67 |     print(f'There was {sum([True if x in list(duplicates(farmid)) else False for x in farmid])} duplicates.') 
68 |     print(f'Old array shape: {arrayfile.shape}') 
69 |     print(f'Old farm list shape: {farmid.shape}') 
70 |     print(f'New array shape: {arrayfileClear.shape}') 
71 |     print(f'New farm list shape: {farmidClear.shape}') 
72 | 
73 |     # Saving: 
74 |     tail = utils.parse_xpath(fp) 
75 |     print(tail) 
76 |     fp2 = 'farmID_' + tail + '.pkl' 
77 |     print(f'Saving farmID files into {os.path.join(out_dir_path, fp2)}.') 
78 |     utils.save_intensities(os.path.join(out_dir_path, fp2), farmidClear) 
79 |     fp3 = fp.split('/')[-1] 
80 |     print(f'Saving arrayfiles into {os.path.join(out_dir_path, fp3)}.') 
81 |     np.savez_compressed(os.path.join(out_dir_path, fp3), arrayfileClear) 
82 | 
83 | 
84 | 
85 | # HERE STARTS MAIN: 
86 | 
87 | def main(args): 
88 |     try: 
89 |         if not args.inputpath: 
90 |             raise Exception('Missing input dir argument. Try --help .') 
91 | 
92 |         print(f'\n\n08A-removeDuplicates-parallel.py') 
93 |         print(f'\nInput files in {args.inputpath}') 
94 | 
95 |         datadir = args.inputpath 
96 | 
97 |         # directory for results: 
98 |         out_dir_path = os.path.dirname(datadir) + "_duplicatesRemoved" 
99 |         Path(out_dir_path).mkdir(parents=True, exist_ok=True) 
100 | 
101 |         list_of_files = glob.glob(datadir + 'array*.npz') 
102 |         if list_of_files: 
103 |             p = Pool(maxcores) 
104 |             p.starmap(theworks, zip(list_of_files, repeat(datadir), repeat(out_dir_path))) 
105 |             # wait for all tasks to finish 
106 |             p.close() 
107 | 
108 |         #theworks(datadir, out_dir_path) 
109 | 
110 |         print('Done.') 
111 | 
112 |     except Exception as e: 
113 |         print('\n\nUnable to read input or write out results. 
Check prerequisites and see exception output below.') 114 | parser.print_help() 115 | raise e 116 | 117 | if __name__ == '__main__': 118 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 119 | epilog=textwrap.dedent(__doc__)) 120 | parser.add_argument('-i', '--inputpath', 121 | type=str, 122 | help='Path to the directory with stacked array files.', 123 | default='.') 124 | 125 | args = parser.parse_args() 126 | main(args) 127 | 128 | 129 | -------------------------------------------------------------------------------- /python/08B-mergeObservations-parallel.py: -------------------------------------------------------------------------------- 1 | """ 2 | MY 3 | 9.3.2022 Merge selected observations (e.g. by region), compute marix addition. 4 | 5 | Reads all files in input path. Handles duplicates and corrects farmID file. Saves into *_merged 6 | 7 | Run only for single years (regions are not meant to be used for training). 8 | 9 | After this run 08-mergeTarget.py. 10 | 11 | RUN: 12 | 13 | python 08B-mergeObservations-parallel.py -i dataStack/ -o dataStack/ \ 14 | -k satotilalistaJaKunta.csv -c 8 15 | 16 | """ 17 | 18 | import pandas as pd 19 | import numpy as np 20 | import pickle 21 | import os.path 22 | from pathlib import Path 23 | import argparse 24 | import textwrap 25 | import re 26 | import glob 27 | import utils 28 | from iteration_utilities import duplicates, unique_everseen 29 | from itertools import repeat 30 | from multiprocessing import Pool 31 | 32 | maxcores = 18 33 | 34 | # FUNCTIONS: 35 | 36 | def theworks(fp, inputpath, out_dir_path, chosenFarms): 37 | 38 | tail = utils.parse_xpath(fp) 39 | print(f'Starting processing {tail}') 40 | 41 | arrayfile = utils.load_npintensities(fp) 42 | farmid = utils.readTargetID(fp) 43 | 44 | rowmask = np.array([True if x in list(chosenFarms['farmID'].tolist()) else False for x in farmid]) 45 | 46 | 47 | # if there are any farms: 48 | if any(rowmask): 49 | 50 | newfarmid = farmid[rowmask] 51 | newarray = arrayfile[rowmask, :, :] 52 | 53 | newdf = pd.DataFrame(newfarmid, columns = ['farmID']) 54 | newdf2 = newdf.merge(chosenFarms) 55 | newdf2[['Year', 'farm_ID', 'Crop']] = newdf2['farmID'].str.split('_', expand = True) 56 | newfarmid = newdf2['Year'] + '_' + newdf2['KUNTA_KNRO_VUOSI'].astype('str') + '_' + newdf2['Crop'] 57 | 58 | # are there cases (regions) with only one observation (tila)? 59 | if set(newfarmid) - set(duplicates(newfarmid)): 60 | print(f'There are cases (regions) with only one observation (tila): {len(set(newfarmid) - set(duplicates(newfarmid)))}') 61 | print(f'Namely, these: {len(set(newfarmid) - set(duplicates(newfarmid)))}') 62 | 63 | l = [] 64 | lfarmid = [] 65 | 66 | print(f'There are {len(list(unique_everseen(duplicates(newfarmid))))} duplicated regions.') 67 | 68 | for farm in list(unique_everseen(duplicates(newfarmid))): 69 | print(farm) 70 | alist = newarray[[i in farm for i in newfarmid], :, :] 71 | # matrix addition of multiple arrays: 72 | uusi = np.add.reduce(alist) 73 | #l.append(uusi[np.newaxis,:,:]) 74 | l.append(uusi) 75 | lfarmid.append(farm) 76 | 77 | newarrayMerged = np.asarray(l) 78 | 79 | 80 | # last check: 81 | if newarray.shape[0] != newfarmid.shape[0]: 82 | print(f'List lengths not matching! 
Check {fp}') 83 | 84 | print(f'There were {rowmask.sum()} chosen farms.') 85 | print(f'Old array shape: {arrayfile.shape}') 86 | print(f'Old farm list shape: {farmid.shape}') 87 | print(f'New array shape: {newarrayMerged.shape}') 88 | print(f'New farm list shape: {len(lfarmid)}') 89 | 90 | # Saving: 91 | 92 | fp2 = 'farmID_' + tail + '.pkl' 93 | print(f'Saving farmID files into {os.path.join(out_dir_path, fp2)}.') 94 | utils.save_intensities(os.path.join(out_dir_path, fp2), lfarmid) 95 | fp3 = fp.split('/')[-1] 96 | print(f'Saving arrayfiles into {os.path.join(out_dir_path, fp3)}.') 97 | np.savez_compressed(os.path.join(out_dir_path, fp3), newarrayMerged) 98 | 99 | else: # if there are no duplicates at all 100 | print(f'There are no selected farms in {fp}.') 101 | 102 | 103 | 104 | # HERE STARTS MAIN: 105 | 106 | def main(args): 107 | try: 108 | if not args.inputpath: 109 | raise Exception('Missing input dir argument. Try --help .') 110 | 111 | print(f'\n\n08B-mergeObservations-parallel.py') 112 | print(f'\nInput files in {args.inputpath}') 113 | 114 | datadir = args.inputpath 115 | 116 | # directory for results: 117 | out_dir_path = os.path.dirname(args.outputpath) + "_merged" 118 | Path(out_dir_path).mkdir(parents=True, exist_ok=True) 119 | 120 | 121 | chosenFarms = pd.read_csv(args.kunnat) 122 | 123 | # only annual data sets: 124 | list_of_files = glob.glob(datadir + 'array_1' + ('[0-9]' * 3) + '-20' + ('[0-9]' * 2) + '.npz') 125 | if list_of_files: 126 | p = Pool(maxcores) 127 | p.starmap(theworks, zip(list_of_files, repeat(datadir), repeat(out_dir_path), repeat(chosenFarms))) 128 | # wait for all tasks to finish 129 | p.close() 130 | 131 | #theworks(datadir, out_dir_path) 132 | 133 | print('Done.') 134 | 135 | except Exception as e: 136 | print('\n\nUnable to read input or write out results. Check prerequisites and see exception output below.') 137 | parser.print_help() 138 | raise e 139 | 140 | if __name__ == '__main__': 141 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 142 | epilog=textwrap.dedent(__doc__)) 143 | parser.add_argument('-i', '--inputpath', 144 | type=str, 145 | help='Path to the directory with stacked array files.', 146 | default='.') 147 | parser.add_argument('-o', '--outputpath', 148 | type=str, 149 | help='Path to the output directory with merged array files.', 150 | default='.') 151 | parser.add_argument('-k', '--kunnat', 152 | type=str, 153 | help='Path to the file with kunnat, tilat.') 154 | parser.add_argument('-c', '--ncores', 155 | type=int, 156 | help='Number of cores to use.', 157 | default = 1) 158 | 159 | args = parser.parse_args() 160 | main(args) 161 | 162 | 163 | -------------------------------------------------------------------------------- /python/09-runRF-article-iterate.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2021-11-30 RF / iterable 2.1.2022 3 | 4 | RUN: 5 | 6 | Without testing set (makes train/validation split automatically): 7 | python 09-runRF-article-iterate.py -i dataStack/array_1110-2020.npz 8 | 9 | With testing set (kunta or separate year): 10 | python 09-runRF-article-iterate.py -i dataStack/array_1110-2018-2019.npz \ 11 | -j dataStack/array_1110-2020.npz 12 | 13 | 14 | NOTE: if you test with a separate year, be sure that training set excludes that year! 
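Each data set is trained and evaluated ntimes (10) times. Residuals are collected
for the full season ('farmfinal') and for in-season cut-offs at time-step indices
43, 73 and 104 ('farm43', 'farm73', 'farm104', roughly mid-June, mid-July and
mid-August). Per-iteration residuals are pickled, and the pooled RMSEs are appended
to predictions/<date>-iterative/iteratedRMSE.csv (created alongside the input data
directory) as rows of [setID, model, RMSE, set].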
15 | 16 | 17 | """ 18 | import glob 19 | import pandas as pd 20 | import numpy as np 21 | import os.path 22 | from pathlib import Path 23 | import argparse 24 | import textwrap 25 | import math 26 | import time 27 | import csv 28 | from scipy import stats 29 | import seaborn as sns 30 | import utils 31 | 32 | from sklearn.ensemble import RandomForestRegressor 33 | from sklearn import metrics 34 | import matplotlib.pyplot as plt 35 | 36 | # EDIT: 37 | # How many times to iterate each data set? 38 | ntimes = 10 39 | 40 | 41 | t = time.localtime() 42 | timeString = time.strftime("%Y-%m-%d", t) 43 | 44 | # FUNCTIONS: 45 | 46 | def runModel(model, Xtrain, ytrain, Xtest): 47 | model.fit(Xtrain, ytrain) 48 | test_predictions = model.predict(Xtest) 49 | return test_predictions 50 | 51 | def doRMSE(residuals): 52 | return np.sqrt(np.square(residuals).mean()) 53 | 54 | # HERE STARTS MAIN: 55 | 56 | def main(args): 57 | try: 58 | if not args.inputfile : 59 | raise Exception('Missing input filepath argument. Try --help .') 60 | 61 | print(f'\n09-runRF-article-iterate.py') 62 | print(f'\nARD data set in: {args.inputfile}') 63 | 64 | 65 | 66 | if 'median' in args.inputfile: 67 | print('Median as a sole feature') 68 | normalizer = 'median' 69 | else: 70 | # EDIT: 71 | #normalizer = "linear" # or "L1" 72 | normalizer = "L1" 73 | 74 | # read in array data: 75 | xtrain0 = utils.load_npintensities(args.inputfile) 76 | # normalize: 77 | xtrain = utils.normalise3D(xtrain0, normalizer) 78 | # read in target y: 79 | ytrain = utils.readTarget(args.inputfile) 80 | # jos ei anneta test set, niin tehdään split: 81 | if not args.testfile: 82 | print(f"\nSplitting {args.inputfile} into validation and training set:") 83 | xtrain, ytrain, xval, yval = utils.split_data(xtrain, ytrain) 84 | setID = utils.parse_xpath(args.inputfile) 85 | else: 86 | xval0 = utils.load_npintensities(args.testfile) 87 | # normalize: 88 | xval = utils.normalise3D(xval0, normalizer) 89 | yval = utils.readTarget(args.testfile) 90 | setID = utils.parse_xpath(args.testfile) 91 | 92 | # this needs 3D: 93 | m,n = xtrain.shape[:2] 94 | xtrain3d = xtrain.reshape(m,n,-1) 95 | m,n = xval.shape[:2] 96 | xval3d = xval.reshape(m,n,-1) 97 | 98 | if xval3d.shape[1] < xtrain3d.shape[1]: 99 | doysToAdd = xtrain3d.shape[1] - xval3d.shape[1] 100 | print(f"Shape of testing set differs from training set. We need to pad it with {doysToAdd} DOYs.") 101 | b = np.zeros( (xval3d.shape[0],doysToAdd,xval3d.shape[2]) ) 102 | xval3d = np.column_stack((xval3d,b)) 103 | print(f'New shape of padded xval3d is {xval3d.shape}.') 104 | 105 | if xtrain3d.shape[1] < xval3d.shape[1]: 106 | doysToAdd = xval3d.shape[1] - xtrain3d.shape[1] 107 | print(f"Shape of training set differs from testing set. 
We need to pad it with {doysToAdd} DOYs.") 108 | b = np.zeros( (xtrain3d.shape[0],doysToAdd,xtrain3d.shape[2]) ) 109 | xtrain3d = np.column_stack((xtrain3d,b)) 110 | print(f'New shape of padded xtrain3d is {xtrain3d.shape}.') 111 | 112 | # 2D: 113 | # make 2D: 114 | m = xval3d.shape[0] 115 | xval2d = xval3d.reshape(m,-1) 116 | m = xtrain3d.shape[0] 117 | xtrain2d = xtrain3d.reshape(m,-1) 118 | 119 | #pitää tehdä se in-season ennen kuin 2D: 120 | june = 43 121 | july = 73 122 | august = 104 123 | # June: 124 | xtrain3dnew = xtrain3d[:,:june,:] 125 | xval3dnew = xval3d[:,:june,:] 126 | 127 | # make 2D: 128 | m = xval3dnew.shape[0] 129 | XtestJune= xval3dnew.reshape(m,-1) 130 | m = xtrain3dnew.shape[0] 131 | XtrainJune = xtrain3dnew.reshape(m,-1) 132 | 133 | # July: 134 | xtrain3dnew = xtrain3d[:,:july,:] 135 | xval3dnew = xval3d[:,:july,:] 136 | 137 | # make 2D: 138 | m = xval3dnew.shape[0] 139 | XtestJuly = xval3dnew.reshape(m,-1) 140 | m = xtrain3dnew.shape[0] 141 | XtrainJuly = xtrain3dnew.reshape(m,-1) 142 | 143 | # August: 144 | xtrain3dnew = xtrain3d[:,:august,:] 145 | xval3dnew = xval3d[:,:august,:] 146 | 147 | # make 2D: 148 | m = xval3dnew.shape[0] 149 | XtestAugust = xval3dnew.reshape(m,-1) 150 | m = xtrain3dnew.shape[0] 151 | XtrainAugust = xtrain3dnew.reshape(m,-1) 152 | 153 | 154 | # MODEL: 155 | model = RandomForestRegressor(max_features = 8, n_jobs = -1, n_estimators = 500) 156 | 157 | if normalizer == 'median': 158 | modelname = 'RFmedian' 159 | else: 160 | if not args.testfile: 161 | modelname = 'RF' 162 | else: 163 | if 'ely' in args.testfile: 164 | modelname = 'RFely' 165 | if 'Rank' in args.testfile: 166 | modelname = 'RFrank' 167 | else: 168 | modelname = 'RFtest' 169 | 170 | df = [] 171 | 172 | # iterate predictions: 173 | for i in range(ntimes): 174 | print(f'Iteration {i+1}...') 175 | test_predictions = runModel(model, xtrain2d, ytrain, xval2d) 176 | dfResiduals = pd.DataFrame(np.subtract(test_predictions, yval)) 177 | dfResiduals.columns = ['farmfinal'] 178 | 179 | # June: 180 | test_predictions = runModel(model, XtrainJune, ytrain, XtestJune) 181 | dfResiduals['farm43'] = np.subtract(test_predictions, yval) 182 | 183 | # July: 184 | test_predictions = runModel(model, XtrainJuly, ytrain, XtestJuly) 185 | dfResiduals['farm73'] = np.subtract(test_predictions, yval) 186 | 187 | # August: 188 | test_predictions = runModel(model, XtrainAugust, ytrain, XtestAugust) 189 | dfResiduals['farm104'] = np.subtract(test_predictions, yval) 190 | 191 | df.append(dfResiduals) 192 | 193 | if not args.testfile: 194 | basepath = args.inputfile.split('/')[:-2] 195 | out_dir_results = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative') 196 | Path(out_dir_results).mkdir(parents=True, exist_ok=True) 197 | else: 198 | basepath = args.testfile.split('/')[:-2] 199 | out_dir_results = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative') 200 | Path(out_dir_results).mkdir(parents=True, exist_ok=True) 201 | 202 | t = time.localtime() 203 | timeString2 = time.strftime("%Y-%m-%d-%H:%M:%S", t) 204 | 205 | pklfile = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative', timeString2 + '-allIteratedRMSE-' + modelname + '-' + setID + '.pkl') 206 | print(f"\nWriting results to file {pklfile}.") 207 | utils.save_intensities(pklfile, df) 208 | 209 | csvfile = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative', 'iteratedRMSE.csv') 210 | print(f"\nWriting results to file {csvfile}.") 211 | 212 | for setti 
in ['farmfinal', 'farm43', 'farm73', 'farm104']: 213 | residuals = [] 214 | for i in range(ntimes): 215 | residuals.extend(df[i][setti]) 216 | rmse = doRMSE(residuals) 217 | 218 | with open(csvfile, "a+") as f: 219 | writer = csv.writer(f) 220 | writer.writerow([setID, modelname, round(rmse, 3), setti]) 221 | 222 | 223 | print(f'\nDone.') 224 | 225 | except Exception as e: 226 | print('\n\nUnable to read input or write out statistics. Check prerequisites and see exception output below.') 227 | parser.print_help() 228 | raise e 229 | 230 | 231 | if __name__ == '__main__': 232 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 233 | epilog=textwrap.dedent(__doc__)) 234 | 235 | parser.add_argument('-i', '--inputfile', 236 | help='Filepath of array intensities (training set).', 237 | type=str) 238 | parser.add_argument('-j', '--testfile', 239 | help='Filepath of the testing set (optional).', 240 | type=str) 241 | 242 | parser.add_argument('--debug', 243 | help='Verbose output for debugging.', 244 | action='store_true') 245 | 246 | args = parser.parse_args() 247 | main(args) 248 | 249 | -------------------------------------------------------------------------------- /python/09-runTCN-article-iterate.py: -------------------------------------------------------------------------------- 1 | """ 2 | 2021-09-01 MY added normalization 3 | 2022-01-02 iterable, returns mean RMSE of ntimes iterated trainings. 4 | 2022-03-05 return also all RMSEs 5 | 6 | RUN: 7 | 8 | Without testing set (makes train/validation split automatically): 9 | python 09-runTCN-article-iterate.py -i dataStack/array_1110-2020.npz \ 10 | --epochs 200 --batchsize 128 --learningrate 0.001 --epsilon 0.1 11 | 12 | With testing set (region or separate year): 13 | python 09-runTCN-article-iterate.py -i dataStack/array_1110-2018-2019.npz \ 14 | -j dataStack/array_1110-2020.npz \ 15 | --epochs 200 --batchsize 128 --learningrate 0.001 --epsilon 0.1 16 | 17 | 18 | NOTE: if you test with a separate year, be sure that training set excludes that year! 19 | 20 | 21 | """ 22 | import glob 23 | import pandas as pd 24 | import numpy as np 25 | import os.path 26 | from pathlib import Path 27 | import argparse 28 | import textwrap 29 | import math 30 | import time 31 | import csv 32 | from scipy import stats 33 | import utils 34 | 35 | #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 36 | from tensorflow.keras.models import Sequential, save_model, load_model 37 | from tensorflow.keras.layers import Dense, Dropout, SimpleRNN, LSTM 38 | from tensorflow.keras.callbacks import EarlyStopping 39 | from tensorflow.keras.utils import plot_model 40 | from tensorflow.keras.optimizers import Adam 41 | 42 | from tcn import TCN, tcn_full_summary 43 | 44 | # pip install keras-tcn --user 45 | 46 | t = time.localtime() 47 | timeString = time.strftime("%Y-%m-%d", t) 48 | 49 | # EDIT: 50 | # How many times to iterate each data set? 51 | ntimes = 10 52 | 53 | # FUNCTIONS: 54 | 55 | def doRMSE(residuals): 56 | return np.sqrt(np.square(residuals).mean()) 57 | 58 | def temporalConvolutionalNetworks(shape1, shape2): 59 | print("\nTraining TCN...") 60 | 61 | tcn_layer = TCN(input_shape=(None, shape2), nb_filters = 32, padding = 'causal', kernel_size = 2, 62 | nb_stacks=1, dilations = [1, 2, 4, 8, 16], 63 | return_sequences=True 64 | ) 65 | 66 | # The receptive field tells you how far the model can see in terms of timesteps. 
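    # With the settings above (kernel_size=2, nb_stacks=1, dilations 1..16 and
    # two dilated convolutions per residual block) the receptive field should be
    # roughly 1 + 2*(2-1)*(1+2+4+8+16) = 63 time steps; the exact value printed
    # below depends on the installed keras-tcn version.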
67 | print('Receptive field size =', tcn_layer.receptive_field) 68 | 69 | model = Sequential([ 70 | tcn_layer, 71 | Dense(1) 72 | ]) 73 | 74 | 75 | # Model summary: 76 | print('\nNetwork architecture:') 77 | print(model.summary()) 78 | #print(tcn_full_summary(model)) 79 | 80 | return model 81 | 82 | 83 | 84 | def runModel(model, modelname, Xtrain, ytrain, Xtest, ytest, outputdir, epochs, batchsize, optimizeri, lera, epsiloni, setID, normalizer): 85 | 86 | # monitor validation progress 87 | early = EarlyStopping(monitor = "val_loss", mode = "min", patience = 10) 88 | callbacks_list = [early] 89 | 90 | if optimizeri == 'adam': 91 | model.compile(loss = 'mean_squared_error', 92 | optimizer = Adam(learning_rate=lera, epsilon = epsiloni), 93 | metrics = ['mse']) 94 | df = [] 95 | 96 | # iterate training: 97 | for i in range(ntimes): 98 | print(f'Iteration {i+1}...') 99 | history = model.fit(Xtrain, ytrain, 100 | epochs=epochs, batch_size=batchsize, verbose=0, 101 | validation_split = 0.20, 102 | callbacks = callbacks_list) 103 | 104 | test_predictions = model.predict(Xtest) 105 | 106 | dfResiduals = pd.DataFrame(np.subtract(test_predictions[:, -1, 0], ytest)) 107 | dfResiduals.columns = ['farmfinal'] 108 | 109 | # in this case using doys (130-243) (43, 73, 104) with zero-padding: 110 | june = 43 111 | july = 73 112 | august = 104 113 | 114 | #June: 115 | dfResiduals['farm43'] = np.subtract(test_predictions[:, june, 0], ytest) 116 | 117 | #July: 118 | dfResiduals['farm73'] = np.subtract(test_predictions[:, july, 0], ytest) 119 | 120 | #August: 121 | dfResiduals['farm104'] = np.subtract(test_predictions[:, august, 0], ytest) 122 | 123 | df.append(dfResiduals) 124 | 125 | return df 126 | 127 | 128 | # HERE STARTS MAIN: 129 | 130 | def main(args): 131 | try: 132 | if not args.inputfile : 133 | raise Exception('Missing input filepath argument. 
Try --help .') 134 | 135 | print(f'\n09-runTCN-article-iterate.py') 136 | print(f'\nARD data set in: {args.inputfile}') 137 | 138 | if 'median' in args.inputfile: 139 | print('Median as a sole feature...') 140 | normalizer = 'median' 141 | else: 142 | # EDIT: 143 | #normalizer = "linear" # or "L1" 144 | normalizer = "L1" 145 | 146 | ############################# Preprocessing: 147 | # read in array data: 148 | xtrain0 = utils.load_npintensities(args.inputfile) 149 | # normalize: 150 | xtrain = utils.normalise3D(xtrain0, normalizer) 151 | # read in target y: 152 | ytrain = utils.readTarget(args.inputfile) 153 | # jos ei anneta test set, niin tehdään split: 154 | if not args.testfile: 155 | print(f"\nSplitting {args.inputfile} into validation and training set:") 156 | xtrain, ytrain, xval, yval = utils.split_data(xtrain, ytrain) 157 | setID = utils.parse_xpath(args.inputfile) 158 | else: 159 | xval0 = utils.load_npintensities(args.testfile) 160 | # normalize: 161 | xval = utils.normalise3D(xval0, normalizer) 162 | yval = utils.readTarget(args.testfile) 163 | setID = utils.parse_xpath(args.testfile) 164 | 165 | 166 | if not args.testfile: 167 | basepath = args.inputfile.split('/')[:-2] 168 | out_dir_results = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative') 169 | Path(out_dir_results).mkdir(parents=True, exist_ok=True) 170 | else: 171 | basepath = args.testfile.split('/')[:-2] 172 | out_dir_results = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative') 173 | Path(out_dir_results).mkdir(parents=True, exist_ok=True) 174 | 175 | 176 | # this needs 3D: 177 | m,n = xtrain.shape[:2] 178 | xtrain3d = xtrain.reshape(m,n,-1) 179 | m,n = xval.shape[:2] 180 | xval3d = xval.reshape(m,n,-1) 181 | 182 | # forget zero-padding: 183 | #if xval3d.shape[1] < xtrain3d.shape[1]: 184 | # doysToAdd = xtrain3d.shape[1] - xval3d.shape[1] 185 | # print(f"Shape of testing set differs from training set. We need to pad it with {doysToAdd} DOYs.") 186 | # b = np.zeros( (xval3d.shape[0],doysToAdd,xval3d.shape[2]) ) 187 | # xval3d = np.column_stack((xval3d,b)) 188 | # print(f'New shape of padded xval3d is {xval3d.shape}.') 189 | 190 | #if xtrain3d.shape[1] < xval3d.shape[1]: 191 | # doysToAdd = xval3d.shape[1] - xtrain3d.shape[1] 192 | # print(f"Shape of training set differs from testing set. 
We need to pad it with {doysToAdd} DOYs.") 193 | # b = np.zeros( (xtrain3d.shape[0],doysToAdd,xtrain3d.shape[2]) ) 194 | # xtrain3d = np.column_stack((xtrain3d,b)) 195 | # print(f'New shape of padded xtrain3d is {xtrain3d.shape}.') 196 | 197 | ##################################### Models: 198 | # model topology: 199 | model = temporalConvolutionalNetworks(xtrain3d.shape[1], xtrain3d.shape[2]) 200 | if normalizer == 'median': 201 | modelname = 'TCNmedian' 202 | else: 203 | if not args.testfile: 204 | modelname = 'TCN' 205 | else: 206 | modelname = 'TCNtest' 207 | 208 | df = runModel(model, modelname, xtrain3d, ytrain, xval3d, yval, out_dir_results, args.epochs, args.batchsize, args.optimizer, args.learningrate, args.epsilon, setID, normalizer) 209 | 210 | basepath = args.inputfile.split('/')[:-2] 211 | out_dir_results = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative') 212 | Path(out_dir_results).mkdir(parents=True, exist_ok=True) 213 | 214 | t = time.localtime() 215 | timeString2 = time.strftime("%Y-%m-%d-%H:%M:%S", t) 216 | 217 | pklfile = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative', timeString2 + '-allIteratedRMSE-' + modelname + '-' + setID + '.pkl') 218 | 219 | print(f"\nWriting results to file {pklfile}.") 220 | utils.save_intensities(pklfile, df) 221 | 222 | csvfile = os.path.join(os.path.sep, *basepath, 'predictions', timeString + '-iterative', 'iteratedRMSE.csv') 223 | print(f"\nWriting results to file {csvfile}.") 224 | 225 | 226 | for setti in ['farmfinal', 'farm43', 'farm73', 'farm104']: 227 | residuals = [] 228 | for i in range(ntimes): 229 | residuals.extend(df[i][setti]) 230 | rmse = doRMSE(residuals) 231 | 232 | with open(csvfile, "a+") as f: 233 | writer = csv.writer(f) 234 | writer.writerow([setID, modelname, round(rmse, 3), setti]) 235 | 236 | 237 | print(f'\nDone.') 238 | 239 | except Exception as e: 240 | print('\n\nUnable to read input or write out statistics. Check prerequisites and see exception output below.') 241 | parser.print_help() 242 | raise e 243 | 244 | 245 | if __name__ == '__main__': 246 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, 247 | epilog=textwrap.dedent(__doc__)) 248 | 249 | parser.add_argument('-i', '--inputfile', 250 | help='Filepath of array intensities (training set).', 251 | type=str) 252 | parser.add_argument('-j', '--testfile', 253 | help='Filepath of the testing set (optional).', 254 | type=str) 255 | parser.add_argument('-e', '--epochs', 256 | help='An epoch is an iteration over the entire x and y data provided (default 20).', 257 | type=int, default = 20) 258 | parser.add_argument('-b', '--batchsize', 259 | help='Number of samples per gradient update (default 32).', 260 | type=int, default = 32) 261 | parser.add_argument('-o', '--optimizer', 262 | help='Optimizer (default adam).', 263 | type=str, default = 'adam') 264 | parser.add_argument('-l', '--learningrate', 265 | help='Learning rate (defaults to 0.001).', 266 | type=float, default = '0.001') 267 | parser.add_argument('-p', '--epsilon', 268 | help='A small constant for numerical stability (defaults to 1e-07).', 269 | type=float, default = '0.0000001') 270 | parser.add_argument('--debug', 271 | help='Verbose output for debugging.', 272 | action='store_true') 273 | 274 | args = parser.parse_args() 275 | main(args) 276 | --------------------------------------------------------------------------------
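Note on dependencies: every script above imports a local `utils` module that is not included in this listing. The sketch below is a minimal reconstruction based only on how the helpers are called in these scripts: save_intensities and _load_intensities mirror the pickle helpers defined locally in 07-stack2ARD.py, while the bodies of load_npintensities, parse_xpath, readTargetID and readTarget, and the assumption that array_<set>.npz, farmID_<set>.pkl and y_<set>.pkl share a directory and suffix, are inferred from usage rather than taken from the original implementation. normalise3D and split_data are omitted because their behaviour cannot be recovered from this listing.

import os
import pickle

import numpy as np


def save_intensities(filename, arrayvalues):
    # pickle any object (farmID lists, residual dataframes, target vectors, ...)
    with open(filename, 'wb+') as outputfile:
        pickle.dump(arrayvalues, outputfile)


def _load_intensities(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)


def load_npintensities(filename):
    # arrays are written with np.savez_compressed(fp, arr), which stores the
    # single positional array under the default key 'arr_0'
    return np.load(filename)['arr_0']


def parse_xpath(filename):
    # e.g. 'dataStack/array_1400-2018-2019.npz' -> '1400-2018-2019'
    tail = os.path.splitext(os.path.basename(filename))[0]
    return tail.replace('array_', '', 1)


def readTargetID(filename):
    # assumes farmID_<set>.pkl sits next to array_<set>.npz
    head = os.path.dirname(filename)
    return _load_intensities(os.path.join(head, 'farmID_' + parse_xpath(filename) + '.pkl'))


def readTarget(filename):
    # assumes y_<set>.pkl sits next to array_<set>.npz
    head = os.path.dirname(filename)
    return _load_intensities(os.path.join(head, 'y_' + parse_xpath(filename) + '.pkl'))

With this layout, e.g. readTarget('dataStack/array_1400-2018.npz') would load dataStack/y_1400-2018.pkl, matching how 08-mergeTarget-parallel.py names its outputs.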