├── .gitattributes ├── Copernicus_data_download ├── CDSE_S3_download_with_polygon.sh ├── CDSE_S3_download_with_tile_names.sh └── README.md ├── NLS_geopackage_tests.md ├── R ├── README.md ├── R_LiDAR │ ├── README.md │ ├── R_lidar_course_exercises │ │ ├── README.md │ │ ├── R_lidar.Rproj │ │ ├── las_files.txt │ │ ├── session3.1.-rlas.Rmd │ │ ├── session3.2-lidR-LAS-class.Rmd │ │ ├── session3.3-lidR-LAScatalog-class.Rmd │ │ ├── simple_catalog_lidR_batchjob_cluster.sh │ │ ├── simple_catalog_lidR_batchjob_multicore.sh │ │ ├── simple_catalog_lidR_cluster.R │ │ ├── simple_catalog_lidR_multicore.R │ │ ├── simple_lidR.R │ │ └── simple_lidR_arrayjob.sh │ └── rlas-DEM_example │ │ ├── README.md │ │ ├── area_of_interest.dbf │ │ ├── area_of_interest.prj │ │ ├── area_of_interest.qpj │ │ ├── area_of_interest.shp │ │ ├── area_of_interest.shx │ │ ├── basic_rlas.R │ │ ├── batchjob_rlas_basics.sh │ │ └── get_lidar_files_function.R ├── STAC │ ├── Readme.md │ └── STAC_CSC_example.Rmd ├── allas │ └── working_with_allas_from_R_S3.R ├── geopackage │ ├── README.md │ └── read_gpkg.R ├── puhti │ ├── 01_serial │ │ ├── Contours_simple.R │ │ └── serial_batch_job.sh │ ├── 02_parallel_future │ │ ├── Calc_contours_future_cluster.R │ │ ├── Calc_contours_future_multicore.R │ │ ├── parallel_batch_job_future_cluster.sh │ │ └── parallel_batch_job_future_multicore.sh │ ├── 03_parallel_snow │ │ ├── Calc_contours_snow.R │ │ └── parallel_batch_job_snow.sh │ ├── 04_parallel_foreach │ │ ├── Calc_contours_foreach.R │ │ └── parallel_batch_job_foreach.sh │ ├── 05_array │ │ ├── Contours_array.R │ │ └── array_batch_job.sh │ ├── README.md │ ├── mapsheets.txt │ └── mapsheets_URLs.txt ├── raster_predict │ ├── README.md │ ├── Rplots.pdf │ ├── r_run.sh │ ├── rlogo.grd │ ├── rlogo.gri │ └── rtest.R └── virtual_rasters.R ├── README.md ├── force ├── LEVEL2_parameters.prm ├── README.md ├── file_queue.txt └── force_batch_job.sh ├── gdal ├── gdal_batch_job_parallel.sh ├── gdal_batch_job_serial.sh ├── gdal_parallel.sh ├── gdal_serial.sh └── readme.md ├── grass ├── 01_serial_cli │ ├── grass_cli.sh │ └── grass_cli_serial.sh ├── 02_python_scripting_serial │ ├── python_scripting_serial.py │ └── python_scripting_serial.sh ├── 03_pygrass_serial │ ├── pygrass_serial.py │ └── pygrass_serial.sh ├── 04_pygrass_parallel │ ├── pygrass_parallel_with_gridmodule.py │ └── pygrass_parallel_with_gridmodule.sh └── readme.md ├── machineLearning └── README.md ├── noppe └── Readme.md ├── pdal ├── 01_crop_pipeline.json ├── 01_split_laz.sh ├── 02_pipeline.json ├── 03_batch_job_gnu_parallel.sh ├── 04_batch_job_array.sh ├── 04_filelist.csv ├── 07_batch_job_python.sh ├── 07_pdal_ground.py └── README.md ├── pouta ├── README.md ├── arcpy │ ├── ArcGIS_Server_manual_installation.sh │ ├── README.md │ ├── ansible_install_arcpy.yml │ ├── ansible_preparations.md │ ├── ansible_run_arcpy.yml │ └── test_data │ │ └── my_arcpy_script.py ├── docker_geoserver_or_opendronemap │ ├── README.md │ ├── ansible.cfg │ ├── group_vars │ │ └── all.yml │ ├── install-geoserver.yml │ ├── install-odm.yml │ ├── requirements.yml │ └── roles │ │ ├── docker │ │ └── tasks │ │ │ └── main.yml │ │ ├── geoserver │ │ └── tasks │ │ │ └── main.yml │ │ ├── opendronemap │ │ └── tasks │ │ │ └── main.yml │ │ └── openstack │ │ └── tasks │ │ └── main.yml └── metashape_with_VNC │ └── readme.md ├── python ├── README.md ├── STAC │ ├── Readme.md │ ├── STAC_CSC_example.ipynb │ ├── csc_stac_example.py │ ├── csc_stac_example_batch_job.sh │ ├── environment.yml │ ├── img │ │ ├── DEM_data_source_cpu_walltime.gif │ │ ├── 
DEM_tile_size_cpu_walltime.gif │ │ ├── S1_data_source_cpu_walltime.gif │ │ └── S1_tile_size_cpu_walltime.gif │ ├── stac_xarray_dask_example.ipynb │ └── static_stac.ipynb ├── allas │ ├── working_with_allas_from_Python_S3.py │ └── working_with_allas_from_Python_Swift.py ├── dask_geopandas │ ├── README.md │ └── dask-geopandas.ipynb ├── geopackage │ ├── README.md │ ├── list_layers_info.py │ ├── make_each_layer_a_file.py │ └── read_gpkg.py ├── puhti │ ├── 00_interactive │ │ └── interactive_single_core_example.py │ ├── 01_serial │ │ ├── single_core_example.py │ │ ├── single_core_example.sh │ │ ├── single_core_example_folder.py │ │ ├── single_core_example_folder.sh │ │ └── single_core_example_list.sh │ ├── 02_gnu_parallel │ │ ├── gnu_parallel_example.py │ │ └── gnu_parallel_example.sh │ ├── 03_array │ │ ├── array_job_example.py │ │ └── array_job_example.sh │ ├── 04_parallel_multiprocessing │ │ ├── multiprocessing_example.py │ │ └── multiprocessing_example.sh │ ├── 05_parallel_joblib │ │ ├── joblib_example.py │ │ └── joblib_example.sh │ ├── 06_parallel_dask │ │ ├── multi_node │ │ │ ├── dask_multinode.py │ │ │ └── dask_multinode.sh │ │ └── single_node │ │ │ ├── dask_singlenode.py │ │ │ └── dask_singlenode.sh │ ├── README.md │ └── interactive.ipynb ├── routing │ ├── batch_igraph.sh │ ├── batch_nx.sh │ ├── data │ │ └── hanko.graphml │ ├── igraph_parallel.py │ ├── nx_parallel.py │ ├── osmnx-graphml.py │ └── readme.md ├── sentinel │ ├── README.md │ ├── helsinki.geojson │ └── sentinelsat_download_from_finhub.py ├── sentinel_without_credentials │ ├── README.md │ └── get_open_sentinel_buckets.py └── zonal_stats │ ├── README.md │ ├── raster-stats │ ├── README.md │ ├── batch_job-stac-parallel.sh │ ├── batch_job_parallel.sh │ ├── batch_job_serial.sh │ ├── zonal-stats-stac-parallel.py │ ├── zonal_stats_parallel.py │ └── zonal_stats_serial.py │ └── xarray-spatial │ ├── xarray-spatial-batch-job.sh │ └── xarray-spatial-zonal-stats.py ├── routing.md ├── snap ├── 01_simple_job │ ├── CreateStackGraph.xml │ └── snap_batch_job.sh ├── 02_array_job │ ├── resample_and_lai.xml │ └── snap_array_job.sh └── README.md └── supercomputer_installations ├── ames-stereo_3.2.0.yml ├── arcgis-python-api-2.1.0.yml └── geoconda ├── geoconda_3.10.6.yml ├── geoconda_3.10.9.yml ├── geoconda_3.11.0.yml └── whitebox_tools_postinstall ├── download_wbt └── start_wbt.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Keep linux line endings always. 2 | * text eol=lf -------------------------------------------------------------------------------- /Copernicus_data_download/CDSE_S3_download_with_polygon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ############### 4 | # Example script to query and download data from Copernicus Data Space Ecosystem WITH POLYGON. 5 | # See readme for connection set up details. 6 | # 7 | # Based on script provided by Maria Yli-Heikkilä (LUKE) - adapted to CDSE by Samantha Wittke and Kylli Ek, CSC - IT center for Science 8 | 9 | 10 | # If you suitable polygon ready, then save it as CSV with geometry in WKT 11 | # https://geojson.io/ can be used for quick creation of the input file in GeoJson format - draw a polygon on map and save the text to a .json file. 12 | # The input file could be also .shp, .gpkg or some other format supported by GDAL. 
13 | ogr2ogr -f CSV area.csv input.json -lco GEOMETRY=AS_WKT 14 | 15 | # Alternatively, if you do not have the polygon ready, but would like to calculate based on some exsisting vector file: 16 | # Note that besides changing the name of the file, you have to change also the layer name, in case of Shape file, it is the same as file name. 17 | # 18 | # ogr2ogr -f CSV area.csv /appl/data/geo/tilastokeskus/tieliikenne/2022/tieliikenne_2022.shp -dialect SQLite -sql "select st_concavehull(st_collect(geometry)) from tieliikenne_2022" -lco GEOMETRY=AS_WKT -t_srs EPSG:4326 19 | 20 | # Get the WKT polygon from file and remove quotes. 21 | # Only the first polygon of the the file is used. 22 | wkt=$(sed '2q;d' area.csv) 23 | wkt2=${wkt//\"/} 24 | 25 | # Provide the timedelta - start date and time 26 | STARTDATE=2023-05-01T00:00:00 27 | 28 | # end date and time ; fixed or set to current date 29 | #CURRENTDATE=$(date +"%Y-%m-%dT%T") 30 | #ENDDATE=$CURRENTDATE 31 | ENDDATE=2023-05-06T23:59:59 32 | 33 | CLOUDCOVER="[0,95]" 34 | 35 | # Baseurl to reach the CDSE catalog with json output 36 | BASEURL="http://catalogue.dataspace.copernicus.eu/resto/api/collections/Sentinel2/search.json?" 37 | 38 | # Query the catalog with previously defined variables, 20 is the default max record number, which you can adapt to your needs 39 | # See https://documentation.dataspace.copernicus.eu/APIs/OpenSearch.html#output-sorting-and-limiting for further options for sorting 40 | QUERY="productType=S2MSI2A&startDate=${STARTDATE}.000Z&completionDate=${ENDDATE}.000Z&cloudCover=${CLOUDCOVER}&geometry=${wkt2}&maxRecords=1" 41 | # echo $BASEURL$QUERY 42 | 43 | wget --output-document=query.json "$BASEURL$QUERY" 44 | 45 | # JSON includes much more information than only product paths -> extract product path from the JSON and safe to 46 | jq -r '.. | .productIdentifier? | select( . != null ) ' query.json | grep "/eodata/" > safe_files.txt 47 | 48 | # Read the file with product paths and download each file from CDSE 49 | while IFS="" read -r FILE || [ -n "$FILE" ] 50 | do 51 | echo $FILE 52 | # Define folder name for each .SAFE file 53 | SAFENAME="$(basename -- $FILE)" 54 | 55 | # Download to local disk 56 | rclone copy -P -v cdse:$FILE /scratch/project_2000599/cdse/$SAFENAME 57 | 58 | # OR Download to Allas 59 | #rclone copy -P -v cdse:$FILE s3allas:yourBucketName/$SAFENAME 60 | done < safe_files.txt 61 | 62 | # Delete temporary files 63 | rm area.csv 64 | rm query.json 65 | rm safe_files.txt -------------------------------------------------------------------------------- /Copernicus_data_download/CDSE_S3_download_with_tile_names.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ############### 4 | # Example script to query and download data from Copernicus Data Space Ecosystem WITH SENTINEL-2 TILE NAMES. 5 | # See readme for connection set up details. 
6 | # 7 | # Based on script provided by Maria Yli-Heikkilä (LUKE) - adapted to CDSE by Samantha Wittke and Kylli Ek, CSC - IT center for Science 8 | 9 | # Provide Sentinel-2 tilenames that you want to download 10 | TILES=("T34VDM" "T34VEM" "T34VEN") 11 | 12 | # Provide the timedelta - start date and time 13 | STARTDATE=2023-05-01T00:00:00 14 | 15 | # end date and time ; fixed or set to current date 16 | #CURRENTDATE=$(date +"%Y-%m-%dT%T") 17 | #ENDDATE=$CURRENTDATE 18 | ENDDATE=2023-05-06T23:59:59 19 | 20 | CLOUDCOVER="[0,95]" 21 | 22 | # Baseurl to reach the CDSE catalog with json output 23 | BASEURL="http://catalogue.dataspace.copernicus.eu/resto/api/collections/Sentinel2/search.json?" 24 | 25 | YEAR=${ENDDATE:0:4} 26 | 27 | for TILE in ${TILES[@]} 28 | 29 | do 30 | # Query the catalog with previously defined variables, 20 is the default max record number, which you can adapt to your needs 31 | # See https://documentation.dataspace.copernicus.eu/APIs/OpenSearch.html#output-sorting-and-limiting for further options for sorting 32 | QUERY="productType=S2MSI2A&startDate=${STARTDATE}.000Z&completionDate=${ENDDATE}.000Z&cloudCover=${CLOUDCOVER}&productIdentifier=${TILE}&maxRecords=20" 33 | # echo $BASEURL$QUERY 34 | wget --output-document=query_${YEAR}_${TILE}.json $BASEURL$QUERY 35 | 36 | # JSON includes much more information than only product paths -> extract product path from the JSON and safe to 37 | jq -r '.. | .productIdentifier? | select( . != null ) ' query_${YEAR}_${TILE}.json | grep "/eodata/" > name_${YEAR}_${TILE}.txt 38 | 39 | # Read the file with product paths and download each file from CDSE to Allas bucket defined above 40 | while IFS="" read -r FILE || [ -n "$FILE" ] 41 | do 42 | echo $FILE 43 | # Define folder name for each .SAFE file 44 | SAFENAME="$(basename -- $FILE)" 45 | 46 | # Download to local disk 47 | rclone copy -P -v cdse:$FILE /scratch/project_2000599/cdse/$SAFENAME 48 | 49 | # OR Download to Allas 50 | #rclone copy -P -v cdse:$FILE s3allas:yourBucketName/$SAFENAME 51 | done < name_${YEAR}_${TILE}.txt 52 | 53 | # Delete temporary files 54 | rm query_${YEAR}_${TILE}.json 55 | rm name_${YEAR}_${TILE}.txt 56 | done 57 | 58 | -------------------------------------------------------------------------------- /Copernicus_data_download/README.md: -------------------------------------------------------------------------------- 1 | # Downloading data from Copernicus Data Space Ecosystem 2 | 3 | The [Copernicus Data Space Ecosystem](https://dataspace.copernicus.eu/) (CDSE) provides multiple ways of querying and downloading data. Check out the [CSC Earth Observation guide](https://docs.csc.fi/support/tutorials/gis/eo_guide) for further information about the CDSE. 4 | 5 | ## CDSE S3 download with rclone 6 | 7 | These examples show how find and to copy data from CDSE S3 object storage using `rclone`: 8 | 9 | 1. Query CDSE Sentinel-2 catalog based on startdate, enddate, cloudcover using [openSearch API](https://documentation.dataspace.copernicus.eu/APIs/OpenSearch.html) 10 | 2. Download the found data from CDSE object storage via `s3` using `rclone`. Data can be downloaded to local disk or directly to CSC's object storage Allas. 11 | 12 | The scripts are otherwise very similar, but the area of interest is defined in 2 different ways: 13 | 14 | * Area is defined by Sentinel-2 tile names: [CDSE_S3_download_with_tile_names.sh](CDSE_S3_download_with_tile_names.sh) 15 | * Area is defined by polygon in a file: [CDSE_S3_download_with_polygon.sh](CDSE_S3_download_with_polygon.sh). 
The polygon many also be calculated as convex hull of a vector layer. 16 | 17 | 18 | To run the script, first **connection details** to must be set up. 19 | 20 | 1. [Get secret and access keys for CDSE](https://documentation.dataspace.copernicus.eu/APIs/S3.html#generate-secrets). 21 | 2. Configure rclone to use CDSE. 22 | * [General rclone configuration instructions](https://rclone.org/docs/#configure) 23 | * For most settings CDSE can use the same as Allas, see [Allas rclone settings](https://docs.csc.fi/data/Allas/using_allas/rclone_local/#configuring-s3-connection-in-windows) 24 | * CDSE endpoint: `eodata.dataspace.copernicus.eu`. 25 | * Name of the remote: `cdse` 26 | 3. If you want to download files to your local disk, you are ready to go. 27 | 4. If you want to copy files to another object storage, for example CSC Allas, then set up `rclone` connection details also for the second service. Follow [Allas: Copying files directly between object storages](https://docs.csc.fi/data/Allas/accessing_allas/#copying-files-directly-between-object-storages) instructions. In this example `s3allas` is the name of the second remote connection. 28 | 29 | The script should work on any Linux/Mac machine that has `rclone` installed. In CSC supercomputers, `rclone` is included in the `allas`-module. The search with polygon versioni requires `GDAL`, which is available in [several modules in Puhti](https://docs.csc.fi/apps/gdal/), for example `geoconda`. 30 | 31 | ## Direct use of S3 data with GDAL, Python and R 32 | CDSE S3 data can also be used directly with [GDAL-based tools, inc Python, R and QGIS](https://docs.csc.fi/support/tutorials/gis/gdal_cloud/#vsis3-reading-and-writing-files-fromto-s3-services). 33 | -------------------------------------------------------------------------------- /NLS_geopackage_tests.md: -------------------------------------------------------------------------------- 1 | ## NLS Geopackage tests 2 | 3 | Main findings: 4 | * Tested using with geopandas, R sf library and ogr2ogr. Geopandas and ogr2ogr both have some good advantages, sf not so much. 5 | * Memory usage: Ogr2ogr is clearly more memory efficient as it seems that it's memory consumption is more or less constant regardless of input size. It probably handles the files in some sort of pieces. Geopanadas and sf on the other hand have to read the whole file into memory at once, but sf memory consumption is about 2x that of geopandas. Ogr2ogr 0.5GB, geopandas up to 15GB for single layer, sf up to 39 GB single layer. 6 | * Computation time: Extracting the largest layers takes 21 and 24 mins with sf and geopandas (only read to memory), 49 mnins for ogr2ogr (includes saving to another file). Interestingly there wasn't significant time performance differences in smaller layers. 7 | * Extracting a small area efficiently is possible. At least geopandas can make use of spatial indexes built into the geopackage format and read a small area defined by a bounding box fast even from largest layers. For the other two this should also be possible as there's an option to use SQL queries, but I couldn't make them perform any faster than reading the whole layer. 8 | * Buffering & similar analysis using SQL queries is possible with ogr2ogr. With sf and geopandas you'll have to use different libraries after reading the file into a dataframe. 9 | * Geopackage standard doesn't enforce a specific name for the geometry column, you can see it with ogrinfo. 10 | * Example code for [R](R/geopackage) and [Python](python/geopackage). 
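As a minimal sketch of the partial reads discussed above (assuming the NLS geopackage path and layer name used in the [R](R/geopackage) example script), `sf` can pull a small row range out of a large layer with an SQL query instead of reading the whole file:

```r
library(sf)

# Path and layer as in the R/geopackage example - adjust to the file you are testing
gpkg  <- "/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-suo_20-02-06.gpkg"
layer <- "suo"

# Read only rows 10-19 instead of the whole multi-gigabyte layer
sql <- sprintf("select * from %s where rowid >= 10 and rowid < 20", layer)
df  <- read_sf(gpkg, query = sql)
print(df)
```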
11 | 12 | ### Testing results 13 | 14 | Geopandas (read) 15 | 16 | Layer |Time |Max rss| 17 | --------------| ----- | ----- | 18 | Hylky |1s | | 19 | Kallioalue (4,5GB)|10:31|6,6GB| 20 | Suo 14GB|24:09 |15GB| 21 | Suo small bbox (7 features)*|0:03|0.8GB| 22 | 23 | *~`geopandas.read_file(file, layer, bbox=)` Takes features that are at least partially within bbox. 24 | 25 | R sf (read) 26 | 27 | Layer |Time |Max rss| 28 | -------------- | ----- | ----- | 29 | Hylky |2s | | 30 | Kallioalue (4,5GB)|8:03|12GB| 31 | Suo 14GB|21:26 |39GB| 32 | 33 | ogr2ogr (read & write) 34 | `ogr2ogr kallio.gpkg MTK-geopackage-test-18-06-07.gpkg -sql "select * from kallioalueet"` 35 | 36 | Layer |Time |Max rss| 37 | -------------- | ----- | ----- | 38 | Kallioalue (4,5GB)|11:09|0.38GB| 39 | Suo 14GB|49:09 |0.33GB| 40 | -------------------------------------------------------------------------------- /R/README.md: -------------------------------------------------------------------------------- 1 | Examples for doing spatial analysis with **R** in CSC computing environment: 2 | 3 | * [puhti](puhti) - examples for different job types in Puhti: simple serial, array and parallel: `snow`, `parallel` and `future`. 4 | * [Working with Allas data from R](allas) 5 | * [Reading NLS topographic database geopackage with R](geopackage) 6 | * [R for LiDAR data](R_LiDAR) - examples and exercises 7 | * Some R packages have in-built support for parallization, for example `raster`, `terra` and `lidR`. 8 | * [raster](raster_predict) - includes also the batch job (parallel) file for Puhti. 9 | * [lidR](R_LiDAR/R_lidar_course_exercises) 10 | * `terra` - just follow [terra manual](https://cran.r-project.org/web/packages/terra/terra.pdf), see for example predict example. For batch job file see `raster` package example. 11 | 12 | References for CSC's R spatial tools: 13 | * [Puhti's R for GIS documentation](https://docs.csc.fi/apps/r-env-for-gis/), at the end of page are several links to good learning materials about R for spatial data analysis. 14 | * [Puhti's R documentation](https://docs.csc.fi/apps/r-env-singularity/) 15 | -------------------------------------------------------------------------------- /R/R_LiDAR/README.md: -------------------------------------------------------------------------------- 1 | Examples using R for LiDAR datasets management: 2 | * [loading Puhti's LiDAR datasets with `rlas`](rlas-DEM_example) - load Puhti's LiDAR datasets for an area of interest and do basic reading and merging operations with the `rlas` package 3 | * [R LiDAR Course exercises](R_lidar_course_exercises) - using `rlas` and `lidR` in Puhti supercluster 4 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/README.md: -------------------------------------------------------------------------------- 1 | ## R lidar exercise CSC's R lidar course 2 | In these exercises the most basic functions of the `rlas` and `lidR` R packages by Jean-Romain are demonstrated. See the documentation for these packages from [rlas](https://cran.r-project.org/web/packages/rlas/index.html) and [lidR](https://github.com/Jean-Romain/lidR/wiki). 3 | 4 | The materials in this repository are based mostly on the above mentioned documentation with some edits and new parts to adapt them to using these libraries in CSC's Puhti supercluster. 
5 | 6 | ## Course related info 7 | The original 2019 CSC's course page: [Lidar data analysis in Taito, with PDAL and R](https://www.csc.fi/web/training/-/lidar-data-analysis-in-taito-with-pdal-and-r) The same material works in Puhti supercomputer as well. 8 | 9 | The data for these excerises is basic NLS lidar data. In Puhti the lidar data can be found from /appl/data/geo/mml/laserkeilaus/2008_latest/ 10 | 11 | The original NLS lidar files might not workd with `lidR`, because of scale errors. For fixing this, use e.g. las2las 12 | 13 | fix one file with: 14 | ``` 15 | las2las -i /scratch//mml/laserkeilaus/2008_17/2017/T522/1/T5224F1.laz -rescale 0.01 0.01 0.01 -auto_reoffset -o ~/outfolder/T5224F1.laz 16 | ``` 17 | 18 | all with (in same directory): 19 | ``` 20 | las2las -i ~/original_las_dir/*.laz -rescale 0.01 0.01 0.01 -auto_reoffset -olaz -odir ~/outdir/ 21 | ``` 22 | 23 | The most up-to-date version of the exercises is in this repository. Download the contents of this repository as a zip file to your project's folders or your home folder in Puhti and unzip it with `unzip R_lidar_2019.zip`. Then connect to Puhti using NoMachine and start RStudio. Open the `R_lidar.Rproj` project from the `r_exercise` folder you just unziped. 24 | 25 | Note that all the necessary software packages are already installed in the Puhti supercluster and thus their installation is not covered in these exercises. To see a description of the installed R spatial packages ready installed see: https://docs.csc.fi/apps/r-env/ 26 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/R_lidar.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/las_files.txt: -------------------------------------------------------------------------------- 1 | /appl/data/geo/mml/laserkeilaus/2008_latest/2019/U442/1/U4422H1.laz 2 | /appl/data/geo/mml/laserkeilaus/2008_latest/2019/U442/1/U4422H2.laz -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/session3.1.-rlas.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The lidR package - LAS formal class" 3 | author: "Eduardo González" 4 | date: "23. February 2019" 5 | output: 6 | html_document: 7 | df_print: paged 8 | theme: flatly 9 | --- 10 | 11 | This materials have been adapted from the official [lidR wiki documentation](https://github.com/Jean-Romain/lidR/wiki) and [A Brief Introduction of lidR](http://xzsunbest.tk/2018/07/30/ABriefIntroductionOfLidR/). 12 | 13 | 14 | # The `rlas` package 15 | 16 | The `rlas` package relies on a modified version of `LASlib` and `LASzip` libraries (by Martin Isenburg) that were modified to be compatible with `R`. See the official documentation of [the `rlas` package](https://cran.r-project.org/web/packages/rlas/index.html) for more information. 17 | 18 | The main use of the `rlas` package is to read and write `.las` and `.laz` binary files used to store LiDAR data. LAS version 1.0 to 1.4 are supported. Point data record format 0,1,2,3,6,7,8 are supported. 
19 | 20 | The tools offer some basic functionality to directly read and manipulate LAS data. 21 | 22 | ```{r} 23 | library(rlas) 24 | file_name <- "/appl/data/geo/mml/laserkeilaus/2008_latest/2019/L331/1/L3313F3.laz" 25 | lasdata <- read.las(file_name) 26 | lasheader <- read.lasheader(file_name) 27 | ``` 28 | 29 | 30 | ## Basic structure of an rlas data object 31 | 32 | You can verify that the an `rlas` object is simply a data table with the data for each lidar point as a row: 33 | ```{r } 34 | class(lasdata) 35 | names(lasdata) 36 | ``` 37 | 38 | See a subset of the lidar data table: 39 | ```{r} 40 | print(lasdata[c(1:10),]) 41 | ``` 42 | 43 | The original laz file size is 190Mb, but its size when loaded in memory to R is 3.3Gb. To see the size of the data table in memory: 44 | ```{r} 45 | size <- object.size(lasdata) 46 | print(size, units = "auto") 47 | ``` 48 | 49 | You may apply filters to columns and what points to load based on their attributes when loading LAS files: 50 | ```{r} 51 | filtered_las <- read.las(file_name, select = "ia", filter = "-keep_first -drop_intensity_below 95") 52 | size <- object.size(filtered_las) 53 | print(size, units = "auto") 54 | ``` 55 | 56 | The example above loads the coordinate columns x, y and z which are always loaded plus the intensity (i) column and the san angle (a) column. Then only the rows representing a first return point are loaded. Check the documentation for this function with `?read.las` or from the [rlas reference manual](https://cran.r-project.org/web/packages/rlas/rlas.pdf) to see all the available values for selection and filtering. Note that the filter values are the same as those in LAStools and can be checked with: 57 | ```{r} 58 | rlas:::lasfilterusage() 59 | ``` 60 | 61 | To write the fildered LAS data as a las or compressed laz file you ned to create the header first and then use `write.las()` function A las or laz file is created following the extension you inditate in the command: 62 | ```{r} 63 | filtered_header <- header_create(filtered_las) 64 | write.las("./outputs/out_las.las", filtered_header, filtered_las) 65 | write.las("./outputs/out_las.laz", filtered_header, filtered_las) 66 | ``` 67 | 68 | It is recommendable to also create an index for LAS file, you can do that with the `writelax()` function. 69 | ```{r} 70 | writelax("./outputs/out_las.laz") 71 | ``` 72 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_catalog_lidR_batchjob_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # SBATCH --account 3 | #SBATCH --account project_2001659 4 | #Name of the job, this makes it easier to identify your job 5 | #output_%j.txt - Everything that would normally be printed into to the terminal when you run a program gets printed to this file. The %j refers to job number so that you don't overwrite the same file for each job 6 | #SBATCH --output=batch_output_%j.txt # File to write the standard output to. 7 | #error_%j.txt - As above but for error messages. It's however always not so clear what messages go to errors and what to output so it's always best to check both. 8 | #SBATCH --error=batch_error_%j.txt # File to write the standard error to. 9 | #Partition you want to submit your job to. 10 | #SBATCH --partition=test # Which queue to use. 
Defines maximum time, memory, tasks, nodes and local storage for job 11 | #Time limit for the job in hh:mm:ss, Once this amount of time has passed the job will be terminated regardless of weather it has finished. 12 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 13 | #Reserve 5 cores: 1 for master and 4 for workers 14 | #Compared to multicore version, use --ntasks setting, not --cpus-per-task 15 | #SBATCH --ntasks=5 # Number of tasks. Upper limit depends on partition. 16 | #Reserve 10000MB (10GB) of memory per node 17 | #SBATCH --mem=10000 # Real memory required per node. 18 | 19 | module load r-env-singularity 20 | 21 | # If you have installed packages this helps resolve problems related to those 22 | if test -f ~/.Renviron; then 23 | sed -i '/TMPDIR/d' ~/.Renviron 24 | sed -i '/OMP_NUM_THREADS/d' ~/.Renviron 25 | fi 26 | 27 | # Specify a temp folder path 28 | echo "TMPDIR=/scratch/" >> ~/.Renviron 29 | 30 | # Remove and creates new output folder 31 | rm -rf batch_output 32 | mkdir batch_output 33 | # Use RMPISNOW instead of Rscript 34 | srun singularity_wrapper exec RMPISNOW --no-save --slave -f simple_catalog_lidR_cluster.R 35 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_catalog_lidR_batchjob_multicore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # SBATCH --account 3 | #SBATCH --account project_2001659 4 | #Name of the job, this makes it easier to identify your job 5 | #output_%j.txt - Everything that would normally be printed into to the terminal when you run a program gets printed to this file. The %j refers to job number so that you don't overwrite the same file for each job 6 | #SBATCH --output=batch_output_%j.txt # File to write the standard output to. 7 | #error_%j.txt - As above but for error messages. It's however always not so clear what messages go to errors and what to output so it's always best to check both. 8 | #SBATCH --error=batch_error_%j.txt # File to write the standard error to. 9 | #Partition you want to submit your job to. 10 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | #Time limit for the job in hh:mm:ss, Once this amount of time has passed the job will be terminated regardless of weather it has finished. 12 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 13 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 14 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 15 | #The number of cpus used by the lidR task, equal to number of workers. Max 40 in Puhti. 16 | #SBATCH --cpus-per-task=5 # How many processors work on one task. Upper limit depends on number of CPUs per node. 17 | 18 | #Tells the batch job sytem to reserve 8000MB (8GB) of memory for core 19 | #SBATCH --mem-per-cpu=8000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 
20 | 21 | module load r-env-singularity 22 | 23 | # If you have installed packages this helps resolve problems related to those 24 | if test -f ~/.Renviron; then 25 | sed -i '/TMPDIR/d' ~/.Renviron 26 | sed -i '/OMP_NUM_THREADS/d' ~/.Renviron 27 | fi 28 | 29 | # Specify a temp folder path 30 | echo "TMPDIR=/scratch/" >> ~/.Renviron 31 | 32 | # Match thread number with set_lidr_threads(n) setting in the R-script 33 | # echo "OMP_NUM_THREADS=2" >> ~/.Renviron 34 | 35 | # Remove and creates new output folder 36 | rm -rf batch_output_multicore 37 | mkdir batch_output_multicore 38 | srun singularity_wrapper exec Rscript --no-save simple_catalog_lidR_multicore.R 39 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_catalog_lidR_cluster.R: -------------------------------------------------------------------------------- 1 | # lidR supports parallel computation of project/catalog. 2 | # https://rdrr.io/cran/lidR/man/lidR-parallelism.html 3 | # Parallelization is based on chunks. By default 1 chunk = 1 file. 4 | # It is possible to define also chunks smaller than the file. 5 | # This increases the complexity of calculation and calculation time a little bit. 6 | # But reduces the required memory significantly. 7 | # The number of workers should be equal or smaller than the number of chunks. 8 | # In this example 4 workers are used. 9 | # All lidR functions support using workers. 10 | 11 | library(future) 12 | library("lidR") 13 | #Enabling this will print out a little bit more info about the parallelization plan used. 14 | options(lidR.verbose = TRUE) 15 | options(future.availableCores.methods = "Slurm") 16 | 17 | # With plan(cluster) the number of workers is based on batch job reservation details: ntasks or ntasks-per-node. 18 | cl<-getMPIcluster() 19 | fcn <- plan(cluster, workers = cl) 20 | 21 | # In Puhti R snow clusters do not seem to support OpenMP parallelization, so do not use this option in Puhti. 22 | # set_lidr_threads(2) 23 | 24 | # load catalog 25 | project <- readLAScatalog("/appl/data/geo/mml/laserkeilaus/2008_latest/2019/U442/1/") 26 | 27 | # lascheck(ctg_subset) 28 | 29 | # output file naming options 30 | opt_output_files(project) <- "batch_output/dtm_ctg_{XLEFT}_{YBOTTOM}_{ID}" 31 | 32 | # NLS lidar files cover 3 x 3 km, so here 1500 x 1500 m chunk size is used -> 4 chunks per file -> ~16 chunks 33 | # (with some overlapping it will be 20 chunks in practice). 34 | opt_chunk_size(project) <- 1500 35 | 36 | # summary(ctg_subset) 37 | 38 | # Calculate DTM for the catalog, note that the files are written by the catalog itself 39 | # https://rdrr.io/cran/lidR/man/catalog_apply.html 40 | output <- catalog_sapply(project, grid_terrain, algorithm = tin()) 41 | 42 | #Last, stop the snow cluster 43 | stopCluster(cl) 44 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_catalog_lidR_multicore.R: -------------------------------------------------------------------------------- 1 | # lidR supports parallel computation of project/catalog. 2 | # https://rdrr.io/cran/lidR/man/lidR-parallelism.html 3 | # Parallelization is based on chunks. By default 1 chunk = 1 file. 4 | # It is possible to define also chunks smaller than the file. 5 | # This increases the complexity of calculation and calculation time a little bit. 6 | # But reduces the required memory significantly. 7 | # The number of workers should be equal or smaller than the number of chunks. 
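# For a concrete example (worked out from the chunk settings used below): one 3 x 3 km
# NLS tile split into 1.5 x 1.5 km chunks gives (3000/1500)^2 = 4 chunks per file, so a
# catalog of several files provides plenty of chunks for a handful of workers.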
8 | # In this example 4 workers are used. 9 | # All lidR functions support using workers. 10 | 11 | library(future) 12 | library("lidR") 13 | #Enabling this will print out a little bit more info about the parallelization plan used. 14 | options(lidR.verbose = TRUE) 15 | options(future.availableCores.methods = "Slurm") 16 | 17 | # With plan(multicore) the number of workers is based on batch job reservation details. 18 | plan("multicore") 19 | 20 | # In Puhti R OpenMP parallelization does not seem to work with lidR, so do not use this option in Puhti. 21 | # set_lidr_threads(2) 22 | 23 | # load catalog 24 | project <- readLAScatalog("/appl/data/geo/mml/laserkeilaus/2008_latest/2019/U442/1/") 25 | 26 | # lascheck(ctg_subset) 27 | 28 | # output file naming options 29 | opt_output_files(project) <- "batch_output_multicore/dtm_ctg_{XLEFT}_{YBOTTOM}_{ID}" 30 | 31 | # NLS lidar files cover 3 x 3 km, so here 1500 x 1500 m chunk size is used -> 4 chunks per file. 32 | opt_chunk_size(project) <- 1500 33 | 34 | # summary(ctg_subset) 35 | 36 | # Calculate DTM for the catalog, note that the files are written by the catalog itself 37 | # https://rdrr.io/cran/lidR/man/catalog_apply.html 38 | output <- catalog_sapply(project, grid_terrain, algorithm = tin()) 39 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_lidR.R: -------------------------------------------------------------------------------- 1 | #Read the command line argument, which is the path to a las file. 2 | args = commandArgs(trailingOnly=TRUE) 3 | #Gets las_name from argument 4 | if (length(args)==0) { 5 | stop("Please give a las file name.", call.=FALSE) 6 | } else if (length(args)==1) { 7 | # input file 8 | las_name <- args[1] 9 | } 10 | 11 | 12 | library("lidR") 13 | print(las_name) 14 | 15 | # name with tif extension 16 | out_name <- paste0("dtm_", tools::file_path_sans_ext(basename(las_name)),".tif") 17 | 18 | # Open las file 19 | las <- readLAS(las_name) 20 | print(las) 21 | # Calculate DTM and save to disk 22 | dtm <- grid_terrain(las, algorithm = tin()) 23 | writeRaster(dtm, paste0("array_output/", out_name), format="GTiff", overwrite=TRUE) 24 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_lidR_arrayjob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #Name of the job, this makes it easier to identify your job 4 | #output_%j.txt - Everything that would normally be printed into to the terminal when you run a program gets printed to this file. The %j refers to job number so that you don't overwrite the same file for each job 5 | #SBATCH --output=array_output/array_output_%j.txt # File to write the standard output to. 6 | #error_%j.txt - As above but for error messages. It's however always not so clear what messages go to errors and what to output so it's always best to check both. 7 | #SBATCH --error=array_output/array_error_%j.txt # File to write the standard error to. 8 | #Partition you want to submit your job to. Possible values are serial, parallel, longrun, hugemem and test. In this excerecise we use test as it is for testing, but it shouldn't be used for serious work. See for details. 9 | #SBATCH --partition=small # Which queue to use. 
Defines maximum time, memory, tasks, nodes and local storage for job 10 | #Time limit for the job in hh:mm:ss, Once this amount of time has passed the job will be terminated regardless of weather it has finished. 11 | #SBATCH --time=00:15:00 # Maximum duration of the job. Upper limit depends on partition. 12 | #--array - Tells the batch job system that this is an array job that should be run 3 times. It creates a variable named $SLURM_ARRAY_TASK_ID which will get a different value ranging from 1 to 3 for each task. 13 | #SBATCH --array=1-6 # Indices to specify what array index values should be used. Multiple values may be specified using a comma separated list or a range of values separated by -. 14 | #Tells the batch job system that this is not a parallel task and only one task should be used. Note that this is one task per job, but array job will actually launch 3 simultaneous jobs. 15 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 16 | #Tells the batch job sytem to reserve 1000MB (1GB) of memory for each of the 3 jobs. 17 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 18 | 19 | #As the job is not run on the login node where we submit the job from, it's necessary to load necessary modules in the batch job script. Loading the modules on the login node before sending the batch job will not help. 20 | module load r-env-singularity 21 | 22 | # If you have installed packages this helps resolve problems related to those 23 | if test -f ~/.Renviron; then 24 | sed -i '/TMPDIR/d' ~/.Renviron 25 | fi 26 | 27 | #Read the file to be processed from a list of input files. This is done by getting the line corresponding to the $SLURM_ARRAY_TASK_ID from the input file list. 28 | input=$(sed -n "$SLURM_ARRAY_TASK_ID"p las_files.txt) 29 | 30 | srun singularity_wrapper exec Rscript --no-save simple_lidR.R $input 31 | -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/README.md: -------------------------------------------------------------------------------- 1 | # Basic Lidar data management in Puhti 2 | 3 | For convinience, the full open Lidar datasets from National Land Survey of Finland (Maanmittauslaitos) have been made available in Puhti. This allows researchers to directly use these datasets in their work in Puhti without the need to download particular files. See more information about the LiDAR and other datasets available in Puhti from the [GIS Data in Puhti](https://research.csc.fi/gis_data_in_csc_computing_env) page. 4 | 5 | The following are some simple R scripts demonstrating how the LiDAR dataset can be queried in your R scripts. 
6 | 7 | - [Get LiDAR files intersecting a given polygon](get_lidar_files_function.R) 8 | - [Basic LiDAR Puhti files management with `rlas` package](basic_rlas.R) 9 | - [Example batch job to run basic_rlas.R script](batchjob_rlas_basics.sh) 10 | 11 | See more LiDAR tools and examples in the [R LiDAR Course exercises](../R_lidar_course_exercises) 12 | -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/area_of_interest.dbf: -------------------------------------------------------------------------------- 1 | vA idN 2 | 1 -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/area_of_interest.prj: -------------------------------------------------------------------------------- 1 | PROJCS["ETRS89_TM35FIN_E_N",GEOGCS["GCS_ETRS_1989",DATUM["D_ETRS_1989",SPHEROID["GRS_1980",6378137,298.257222101]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",27],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["Meter",1]] -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/area_of_interest.qpj: -------------------------------------------------------------------------------- 1 | PROJCS["ETRS89 / TM35FIN(E,N)",GEOGCS["ETRS89",DATUM["European_Terrestrial_Reference_System_1989",SPHEROID["GRS 1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],TOWGS84[0,0,0,0,0,0,0],AUTHORITY["EPSG","6258"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4258"]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",27],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["EPSG","3067"]] 2 | -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/area_of_interest.shp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/R/R_LiDAR/rlas-DEM_example/area_of_interest.shp -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/area_of_interest.shx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/R/R_LiDAR/rlas-DEM_example/area_of_interest.shx -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/basic_rlas.R: -------------------------------------------------------------------------------- 1 | ## Use get_lidar_files_function to get LiDAR files for a area of interest 2 | ## and rlas to read and merge the LiDAR data 3 | 4 | library(rlas) 5 | library(foreach) 6 | source("get_lidar_files_function.R") 7 | 8 | # Get lidar file names 9 | lidar_files <- lidar_files_puhti("area_of_interest.shp") 10 | print(lidar_files) 11 | 12 | # Get basic information from the LiDAR files 13 | headers <- vector("list") 14 | for (file in lidar_files){ 15 | headers[[file]] <- read.lasheader(file) 16 | } 17 | names(headers) 18 | 19 | # Preview the LiDAR data for the first 
file in the list 20 | lidar_data <- read.las(lidar_files[1]) # this is a data.frame 21 | tail(lidar_data) 22 | summary(lidar_data) 23 | 24 | # Merge the LiDAR files into a single R data frame 25 | i <- 1:length(lidar_files) 26 | merged_lidar_data <- foreach(i, .combine='rbind') %do% { 27 | read.las(lidar_files[i])} 28 | -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/batchjob_rlas_basics.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=out_R.txt # File to write the standard output to. 4 | #SBATCH --error=err_R.txt # File to write the standard error to. 5 | #SBATCH --time=00:15:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --mem-per-cpu=4000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 7 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 9 | 10 | module load r-env-singularity 11 | 12 | # If you have installed packages this helps resolve problems related to those 13 | if test -f ~/.Renviron; then 14 | sed -i '/TMPDIR/d' ~/.Renviron 15 | fi 16 | 17 | 18 | srun singularity_wrapper exec Rscript --no-save --slave basic_rlas.R 19 | -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/get_lidar_files_function.R: -------------------------------------------------------------------------------- 1 | # Function to get LiDAR files from Puhti's GIS datasets for a 2 | # given area of interest 3 | 4 | # Make sure that you have Puhti's R spatial environment loaded by using module load r-env or by using interactively 5 | # GIS RStudio in NoMachine. 
6 | 7 | lidar_files_puhti <- function(f_poly){ 8 | 9 | # f_poly - the path to a polygon layer with polygons covering an area from 10 | # you want LiDAR files collected 11 | 12 | library(raster) 13 | 14 | # The LiDAR dataset in Puhti is located at /appl/data/geo/mml/laserkeilaus/2008_latest/2008_latest.shp 15 | # You should use the lidar_auto_all.shp index file to spatially look for LiDAR files 16 | f_lidar_index <- "/appl/data/geo/mml/laserkeilaus/2008_latest/2008_latest.shp" 17 | lidar_index <<- shapefile(f_lidar_index) 18 | 19 | # Polygon to get lidar datasets from an area 20 | f_poly <<- "area_of_interest.shp" 21 | poly <- shapefile(f_poly) 22 | 23 | # What lidar index tiles intersect with our polygon 24 | inters <- intersect(lidar_index, poly) 25 | if (is.null(inters)) { 26 | print("ERROR: LiDAR tiles do not cover the study area!!!!") 27 | next 28 | } 29 | 30 | # From the index file, use the "path" attribute to get the file names of 31 | # the LiDAR tiles intersecting your area 32 | files <- vector() 33 | for (i in 1:length(inters@polygons)){ 34 | file <- paste0("/appl/data/geo/",inters$path[i]) 35 | files <- c(files, file) 36 | } 37 | print(files) 38 | return(files) 39 | } 40 | 41 | # Test the function with the example area_of_interest.shp file 42 | # lidar_files <- lidar_files_puhti("./area_of_interest.shp") 43 | # print(lidar_files) 44 | -------------------------------------------------------------------------------- /R/STAC/Readme.md: -------------------------------------------------------------------------------- 1 | # STAC R examples 2 | 3 | The [STAC](https://stacspec.org/en/) is a specification to describe geospatial information, so it can easily **searched and downloaded**. 4 | STAC includes metadata of datasets and links to actual files, the data files are usually stored in the cloud. See [Paituli STAC page](https://paituli.csc.fi/stac.html) for general introduction about STAC and what Collections (=datasets) are included in Paituli STAC. 5 | 6 | In this repository we provide examples to work with: 7 | 8 | * [Paituli STAC API](STAC_CSC_example.R) 9 | * See, also similar [Python STAC examples](../../python/STAC) 10 | 11 | The examples mainly cover data search and download, using [rstac](https://cran.r-project.org/web/packages/rstac/index.html). For analyzing data [gdalcubes](https://gdalcubes.github.io/) or [terra](https://cran.r-project.org/web/packages/terra/index.html) can be used. When working with bigger datasts, gdalcubes supports also parallelization. 12 | 13 | The examples can be run on any computer with R installation. The required R packages can be seen from the beginning of the example scripts. The examples download all data from cloud storage, so relatively good internet connection is needed. For using data in JP2000 format, GDAL must be installed with JP2000-support. 14 | 15 | It is possible to try this script also in CSC Puhti supercomputer. The easiest option is to start RStudio in Puhti web interface. For learning STAC, it is recommended to reserve 1 core and 8 Gb memory. Data analysis part with gdalcubes would benefit from more cores. Currently Puhti r-env does not support JP2000 format, so sentinel2-l2a can not be used with gdalcubes. Searching all collections works. 16 | 17 | In CSC Puhti supercomputer, the examples can be run with [r-env module](https://docs.csc.fi/apps/r-env/), which includes all necessary R packages. 
The easiest is to use RStudio with Puhti web interface: 18 | 19 | * Open [Puhti web interface](https://www.puhti.csc.fi/) 20 | * Click "RStudio" on dashboard 21 | * Select following settings: 22 | * Project: project_2002044 during course, own project otherwise 23 | * Partition: interactive 24 | * Number of CPU cores: 1 25 | * Memory (Gb): 8 26 | * Local disk (GB): 0 27 | * Time: 1:00:00 (or adjust to reasonable) 28 | * Click launch and wait until granted resources 29 | * Click "Connect to RStudio Server" 30 | 31 | 32 | If you want to run the example on your own computer, you'll need RStudio installed, along with the following packages: ```geojsonsf, sf, terra, tidyverse, rstac, httr, gdalcubes```. 33 | 34 | Download the example code to your computer (either by copying the whole repository to your computer with git (`git clone https://github.com/csc-training/geocomputing.git`) or by downloading only the needed file via github webinterface (find `STAC_CSC_example.Rmd` and in the upper right corner of the file "Download raw file button"). 35 | -------------------------------------------------------------------------------- /R/allas/working_with_allas_from_R_S3.R: -------------------------------------------------------------------------------- 1 | # Example script for using Allas directly from an R script: 2 | # - Reading and wrtiing raster and vector files 3 | # - Looping over all files of certain type in a bucket 4 | # - Writing raster and vector files (not working properly) Older version 5 | 6 | # Please notice that this example works ONLY with GDAL-based libraries for spatial data: sf, terra etc. 7 | 8 | # Note this does not work with R version 442 in Puhti, use some other version. 9 | 10 | library("terra") 11 | library("sf") 12 | library("aws.s3") 13 | library("tidyverse") 14 | 15 | # Before starting to use Allas with aws.s3 set up your credentials and endpoint to Allas. 16 | # This example here applies for using Allas from CSC Puhti or Mahti supercomputers. 17 | # To use some other S3 stroage or from some other computer, 18 | # See https://docs.csc.fi/support/tutorials/gis/gdal_cloud/#s3-connection-details 19 | 20 | # 1) Set up your credentials to Allas: 21 | # module load allas 22 | # allas-conf --mode s3cmd 23 | # This is needed only once, as long as you are using the same CSC project. 24 | # This also sets S3 endopoint to .aws/config file. 25 | 26 | # 2) Set S3 region for aws.s3-library. 27 | options("cloudyr.aws.default_region" = "") 28 | 29 | # If you want to WRITE files with terra/sf directly to Allas, set also this. 30 | Sys.setenv("CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE" = "YES") 31 | 32 | # Reading raster file 33 | r <- rast('/vsis3/name_of_your_Allas_bucket/name_of_your_input_raster_file.tif') 34 | 35 | # This should work, but has had some bugs in terra code, so does not work with any R version in Puhti (8.8.2023) 36 | writeRaster(r, filename='/vsis3/name_of_your_Allas_bucket/name_of_your_output_raster_file.tif') 37 | 38 | # Reading vector file 39 | v <- st_read('/vsis3/name_of_your_Allas_bucket/name_of_your_input_vector_file.gpkg') 40 | 41 | # Writing vector file 42 | st_write(v, '/vsis3/name_of_your_Allas_bucket/name_of_your_output_vector_file.gpkg') 43 | 44 | # Looping through all files in a bucket, having the same file type (tif). 
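# (get_bucket_df() below returns a data frame describing the objects in the bucket;
# the rasters themselves are then opened through GDAL's /vsis3/ virtual file system.)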
45 | # First get list of all objects in the bucket 46 | all_files_in_bucket <- get_bucket_df(name_of_your_Allas_bucket) 47 | # Filter out only .tif-files and keep only the file name information (=Key) 48 | tif_files = all_files_in_bucket %>% filter(str_detect(Key, '.tif$')) %>% select(Key) 49 | # Loop through the files, here just printing the extent of each file as example. 50 | for (row in 1:nrow(tif_files)) { 51 | filepath <- paste('/vsis3/name_of_your_Allas_bucket/', tif_files[row,], sep = "") 52 | print (filepath) 53 | r <- raster(filepath) 54 | print (extent(r)) 55 | } 56 | 57 | 58 | -------------------------------------------------------------------------------- /R/geopackage/README.md: -------------------------------------------------------------------------------- 1 | ## Reading NLS topographic database geopackage with R 2 | The NLS topographic database has been saved into several geopackage files in Puhti at /appl/data/geo/mml/maastotietokanta/20XX/gpkg. The larger layers are in their own gpkg files and the smaller layers have been bundled into a single file. The larger layers are quite large and reading them takes some time. If the whole layers are not however needed it is possible to use SF package in R to only read the desired parts of the files. The examples on how to read only certain rows can be found in the read_gpkg.R script. 3 | 4 | In Puhti use the [r-env-singularity module](https://docs.csc.fi/apps/r-env-singularity/) for R. 5 | 6 | Similar examples for reading the geopackage with Python using Geopandas can be found [here](https://github.com/csc-training/geocomputing/tree/master/python/geopackage). 7 | -------------------------------------------------------------------------------- /R/geopackage/read_gpkg.R: -------------------------------------------------------------------------------- 1 | # Examples on reading data from NLS geopackages with sf. 2 | # The geopackges are quite large files so reading the whole thing takes a while. 3 | # We can however read parts of it quickly without having to inspect each row as shown in below examples. 4 | 5 | library(sf) 6 | fn_muut <- "/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-muut_20-02-06.gpkg" 7 | fn_suo <- "/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-suo_20-02-06.gpkg" 8 | #Reading a layer into a dataframe. Some layers are large, but for smaller layers this can be quick enough. 9 | read_whole_layer <- function(){ 10 | layer="hylky" 11 | df<-read_sf(fn_muut, layer) 12 | print(df) 13 | } 14 | 15 | # Geopackage is internally an sqlite database which can be connected to and queried. 16 | # The read_sf function takes a query= parameter that allows us to specify an SQL query to select only some parts of data. 17 | # The given SQL is handled by OGR, so see https://www.gdal.org/ogr_sql.html for available further details. 18 | # Basically selection based on any attribute is possible, but selection by geometry does not seem to be possible. 19 | 20 | # SQL selections can be used in several ways: 21 | 22 | # Reading rows in range 10-20. Only the rows that we want will be read regardless of the actual number of rows in the layer. 
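# With the default values below the sprintf() call expands to:
#   select * from suo where rowid >= 10 and rowid < 20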
23 | read_rows_in_range <- function(){ 24 | layer<-"suo" 25 | start <- 10 26 | end <- 20 27 | sql <- sprintf("select * from %s where rowid >= %s and rowid < %s",layer, start, end) 28 | df<-read_sf(fn_suo, layer=layer, query=sql) 29 | print(df) 30 | } 31 | 32 | #As above but for specific rows 33 | read_specific_rows <- function(){ 34 | layer<-"suo" 35 | rows<-c(1,5,100) 36 | sql <- sprintf("select * from %s where rowid in (%s)",layer, paste(rows, collapse=", ")) 37 | print(sql) 38 | df<-read_sf(fn_suo, layer=layer, query=sql) 39 | print(df) 40 | } 41 | 42 | 43 | #We can use the query parameter to ask rows based on any attribute not just rowid. This may however be slow depending on number of rows and indexes available in the geopackage. If the column you want to use is not indexed you can create the index as follows 44 | create_index <- function(){ 45 | layer<-"suo" 46 | attr_col<-"mtk_id" 47 | attr_value<-219920480 48 | con <- dbConnect(RSQLite::SQLite(), fn) 49 | res<-dbSendQuery(con, sprintf("CREATE INDEX index_%s_%s ON %s (%s)",layer, attr_col, layer, attr_col)) 50 | dbClearResult(res) 51 | dbDisconnect(con) 52 | } 53 | 54 | #After this you can quickly query the layer based on the column. 55 | read_by_attribute <- function(){ 56 | layer<-"suo" 57 | attr_col<-"mtk_id" 58 | attr_value<-219920480 59 | 60 | sql <- sprintf("select * from %s where %s=%s", layer, attr_col, attr_value) 61 | df<-read_sf(fn,layer=layer, query=sql) 62 | print(df) 63 | 64 | } 65 | 66 | 67 | #If we want to query based on a bounding box efficiently we need to be able to take advantage of spatial indexing. The NLS Geopackage includes a spatial index for each layer already so we don't have to worry about creating it. Depending on how your version of GDAL has been compiled we still may need to enable spatilite extension to be able to take advantage of the indexing (=use RTreeIntersects function). To do this first open connection to the geopackage, enable spatialite extension and then supply that connection to read_sf rather than the filename. You can also try to just supply the filename to read_sf function and skip enabling spatialite as this may also work. 68 | 69 | read_area <- function(){ 70 | con <- dbConnect(RSQLite::SQLite(), fn) 71 | 72 | #Linux: 73 | res<-dbSendQuery(con, "select load_extension('libspatialite.so')") 74 | #Windows: 75 | #res<-dbSendQuery(con, "select load_extension('libspatialite.dll')") 76 | 77 | dbClearResult(res) 78 | bb<-c(374692, 6671989, 379750, 6676677) 79 | layer<-"suo" 80 | geom_col<-"sijainti_alue" 81 | sql <- sprintf("select * from %s where rowid in (select id from rtree_%s_%s where id match RTreeIntersects(%s,%s,%s,%s))",layer, layer, geom_col, bb[1],bb[2],bb[3],bb[4]) 82 | df<-read_sf(con, query=sql) 83 | dbDisconnect(con) 84 | print(df) 85 | } 86 | 87 | 88 | -------------------------------------------------------------------------------- /R/puhti/01_serial/Contours_simple.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This scipt can be used for serial or array jobs. 3 | # Here countours are calculated based on a DEM file and saved in GeoPackage format. 4 | # The file given as input is a 10m DEM file from Finnish NLS. 
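# Input mapsheets are read from ../mapsheets.txt (one GeoTIFF path per line);
# one GeoPackage with contour lines is written per mapsheet.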
5 | 6 | # load terra library 7 | library(terra) 8 | 9 | # Set the working directory with RStudio 10 | # mainDir <- "/scratch/project_2002044/students/ekkylli/geocomputing/R/puhti/01_serial" 11 | # setwd(mainDir) 12 | 13 | mapsheets <- readLines('../mapsheets.txt') 14 | 15 | #Calculate contours and save the results as GeoPackage 16 | for (mapsheet in mapsheets){ 17 | DEM <- rast(mapsheet) 18 | file <- gsub("tif", "gpkg", basename(mapsheet)) 19 | contours <- as.contour(DEM) 20 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 21 | } -------------------------------------------------------------------------------- /R/puhti/01_serial/serial_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 5 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 6 | #SBATCH --time=0:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 9 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 10 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 11 | 12 | module load r-env 13 | 14 | # Clean up .Renviron file in home directory 15 | if test -f ~/.Renviron; then 16 | sed -i '/TMPDIR/d' ~/.Renviron 17 | fi 18 | 19 | # Specify a temp folder path 20 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron 21 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron 22 | 23 | srun apptainer_wrapper exec Rscript --no-save Contours_simple.R 24 | -------------------------------------------------------------------------------- /R/puhti/02_parallel_future/Calc_contours_future_cluster.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This script can be used for parallel jobs. 3 | # The countours are calculated and saved in GeoPackage format. 4 | # The file given as input is a 10m DEM file from Finnish NLS. 5 | # The input files are listed in the mapsheet.txt file 6 | 7 | # For parallel tasks future package is used. 
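# Added note (not part of the original script): plan(cluster) below expects the MPI cluster
# started by RMPISNOW in parallel_batch_job_future_cluster.sh. On a single node the
# Calc_contours_future_multicore.R variant with plan(multicore) can be used instead.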
8 | 9 | # load libraries 10 | library(furrr) 11 | library(terra) 12 | 13 | # Start the snow cluster and create a plan with future package 14 | cl<-getMPIcluster() 15 | plan(cluster, workers = cl) 16 | 17 | # The function run on each core 18 | funtorun <- function(mapsheet) { 19 | DEM <- rast(mapsheet) 20 | file <- gsub("tif", "gpkg", basename(mapsheet)) 21 | contours <- as.contour(DEM) 22 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 23 | } 24 | 25 | # Read the mapsheets from external file 26 | mapsheets <- readLines('../mapsheets.txt') 27 | 28 | # Give cluster the work to be done 29 | system.time(a<-future_map(mapsheets,funtorun)) 30 | 31 | #Stop cluster 32 | stopCluster(cl) 33 | 34 | -------------------------------------------------------------------------------- /R/puhti/02_parallel_future/Calc_contours_future_multicore.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This script can be used for parallel jobs. 3 | # The countours are calculated and saved in GeoPackage format. 4 | # The file given as input is a 10m DEM file from Finnish NLS. 5 | # The input files are listed in the mapsheet.txt file 6 | 7 | # For parallel tasks future package is used. 8 | 9 | # load libraries 10 | library(furrr) 11 | library(terra) 12 | 13 | # With plan(multicore) the number of workers is based on batch job reservation details. 14 | plan("multicore") 15 | 16 | # The function run on each core 17 | funtorun <- function(mapsheet) { 18 | DEM <- rast(mapsheet) 19 | file <- gsub("tif", "gpkg", basename(mapsheet)) 20 | contours <- as.contour(DEM) 21 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 22 | } 23 | 24 | # Read the mapsheets from external file 25 | mapsheets <- readLines('../mapsheets.txt') 26 | 27 | # Give cluster the work to be done 28 | system.time(a<-future_map(mapsheets,funtorun)) 29 | -------------------------------------------------------------------------------- /R/puhti/02_parallel_future/parallel_batch_job_future_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=output.txt # File to write the standard output to. 5 | #SBATCH --error=errors.txt # File to write the standard error to. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #Reserve cores for 1 master + 3 workers 8 | #SBATCH --ntasks=4 # Number of tasks. Upper limit depends on partition. 9 | #Test partition is used for testing, for real jobs use either serial or parallel depending on how many nodes you need. 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 
12 | 13 | module load r-env 14 | 15 | # If you have installed packages this helps resolve problems related to those 16 | if test -f ~/.Renviron; then 17 | sed -i '/TMPDIR/d' ~/.Renviron 18 | sed -i '/OMP_NUM_THREADS/d' ~/.Renviron 19 | fi 20 | 21 | # Specify a temp folder path 22 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron 23 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron 24 | 25 | srun apptainer_wrapper exec RMPISNOW --no-save --slave -f Calc_contours_future_cluster.R 26 | -------------------------------------------------------------------------------- /R/puhti/02_parallel_future/parallel_batch_job_future_multicore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=output.txt # File to write the standard output to. 5 | #SBATCH --error=errors.txt # File to write the standard error to. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | #Equal to number of workers. Max 40 in Puhti. 11 | #SBATCH --cpus-per-task=3 # How many processors work on one task. Upper limit depends on number of CPUs per node. 12 | 13 | module load r-env 14 | 15 | # If you have installed packages this helps resolve problems related to those 16 | if test -f ~/.Renviron; then 17 | sed -i '/TMPDIR/d' ~/.Renviron 18 | sed -i '/OMP_NUM_THREADS/d' ~/.Renviron 19 | fi 20 | 21 | # Specify a temp folder path 22 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron 23 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron 24 | 25 | srun apptainer_wrapper exec Rscript --no-save Calc_contours_future_multicore.R 26 | -------------------------------------------------------------------------------- /R/puhti/03_parallel_snow/Calc_contours_snow.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This scipt can be used for parallel jobs. 3 | # Here countours are calculated and saved in GeoPackage format. 4 | # The file given as input is a 10m DEM file from Finnish NLS. 5 | # The input files are listed in the mapsheet.txt file 6 | 7 | # For parallel tasks the snow package is used. 8 | 9 | # Start the snow cluster 10 | cl<-getMPIcluster() 11 | 12 | # The function run on each core 13 | # The R modules need to be loaded inside the functions. 14 | # The variables from outside of this function are not visible. 
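# For example, the 'mapsheets' vector defined later in this script is not visible inside
# funtorun; it is passed in as an argument by clusterApply. If other objects were needed on
# the workers, they could be exported with e.g. clusterExport(cl, "variable_name")
# (a hedged suggestion, not used in this script).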
15 | funtorun<-function(mapsheet) { 16 | DEM <- rast(mapsheet) 17 | file <- gsub("tif", "gpkg", basename(mapsheet)) 18 | contours <- as.contour(DEM) 19 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 20 | } 21 | 22 | # load terra library 23 | clusterEvalQ(cl, library(terra)) 24 | 25 | # Read the mapsheets from external file 26 | mapsheets <- readLines('../mapsheets.txt') 27 | 28 | # Give cluster the work to be done 29 | system.time(a<-clusterApply(cl,mapsheets,funtorun)) 30 | 31 | #Stop cluster 32 | stopCluster(cl) -------------------------------------------------------------------------------- /R/puhti/03_parallel_snow/parallel_batch_job_snow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=output_%j.txt # File to write the standard output to. 5 | #SBATCH --error=errors_%j.txt # File to write the standard error to. 6 | #SBATCH --time=00:04:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #Reserve cores for 1 master + 3 workers 8 | #SBATCH --ntasks=4 # Number of tasks. Upper limit depends on partition. 9 | #Test partition is for small test jobs only. For real jobs use either serial or parallel partition dependeing on how many nodes you need 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 12 | 13 | module load r-env 14 | 15 | if test -f ~/.Renviron; then 16 | sed -i '/TMPDIR/d' ~/.Renviron 17 | fi 18 | 19 | # Specify a temp folder path 20 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron 21 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron 22 | 23 | srun apptainer_wrapper exec RMPISNOW --no-save --slave -f Calc_contours_snow.R 24 | -------------------------------------------------------------------------------- /R/puhti/04_parallel_foreach/Calc_contours_foreach.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This script can be used for parallel jobs. 3 | # The countours are calculated and saved in GeoPackage format. 4 | # The file given as input is a 10m DEM file from Finnish NLS. 5 | # The input files are listed in the mapsheet.txt file 6 | 7 | # For parallel tasks the foreach with doMPI is used. 8 | # See https://docs.csc.fi/apps/r-env-singularity/ 9 | 10 | library(doMPI,quietly=TRUE) 11 | cl<-startMPIcluster() 12 | registerDoMPI(cl) 13 | 14 | # Read the mapsheets from external file, in this case from user's workdirectory 15 | mapsheets <- readLines('../mapsheets.txt') 16 | 17 | # The function run on each core 18 | # The R modules need to be loaded inside the functions. 19 | # The variables from outside of this function are not visible. 20 | 21 | funtorun<-function(mapsheet) { 22 | DEM <- rast(mapsheet) 23 | file <- gsub("tif", "gpkg", basename(mapsheet)) 24 | contours <- as.contour(DEM) 25 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 26 | } 27 | 28 | # Run funtorun function in parallel for each mapsheet. .export passes variables and .packages the necessary packages. 29 | # If return value is used .combine can be used to specify which function to use for combining results. 
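# A hypothetical variant (not used below): to process every mapsheet instead of the first
# three, the index could be seq_along(mapsheets), e.g.
#   foreach(i = seq_along(mapsheets), .packages = c("terra"), .combine = "c") %dopar% { funtorun(mapsheets[i]) }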
30 | 31 | a<-foreach(i=1:3, .packages=c("terra"), .combine="c") %dopar% { 32 | funtorun(mapsheets[i]) 33 | } 34 | #Print combined return values. In this case names of created shapefiles. 35 | print(a) 36 | closeCluster(cl) 37 | mpi.quit() 38 | 39 | -------------------------------------------------------------------------------- /R/puhti/04_parallel_foreach/parallel_batch_job_foreach.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=output.txt # File to write the standard output to. 5 | #SBATCH --error=errors.txt # File to write the standard error to. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=3 # Number of tasks. Upper limit depends on partition. 8 | #Test partition is used for testing, for real jobs use either serial or parallel depending on how many nodes you need. 9 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 10 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 11 | 12 | module load r-env 13 | 14 | if test -f ~/.Renviron; then 15 | sed -i '/TMPDIR/d' ~/.Renviron 16 | fi 17 | 18 | # Specify a temp folder path 19 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron 20 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron 21 | 22 | srun apptainer_wrapper exec Rscript --no-save --slave Calc_contours_foreach.R 23 | -------------------------------------------------------------------------------- /R/puhti/05_array/Contours_array.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This script can be used for serial or array jobs. 3 | # Here a .tif DEM file is provided as an argument 4 | # and then countours are calculated and saved in GeoPackage format. 5 | # The file given as input is a 10m DEM file from Finnish NLS. 6 | 7 | # Load the necessary libraries 8 | library(terra) 9 | 10 | # Read the command line argument, which is the path of the .tif file. 11 | args = commandArgs(trailingOnly=TRUE) 12 | 13 | if (length(args)==0) { 14 | stop("Please give the map sheet number", call.=FALSE) 15 | } else if (length(args)==1) { 16 | # The filepath given to this script goes to variable mapsheet 17 | mapsheet <- args[1] 18 | } 19 | print(mapsheet) 20 | 21 | # Calculate contours 22 | DEM <- rast(mapsheet) 23 | file <- gsub("tif", "gpkg", basename(mapsheet)) 24 | contours <- as.contour(DEM) 25 | # Save the results as GeoPackage 26 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 27 | -------------------------------------------------------------------------------- /R/puhti/05_array/array_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=slurm-%A_%a.out # File to write the standard output to. %A is replaced by the job ID and %a with the array index. 5 | #SBATCH --error=slurm-%A_%a.err # File to write the standard error to. %A is replaced by the job ID and %a with the array index. Defaults to slurm-%A_%a.out if not provided. 
6 | #SBATCH --time=00:02:00              # Maximum duration of the job. Upper limit depends on partition.
7 | #SBATCH --mem-per-cpu=1000           # Minimum memory required per usable allocated CPU. Default units are megabytes.
8 | #SBATCH --array=1-3                  # Indices to specify what array index values should be used. Multiple values may be specified using a comma separated list or a range of values separated by -.
9 | #SBATCH --ntasks=1                   # Number of tasks. Upper limit depends on partition.
10 | #SBATCH --partition=small           # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job
11 | 
12 | # load the Puhti module for R
13 | module load r-env
14 | 
15 | if test -f ~/.Renviron; then
16 | sed -i '/TMPDIR/d' ~/.Renviron
17 | fi
18 | 
19 | # Specify a temp folder path
20 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron
21 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron
22 | 
23 | # read the file that has filepaths for mapsheets and pick one row according to variable $SLURM_ARRAY_TASK_ID
24 | name=$(sed -n "$SLURM_ARRAY_TASK_ID"p ../mapsheets.txt)
25 | 
26 | # run the analysis command
27 | srun apptainer_wrapper exec Rscript Contours_array.R $name
-------------------------------------------------------------------------------- /R/puhti/mapsheets.txt: --------------------------------------------------------------------------------
1 | /appl/data/geo/mml/dem10m/2019/W3/W33/W3333.tif
2 | /appl/data/geo/mml/dem10m/2019/W3/W33/W3332.tif
3 | /appl/data/geo/mml/dem10m/2019/W3/W33/W3331.tif
4 | 
-------------------------------------------------------------------------------- /R/puhti/mapsheets_URLs.txt: --------------------------------------------------------------------------------
1 | /vsicurl/https://www.nic.funet.fi/index/geodata/mml/dem10m/2019/W3/W33/W3333.tif
2 | /vsicurl/https://www.nic.funet.fi/index/geodata/mml/dem10m/2019/W3/W33/W3332.tif
3 | /vsicurl/https://www.nic.funet.fi/index/geodata/mml/dem10m/2019/W3/W33/W3331.tif
4 | 
-------------------------------------------------------------------------------- /R/raster_predict/README.md: --------------------------------------------------------------------------------
1 | > **_NOTE:_** This example applies to the [raster package](https://cran.r-project.org/web/packages/raster/index.html). The newer [terra package](https://cran.r-project.org/web/packages/terra/index.html) is replacing the raster package. For parallelization with terra, just follow the terra documentation, see for example the predict example in the [terra manual](https://cran.r-project.org/web/packages/terra/terra.pdf).
2 | 
3 | Some of the functions in the `raster` package support parallel computing.
4 | 
5 | This example includes a **parallel job** using the `predict()` function from the `raster` package. The presence/absence of the R logo is predicted in an image. This type of model is often used to predict species distributions. See the dismo package for more of that.
6 | 
7 | The example is from https://www.rdocumentation.org/packages/raster/versions/2.5-8/topics/predict and it has been
8 | simplified and adapted to be run through the batch job system in Puhti.
9 | 
10 | `r_run.sh` shows how to submit parallel R jobs to the Puhti SLURM system.
11 | 
12 | For further details see the comments in the script. For general instructions on how to use R in Puhti see https://docs.csc.fi/apps/r-env-singularity/
13 | 
14 | > **_NOTE:_** The `raster` package uses the `snow` package for parallelization, which cannot be used in Puhti in an interactive session or in RStudio.
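A rough sketch of the terra-based alternative mentioned in the note at the top (not part of this example; check the terra manual for the exact arguments). terra's `predict()` can parallelize internally via its `cores` argument, so no snow/MPI cluster is needed:

```r
# Sketch only: assumes a SpatRaster 'logo_rast' and a fitted model 'model'
# (for example the glm built in rtest.R, with the data read using terra::rast()).
library(terra)
# r1 <- predict(logo_rast, model, cores = 4, cpkgs = "terra")
```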
15 | -------------------------------------------------------------------------------- /R/raster_predict/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/R/raster_predict/Rplots.pdf -------------------------------------------------------------------------------- /R/raster_predict/r_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=output.txt # File to write the standard output to. 4 | #SBATCH --error=errors.txt # File to write the standard error to. 5 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --ntasks=4 # Number of tasks. Upper limit depends on partition. 7 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 8 | #SBATCH --mem=1000 # Real memory required per node. 9 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 10 | 11 | module load r-env-singularity 12 | 13 | if test -f ~/.Renviron; then 14 | sed -i '/TMPDIR/d' ~/.Renviron 15 | fi 16 | 17 | srun singularity_wrapper exec RMPISNOW --no-save -f rtest.R 18 | 19 | -------------------------------------------------------------------------------- /R/raster_predict/rlogo.grd: -------------------------------------------------------------------------------- 1 | [general] 2 | creator=R package 'raster' 3 | created= 2010-04-16 14:57:47 4 | [georeference] 5 | nrows= 77 6 | ncols= 101 7 | xmin= 0 8 | ymin= 0 9 | xmax= 101 10 | ymax= 77 11 | projection= +proj=merc +datum=WGS84 12 | [data] 13 | datatype= FLT4S 14 | byteorder= little 15 | nbands= 3 16 | bandorder= BIL 17 | categorical= FALSE 18 | levels= NA 19 | minvalue= 0:0:0 20 | maxvalue= 255:255:255 21 | nodatavalue= -3.4e+38 22 | [legend] 23 | legendtype= 24 | values= 25 | color= 26 | [description] 27 | layername= red:green:blue 28 | history= 29 | -------------------------------------------------------------------------------- /R/raster_predict/rlogo.gri: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/R/raster_predict/rlogo.gri -------------------------------------------------------------------------------- /R/raster_predict/rtest.R: -------------------------------------------------------------------------------- 1 | #Example from: https://www.rdocumentation.org/packages/raster/versions/2.5-8/topics/predict 2 | 3 | # A simple model to predict the location of the R in the R-logo using 20 presence points 4 | # and 50 (random) pseudo-absence points. This type of model is often used to predict 5 | # species distributions. See the dismo package for more of that. 
6 | 7 | library(raster) 8 | setwd("raster_predict") 9 | 10 | # create a RasterStack or RasterBrick with with a set of predictor layers 11 | logo <- brick("rlogo.grd") 12 | names(logo) 13 | 14 | # known presence and absence points 15 | p <- matrix(c(48, 48, 48, 53, 50, 46, 54, 70, 84, 85, 74, 84, 95, 85, 16 | 66, 42, 26, 4, 19, 17, 7, 14, 26, 29, 39, 45, 51, 56, 46, 38, 31, 17 | 22, 34, 60, 70, 73, 63, 46, 43, 28), ncol=2) 18 | 19 | a <- matrix(c(22, 33, 64, 85, 92, 94, 59, 27, 30, 64, 60, 33, 31, 9, 20 | 99, 67, 15, 5, 4, 30, 8, 37, 42, 27, 19, 69, 60, 73, 3, 5, 21, 21 | 37, 52, 70, 74, 9, 13, 4, 17, 47), ncol=2) 22 | 23 | 24 | # extract values for points 25 | xy <- rbind(cbind(1, p), cbind(0, a)) 26 | v <- data.frame(cbind(pa=xy[,1], extract(logo, xy[,2:3]))) 27 | 28 | #build a model, here an example with glm 29 | model <- glm(formula=pa~., data=v) 30 | 31 | #Serial code for making predictions: 32 | #r1 <- predict(logo, model, progress='text') 33 | 34 | #Run predict function using an mpi cluster. Note that in Puhti the cluster is already available, you shouldn't start it yourself but use the handle provided by getMPIcluster. 35 | cl<-getMPIcluster() 36 | r1 <- clusterR(logo, predict, args=list(model), cl=cl) 37 | stopCluster(cl) 38 | 39 | 40 | #Plot the original data and results 41 | plotRGB(logo) 42 | points(p, bg='blue', pch=21) 43 | points(a, bg='red', pch=21) 44 | plot(r1,col = gray.colors(10, start = 0, end = 1, gamma = 1, alpha = NULL)) 45 | quit() 46 | 47 | -------------------------------------------------------------------------------- /R/virtual_rasters.R: -------------------------------------------------------------------------------- 1 | # Example scripts for using virtual rasters. 2 | # Here contours are calculated based on a 2m DEM file and saved in GeoPackage format. 3 | # As input 4 different options are used: 4 | # 1) Paituli files, copied to Puhti local disk 5 | # 2) Paituli files, with URLs (from Espoo) 6 | # 3) GeoPortti GeoCubes file (physically in cPouta next to Puhti, in Kajaani) 7 | # 4) FMI STAC file (physically somewhere else in Finland) 8 | # All of these data sources cover all Finland. 9 | # The contours are calculated only on a subset of the data, defined by BBOX. 10 | # BBOX location can be changed to any other location in Finland. 
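# Added note (not part of the original example): a virtual raster over your own GeoTIFFs
# could be built with terra as well, roughly like this (paths are hypothetical):
#   tifs <- list.files("/scratch/project_200xxxx/dem_tiles", pattern = "\\.tif$", full.names = TRUE)
#   vrt(tifs, "/scratch/project_200xxxx/dem_tiles/all_tiles.vrt")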
11 | 
12 | library(terra)
13 | # For measuring computation time
14 | library(tictoc)
15 | 
16 | 
17 | # The extent of the data used
18 | # bbox_vrt <- ext(489000, 490000, 7333000, 7334000)
19 | # Bigger BBOX
20 | bbox_vrt <- ext(480000, 490000, 7330000, 7340000)
21 | 
22 | 
23 | # Paituli files in Puhti locally
24 | puhti_dem2m_vrt <- "/appl/data/geo/mml/dem2m/dem2m_direct.vrt"
25 | puhti_result_file <- 'Puhti_vrt_contours.gpkg'
26 | 
27 | # Paituli files (URL)
28 | # Link from here: https://www.nic.funet.fi/index/geodata/mml/dem2m/2008_latest/
29 | paituli_dem2m_vrt <- "/vsicurl/https://www.nic.funet.fi/index/geodata/mml/dem2m/2008_latest/dem2m.vrt"
30 | paituli_result_file <- 'Paituli_vrt_contours.gpkg'
31 | 
32 | 
33 | # GeoPortti GeoCubes files
34 | # Link from here: https://vm0160.kaj.pouta.csc.fi/geocubes/fileaccess/
35 | geocubes_dem2m_vrt <- "/vsicurl/https://vm0160.kaj.pouta.csc.fi/mml/korkeusmalli/km2/2022/km2_2022_2m.vrt"
36 | geocubes_result_file <- 'Geocubes_vrt_contours.gpkg'
37 | 
38 | # FMI STAC files
39 | # Link from here: https://pta.data.lit.fmi.fi/stac/items/MML-DTM-2m/MML-DTM-2m_2020.json
40 | fmi_dem2m_vrt <- '/vsicurl/https://pta.data.lit.fmi.fi/dem/etrs-tm35fin-n2000/MML-DTM-2020-2m-height.vrt'
41 | fmi_result_file <- 'FMI_vrt_contours.gpkg'
42 | 
43 | # Function to handle each dataset
44 | contours_from_vrt <- function(vrt_path, bbox, output_file){
45 | # Create a terra SpatRaster object from the virtual raster file.
46 | # Data is not read into R at this phase.
47 | vrt <- rast(vrt_path)
48 | # Crop the SpatRaster to our area of interest.
49 | DEM = crop(vrt, bbox)
50 | # Calculate contours.
51 | contours <- as.contour(DEM)
52 | # Save result to a file.
53 | writeVector(contours, output_file, filetype="GPKG", overwrite=TRUE)
54 | }
55 | 
56 | # Run the function with each data source and see how long it takes.
57 | tic("Paituli files, in Puhti locally")
58 | contours_from_vrt(puhti_dem2m_vrt, bbox_vrt, puhti_result_file)
59 | toc()
60 | 
61 | tic("Paituli files, URL")
62 | contours_from_vrt(paituli_dem2m_vrt, bbox_vrt, paituli_result_file)
63 | toc()
64 | 
65 | tic("GeoCubes in cPouta")
66 | contours_from_vrt(geocubes_dem2m_vrt, bbox_vrt, geocubes_result_file)
67 | toc()
68 | 
69 | tic("FMI")
70 | contours_from_vrt(fmi_dem2m_vrt, bbox_vrt, fmi_result_file)
71 | toc()
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Geocomputing using CSC resources
2 | 
3 | This repository contains examples for use of different geospatial applications. Many of the examples are for [CSC supercomputer Puhti](https://docs.csc.fi/computing/systems-puhti/) but may also be helpful for other systems (or your own computer). Please find a list of all geospatial software that is available on Puhti in [CSC docs](https://docs.csc.fi/apps/#geosciences).
4 | 
5 | ## Puhti
6 | 
7 | ### R
8 | * [Overview](./R/README.md)
9 | * [Puhti](./R/puhti) - serial/array/parallel processing with R.
10 | * [R for LiDAR data](./R/R_LiDAR): lidR and rlas
11 | * [Working with Allas data from R](./R/allas)
12 | * [Reading NLS topographic database geopackage with R](./R/geopackage)
13 | 
14 | 
15 | ### Python
16 | * [Overview](./python/README.md)
17 | * [Puhti](./python/puhti/README.md) - serial/array/parallel processing with Python.
18 | * [Working with Allas data from Python](./python/allas)
19 | * [Reading NLS topographic database geopackage with Python](./python/geopackage/README.md)
20 | * [GRASS multiprocessing from Python](./python/grass_multiprocessing_with_python/README.md)
21 | * [Routing](./python/routing/readme.md)
22 | * [Sentinel data download from Finhub using sentinelsat](python/sentinel/README.md)
23 | * [STAC, xarray and dask for downloading and processing data](./python/STAC/stac_xarray_dask_example.ipynb)
24 | * [Zonal statistics in parallel](./python/zonal_stats/README.md)
25 | * [Python Dask-geopandas](./python/dask_geopandas/README.md)
26 | 
27 | ### Other tools
28 | * [FORCE](./force/README.md)
29 | * [GDAL](./gdal/readme.md)
30 | * [GRASS](./grass/readme.md)
31 | * [PDAL](./pdal/README.md)
32 | * [SNAP graph processing tool gpt](./snap/README.md)
33 | 
34 | ### Use cases / longer examples
35 | * [GeoPortti share GitHub repository](https://github.com/geoporttishare?tab=repositories) includes several longer examples of HPC usage.
36 | 
37 | ## Pouta
38 | * [Instructions to set up geospatial tools](./pouta/README.md) on virtual machines in [CSC's cPouta environment](https://docs.csc.fi/cloud/pouta/), including OpenDroneMap, GeoServer, ArcPy and MetaShape.
39 | 
40 | ## CSC Notebooks
41 | * [Setting up geospatial Python Jupyter environment](./noppe/Readme.md)
42 | 
43 | ## Download
44 | 
45 | If you have Git installed, you can download these scripts to any computer with it. To do this, first navigate to the destination folder (in Puhti this could be your project's **projappl** or **scratch** folder):
46 | 
47 | `cd /projappl/`
48 | or
49 | `cd /scratch/`
50 | 
51 | Then clone this repository there:
52 | 
53 | `git clone https://github.com/csc-training/geocomputing.git`
54 | 
55 | If Git is not available, you can also download the files as a zip file from the `Code` drop-down menu above.
56 | 
57 | 
58 | ## License
59 | These examples are free to use under the CC BY 4.0 license unless marked otherwise.
60 | 
61 | ## Acknowledgement
62 | 
63 | Please acknowledge CSC and Geoportti in your publications; it is important for project continuation and funding reports.
As an example, you can write "The authors wish to thank CSC - IT Center for Science, Finland (urn:nbn:fi:research-infras-2016072531) and the Open Geospatial Information Infrastructure for Research (Geoportti, urn:nbn:fi:research-infras-2016072513) for computational resources and support 64 | -------------------------------------------------------------------------------- /force/LEVEL2_parameters.prm: -------------------------------------------------------------------------------- 1 | ++PARAM_LEVEL2_START++ 2 | 3 | # INPUT/OUTPUT DIRECTORIES 4 | # ------------------------------------------------------------------------ 5 | FILE_QUEUE = /users/johannes/force/file_queue.txt 6 | DIR_LEVEL2 = /scratch/project_2000599/force/output_L2A 7 | DIR_LOG = /scratch/project_2000599/force 8 | DIR_TEMP = /scratch/project_2000599/force/temp 9 | 10 | # DIGITAL ELEVATION MODEL 11 | # ------------------------------------------------------------------------ 12 | FILE_DEM = NULL 13 | DEM_NODATA = -32767 14 | 15 | # DATA CUBES 16 | # ------------------------------------------------------------------------ 17 | DO_REPROJ = TRUE 18 | DO_TILE = TRUE 19 | FILE_TILE = NULL 20 | TILE_SIZE = 30000 21 | BLOCK_SIZE = 3000 22 | RESOLUTION_LANDSAT = 30 23 | RESOLUTION_SENTINEL2 = 10 24 | ORIGIN_LON = -25 25 | ORIGIN_LAT = 60 26 | PROJECTION = GLANCE7 27 | RESAMPLING = CC 28 | 29 | # RADIOMETRIC CORRECTION OPTIONS 30 | # ------------------------------------------------------------------------ 31 | DO_ATMO = TRUE 32 | DO_TOPO = FALSE 33 | DO_BRDF = TRUE 34 | ADJACENCY_EFFECT = TRUE 35 | MULTI_SCATTERING = TRUE 36 | 37 | # WATER VAPOR CORRECTION OPTIONS 38 | # ------------------------------------------------------------------------ 39 | DIR_WVPLUT = NULL 40 | WATER_VAPOR = NULL 41 | 42 | # AEROSOL OPTICAL DEPTH OPTIONS 43 | # ------------------------------------------------------------------------ 44 | DO_AOD = TRUE 45 | DIR_AOD = NULL 46 | 47 | # CLOUD DETECTION OPTIONS 48 | # ------------------------------------------------------------------------ 49 | MAX_CLOUD_COVER_FRAME = 75 50 | MAX_CLOUD_COVER_TILE = 75 51 | CLOUD_THRESHOLD = 0.225 52 | SHADOW_THRESHOLD = 0.02 53 | 54 | # RESOLUTION MERGING 55 | # ------------------------------------------------------------------------ 56 | RES_MERGE = IMPROPHE 57 | 58 | # CO-REGISTRATION OPTIONS 59 | # ------------------------------------------------------------------------ 60 | DIR_COREG_BASE = NULL 61 | COREG_BASE_NODATA = -9999 62 | 63 | # MISCELLANEOUS OPTIONS 64 | # ------------------------------------------------------------------------ 65 | IMPULSE_NOISE = TRUE 66 | BUFFER_NODATA = FALSE 67 | 68 | # TIER LEVEL 69 | # ------------------------------------------------------------------------ 70 | TIER = 1 71 | 72 | # PARALLEL PROCESSING 73 | # ------------------------------------------------------------------------ 74 | # Multiprocessing options (NPROC, DELAY) only apply when using the batch 75 | # utility force-level2. They are not used by the core function force-l2ps. 
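# Added note (based on the accompanying force/README.md): set NPROC to match the number of
# CPU cores reserved in the batch job (force_batch_job.sh reserves --cpus-per-task=8).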
76 | # ------------------------------------------------------------------------ 77 | NPROC = 8 78 | NTHREAD = 2 79 | PARALLEL_READS = FALSE 80 | DELAY = 3 81 | TIMEOUT_ZIP = 30 82 | 83 | # OUTPUT OPTIONS 84 | # ------------------------------------------------------------------------ 85 | OUTPUT_FORMAT = GTiff 86 | OUTPUT_DST = FALSE 87 | OUTPUT_AOD = FALSE 88 | OUTPUT_WVP = FALSE 89 | OUTPUT_VZN = FALSE 90 | OUTPUT_HOT = FALSE 91 | OUTPUT_OVV = TRUE 92 | 93 | ++PARAM_LEVEL2_END++ 94 | -------------------------------------------------------------------------------- /force/README.md: -------------------------------------------------------------------------------- 1 | # FORCE example & benchmarks 2 | 3 | This is an example of using FORCE to process L1 Sentinel images to L2 using the **force-level2** command. FORCE documentation can be found here 4 | 5 | https://force-eo.readthedocs.io/en/latest/index.html 6 | 7 | ## Repository content 8 | 9 | * **file_queue.txt** - the queue file that has all Sentinel images to be processed 10 | * **LEVEL2_parameters.prm** - the parameter file which holds all processing related parameters. Remember to change the NPROC to number of CPUs you reserved 11 | * **force_batch_job.sh** - the batch job file used to submit the job to Puhti 12 | 13 | ## Benchmarks 14 | 15 | Processing 4 L1C Sentinel-images to L2A. Test images can be found from /appl/data/geo/sentinel/s2_example_data/L1C 16 | 17 | relevant parameters in .prm file 18 | 19 | **DO_TOPO = FALSE** 20 | **NPROC = number of CPU you reserved in the batch job file** 21 | **NTHREAD = 2** 22 | 23 | ### 4CPU 24 | 25 | * Nodes: 1 26 | * Cores per node: 4 27 | * CPU Utilized: 01:33:09 28 | * CPU Efficiency: 94.60% of 01:38:28 core-walltime 29 | * Job Wall-clock time: 00:24:37 30 | 31 | ### 8CPU 32 | 33 | * Nodes: 1 34 | * Cores per node: 8 35 | * CPU Utilized: 01:35:51 36 | * CPU Efficiency: 80.05% of 01:59:44 core-walltime 37 | * Job Wall-clock time: 00:14:58 38 | 39 | ### 16CPU 40 | 41 | * Nodes: 1 42 | * Cores per node: 16 43 | * CPU Utilized: 01:37:39 44 | * CPU Efficiency: 42.19% of 03:51:28 core-walltime 45 | * Job Wall-clock time: 00:14:28 46 | 47 | # CONCLUSION 48 | 49 | From the benchmark runs, it seems that **a good rule of thumb is that the optimal number of CPU cores is approximately twice the amount of images processed in parallel.** In this example we had 4 images and 8 CPU cores which produced **80% CPU** efficiency, but 16 CPU cores only **42%**. 50 | 51 | Maximum number of available CPU cores for FORCE in Puhti is one full node which is **40 CPU** cores 52 | 53 | This example used approximately **34GB** memory while processing 4 images at the same time. 
54 | 55 | -------------------------------------------------------------------------------- /force/file_queue.txt: -------------------------------------------------------------------------------- 1 | /appl/data/geo/sentinel/s2_example_data/L1C/S2A_MSIL1C_20200320T094031_N0209_R036_T34VFM_20200320T101331.SAFE QUEUED 2 | /appl/data/geo/sentinel/s2_example_data/L1C/S2A_MSIL1C_20200422T095031_N0209_R079_T35VLG_20200422T133517.SAFE QUEUED 3 | /appl/data/geo/sentinel/s2_example_data/L1C/S2B_MSIL1C_20200407T095029_N0209_R079_T35VLG_20200407T115232.SAFE QUEUED 4 | /appl/data/geo/sentinel/s2_example_data/L1C/S2B_MSIL1C_20200427T095029_N0209_R079_T35VLG_20200427T115137.SAFE QUEUED -------------------------------------------------------------------------------- /force/force_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 4 | #SBATCH --time=01:00:00 # Maximum duration of the job. Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=8 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem=40G # Real memory required per node. 8 | 9 | module load force 10 | srun force-level2 /users/johannes/force/LEVEL2_parameters.prm 11 | -------------------------------------------------------------------------------- /gdal/gdal_batch_job_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ToDo: change project name in the row below 3 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 4 | # SBATCH --reservation=geocomputing_wed # Only available during the course 5 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 6 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 7 | #SBATCH --time 0:05:00 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 10 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 11 | #SBATCH --mem-per-cpu=300 # Minimum memory required per usable allocated CPU. Default units are megabytes. 12 | 13 | # Load geoconda module to have GDAL commandline tools available. 14 | module load parallel geoconda 15 | 16 | # Find the files that have .tif ending, we do not want to process the .tif.aux.xml files in the same folders. 17 | # Run the GDAL script for each of the found files. 18 | 19 | find /appl/data/geo/mml/dem10m/2019/W3/W33 -name '*.tif' | \ 20 | parallel -j $SLURM_CPUS_PER_TASK bash gdal_parallel.sh {} 21 | -------------------------------------------------------------------------------- /gdal/gdal_batch_job_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ToDo: change project name in the row below 3 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 4 | # SBATCH --reservation=geocomputing_wed # Only available during the course 5 | #SBATCH --output=slurm-%j.out # File to write the standard output to. 
%j is replaced by the job ID. 6 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 7 | #SBATCH --time 0:05:00 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 10 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 11 | #SBATCH --mem-per-cpu=300 # Minimum memory required per usable allocated CPU. Default units are megabytes. 12 | 13 | # Load geoconda module to have GDAL commandline tools available. 14 | module load geoconda 15 | 16 | # Run the bash script, which includes the GDAL commands. 17 | srun bash gdal_serial.sh 18 | -------------------------------------------------------------------------------- /gdal/gdal_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get the file name given as input (argument) for this script. 4 | in=$1 5 | 6 | # Define output file name, based on input file name 7 | out=$(basename $in) 8 | 9 | # Change the coordinate system to EPSG:2393, which is the old Finnish YKJ (=KKJ3) 10 | gdalwarp $in $out -of COG -t_srs EPSG:2393 -overwrite 11 | -------------------------------------------------------------------------------- /gdal/gdal_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Find the files that have .tif ending, we do not want to process the .tif.aux.xml files in the same folders. 4 | for i in $(find /appl/data/geo/mml/dem10m/2019/W3/W33 -name '*.tif') 5 | 6 | # Process the files 7 | do 8 | # Define output file name, based on input file name 9 | out=$(basename $i) 10 | # Change the coordinate system to EPSG:2393, which is the old Finnish YKJ (=KKJ3) 11 | # ToDo: change project name and username in the row below 12 | gdalwarp $i /scratch/project_20xxxxx/students/cscusername/geocomputing/gdal/$out -of COG -t_srs EPSG:2393 13 | done 14 | -------------------------------------------------------------------------------- /grass/01_serial_cli/grass_cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | #Set file paths and names 5 | DEMFILE="/appl/data/geo/mml/dem10m/2019/V4/V41/V4132.tif" 6 | GRASSINPUT="V4132" 7 | GRASSOUTPUT="contours" 8 | OUTPUT="/scratch/project_2000599/grass/output/V4132.gpkg" 9 | 10 | # Register external GeoTIFF in current mapset: 11 | r.external input=$DEMFILE output=$GRASSINPUT --verbose --overwrite 12 | 13 | # Set GRASS region 14 | g.region raster=$GRASSINPUT 15 | 16 | # Perform GRASS analysis, here calculate contours from DEM 17 | r.contour in=$GRASSINPUT out=$GRASSOUTPUT minlevel=200 maxlevel=800 step=10 --overwrite 18 | 19 | #Write output to file 20 | v.out.ogr input=$GRASSOUTPUT output=$OUTPUT --overwrite 21 | 22 | # These can be left out, just debug info 23 | echo "\n\n ***DEBUG INFO***" 24 | echo "GRASS version" 25 | g.version 26 | 27 | echo "GRASS env settings: gisdatabase, location, mapset" 28 | g.gisenv 29 | 30 | echo "Available datasets:" 31 | g.list type=all -m 32 | 33 | echo "Input file info" 34 | r.info $GRASSINPUT --verbose 35 | 36 | echo "Output info" 37 | v.info $GRASSOUTPUT --verbose -------------------------------------------------------------------------------- /grass/01_serial_cli/grass_cli_serial.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 4 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 5 | #SBATCH --time=0:05:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 7 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 8 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 9 | #SBATCH --mem-per-cpu=4000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | 11 | module load grassgis 12 | 13 | # Run the GRASS script with temporary location 14 | grass --tmp-location EPSG:3067 --exec bash grass_cli.sh 15 | -------------------------------------------------------------------------------- /grass/02_python_scripting_serial/python_scripting_serial.py: -------------------------------------------------------------------------------- 1 | import grass.script as gscript 2 | import grass.script as gcore 3 | import pprint 4 | 5 | import json 6 | 7 | file='/appl/data/geo/mml/dem10m/2019/W3/W33/W3331.tif' 8 | grassfile='W3331' 9 | grasscontoursfile='W3331_contours' 10 | contoursfile="/scratch/project_2000599/grass/output/V4132.gpkg" 11 | 12 | # Register external GeoTIFF in current mapset: 13 | gscript.parse_command("r.external", input=file,output=grassfile,flags="e",overwrite=True) 14 | 15 | # Set GRASS region 16 | gscript.run_command('g.region', rast=grassfile) 17 | 18 | # Perform GRASS analysis, here calculate contours from DEM 19 | gscript.run_command('r.contour', input=grassfile, output=grasscontoursfile, minlevel=200, maxlevel=800, step=10, overwrite=True) 20 | 21 | #Write output to file 22 | gscript.run_command('v.out.ogr', input=grasscontoursfile, output=contoursfile, overwrite=True) 23 | 24 | # These can be left out, just debug info 25 | # TODO: not working properly! 26 | print( "\n\n ***DEBUG INFO***") 27 | print( "GRASS version") 28 | print(gscript.read_command("g.version")) 29 | 30 | print("\nGRASS env settings: gisdatabase, location, mapset") 31 | print(gscript.read_command("g.gisenv", flags="s")) 32 | 33 | print("\nAvailable datasets:") 34 | print(gscript.read_command("g.list", type="all", flags='m')) 35 | 36 | print("\nInput file info") 37 | print(gscript.read_command("r.info", map=grassfile, flags='g')) 38 | 39 | print("\nOutput info") 40 | print(gscript.read_command("v.info", map=grasscontoursfile, flags='g')) -------------------------------------------------------------------------------- /grass/02_python_scripting_serial/python_scripting_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 4 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 5 | #SBATCH --time=0:05:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --partition=test # Which queue to use. 
Defines maximum time, memory, tasks, nodes and local storage for job 7 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 8 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 9 | #SBATCH --mem-per-cpu=4000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | 11 | module load grassgis 12 | 13 | # Run the GRASS Python script with temporary location 14 | grass --tmp-location EPSG:3067 --exec python3 python_scripting_serial.py 15 | -------------------------------------------------------------------------------- /grass/03_pygrass_serial/pygrass_serial.py: -------------------------------------------------------------------------------- 1 | from grass.pygrass.modules.shortcuts import general as g 2 | from grass.pygrass.modules.shortcuts import raster as r 3 | from grass.pygrass.modules.shortcuts import vector as v 4 | #from grass.pygrass.modules.shortcuts import temporal as t 5 | 6 | from grass.pygrass.modules.grid import GridModule 7 | 8 | file='/appl/data/geo/mml/dem10m/2019/W3/W33/W3331.tif' 9 | grassfile='W3331' 10 | grasscontoursfile='W3331_contours' 11 | contoursfile="/scratch/project_2000599/grass/output/V4132.gpkg" 12 | 13 | # Register external GeoTIFF in current mapset: 14 | r.external(input=file,output=grassfile,flags="e",overwrite=True) 15 | 16 | # Set GRASS region 17 | g.region(raster=grassfile) 18 | 19 | # Perform GRASS analysis, here calculate contours from DEM 20 | r.contour(input=grassfile, output=grasscontoursfile, minlevel=200, maxlevel=800, step=10, overwrite=True) 21 | 22 | #Write output to file 23 | v.out_ogr(input=grasscontoursfile, output=contoursfile, overwrite=True) 24 | 25 | # These can be left out, just debug info 26 | # TODO: not working properly! 27 | print( "\n\n ***DEBUG INFO***") 28 | print( "GRASS version") 29 | print(g.version()) 30 | 31 | print("GRASS env settings: gisdatabase, location, mapset") 32 | print(g.gisenv()) 33 | 34 | print("Available datasets:") 35 | print(g.list(type="all", flags='m')) 36 | 37 | print("Input file info") 38 | print(r.info(map=grassfile, verbose=True)) 39 | 40 | print("Output info") 41 | print(v.info(map=grasscontoursfile, verbose=True)) -------------------------------------------------------------------------------- /grass/03_pygrass_serial/pygrass_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 4 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 5 | #SBATCH --time=0:05:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 7 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 8 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 9 | #SBATCH --mem-per-cpu=4000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 
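# Added note (hedged, following grass/readme.md): with bigger datasets the temporary GRASS
# location can fill the default TMPDIR; one option is to point it to scratch first, e.g.
#   export TMPDIR=/scratch/project_200xxxx/grass/tmp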
10 | 11 | module load grassgis 12 | 13 | # Run the PyGRASS script with temporary location 14 | grass --tmp-location EPSG:3067 --exec python3 pygrass_serial.py -------------------------------------------------------------------------------- /grass/04_pygrass_parallel/pygrass_parallel_with_gridmodule.py: -------------------------------------------------------------------------------- 1 | import grass.script as gscript 2 | from grass.pygrass.modules.shortcuts import general as g 3 | from grass.pygrass.modules.shortcuts import raster as r 4 | from grass.pygrass.modules.shortcuts import vector as v 5 | #from grass.pygrass.modules.shortcuts import temporal as t 6 | 7 | from grass.pygrass.modules.grid import GridModule 8 | 9 | file='/appl/data/geo/mml/dem10m/2019/W3/W33/W3331.tif' 10 | grassfile='W3331' 11 | grasscontoursfile='W3331_contours' 12 | aspectfile="/scratch/project_2000599/grass/output/aspect.tif" 13 | cpus=4 14 | 15 | # Register external GeoTIFF in current mapset: 16 | r.external(input=file,output=grassfile,flags="e",overwrite=True) 17 | 18 | # Set GRASS region 19 | g.region(raster=grassfile) 20 | 21 | #Perform GRASS analysis, here calculate contours from DEM, parallelization with GridModule 22 | region = gscript.region() 23 | width = region['cols'] // 2 + 1 24 | height = region['rows'] // 2 + 1 25 | 26 | grd = GridModule('r.slope.aspect', 27 | width=width, height=height, overlap=2, 28 | processes=cpus, split=False, 29 | elevation=grassfile, 30 | aspect='aspect', overwrite=True) 31 | grd.run() 32 | 33 | # grd = GridModule('r.contour', 34 | # width=width, height=height, overlap=20, 35 | # processes=cpus, input=grassfile, 36 | # output=grasscontoursfile, 37 | # minlevel=200, maxlevel=800, step=10, overwrite=True) 38 | # grd.run() 39 | 40 | #Write output to file 41 | r.out_gdal(input='aspect', output=aspectfile, overwrite=True) 42 | #r.out_ogr(input=grasscontoursfile, output=outfile, overwrite=True) 43 | 44 | # These can be left out, just debug info 45 | g.version() 46 | g.gisenv() 47 | g.list(type="all", flags='m') 48 | r.info(map=grassfile, verbose=True) 49 | v.info(map=grasscontoursfile, verbose=True) 50 | -------------------------------------------------------------------------------- /grass/04_pygrass_parallel/pygrass_parallel_with_gridmodule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 4 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 5 | #SBATCH --time=0:05:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 7 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 8 | #SBATCH --ntasks=4 # Number of tasks. Upper limit depends on partition. 9 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 
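# Added note (assumption, not in the original scripts): pygrass_parallel_with_gridmodule.py
# hard-codes cpus=4 for GridModule; keep that in sync with the --ntasks=4 reservation above,
# for example by reading the value from the $SLURM_NTASKS environment variable in the script.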
10 | 
11 | module load grassgis
12 | 
13 | # Run the PyGRASS script with temporary location
14 | grass --tmp-location EPSG:3067 --exec python3 pygrass_parallel_with_gridmodule.py
15 | 
-------------------------------------------------------------------------------- /grass/readme.md: --------------------------------------------------------------------------------
1 | # GRASS GIS example batch jobs for Puhti supercomputer
2 | 
3 | * [GRASS shell scripts](https://grasswiki.osgeo.org/wiki/GRASS_Python_Scripting_Library). [Example](01_serial_cli)
4 | * [GRASS Python Scripting Library](https://grasswiki.osgeo.org/wiki/GRASS_Python_Scripting_Library). [Example](02_python_scripting_serial)
5 | * [PyGRASS](https://grasswiki.osgeo.org/wiki/Python/pygrass). Examples: [basic serial](03_pygrass_serial) and [parallel with GridModule](04_pygrass_parallel)
6 | * In these examples a temporary location is used; in many cases it is better to use a permanent GRASS mapset and location. If using a temporary location with bigger datasets, use compute nodes with a [local NVMe disk](https://docs.csc.fi/computing/running/creating-job-scripts-puhti/#local-storage), which have more temporary space available, or set TMPDIR to a folder in scratch (`export TMPDIR=/scratch/project_200xxxx/grass/tmp`
7 | ).
8 | Python Scripting suits simpler cases where chaining existing tools is enough; PyGRASS enables data access from Python.
9 | See the [GRASS page in CSC Docs](https://docs.csc.fi/apps/grass/#references) for additional external references.
10 | 
-------------------------------------------------------------------------------- /machineLearning/README.md: --------------------------------------------------------------------------------
1 | 
2 | # Scripts for the CSC Practical machine learning for spatial data course have been moved to https://github.com/csc-training/GeoML .
3 | 
4 | 
5 | 
-------------------------------------------------------------------------------- /noppe/Readme.md: --------------------------------------------------------------------------------
1 | # Noppe
2 | 
3 | CSC Noppe is a service providing a common, easy-to-use exercise environment for courses. Noppe currently supports Jupyter Lab/Notebooks and RStudio. Current log-in options are: CSC accounts, HAKA, Virtu and MOOC.fi.
4 | It is possible to use existing Docker containers or, if you have special requirements, you can also create the Docker containers yourself.
5 | 
6 | * [Noppe documentation](https://docs.csc.fi/cloud/csc_notebooks/), see the teacher's guide for setting up a Noppe application for your own course.
7 | * [Noppe log-in](https://noppe.csc.fi) 8 | * [Installation files of CSC and UH GIS courses](https://github.com/csc-training/course_computing_environments/tree/main/noppe) 9 | -------------------------------------------------------------------------------- /pdal/01_crop_pipeline.json: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | { 5 | "pipeline":[ 6 | "/appl/data/geo/mml/laserkeilaus/2008_latest/2008/L413/1/L4131H3.laz", 7 | { 8 | "type":"filters.crop", 9 | "bounds":"([379591,379978],[6673858,6674143])" 10 | }, 11 | { 12 | "type":"writers.las", 13 | "filename":"output.laz" 14 | } 15 | ] 16 | } 17 | 18 | -------------------------------------------------------------------------------- /pdal/01_split_laz.sh: -------------------------------------------------------------------------------- 1 | mkdir -p data 2 | 3 | origin_x=379190 4 | origin_y=6673340 5 | piece_size=400 6 | for x in 0 1 7 | do 8 | for y in 0 1 9 | do 10 | bb=($[$origin_x+$piece_size*$x] $[$origin_x+$piece_size*($x+1)] $[$origin_y+$piece_size*$y] $[$origin_y+$piece_size*($y+1)]) 11 | echo pdal pipeline 01_crop_pipeline.json --filters.crop.bounds="([${bb[0]},${bb[1]]}],[${bb[2]},${bb[3]}])" --writers.las.filename=data/part_$x$y.laz 12 | pdal pipeline 01_crop_pipeline.json --filters.crop.bounds="([${bb[0]},${bb[1]]}],[${bb[2]},${bb[3]}])" --writers.las.filename=data/part_$x$y.laz 13 | 14 | done 15 | done 16 | 17 | 18 | -------------------------------------------------------------------------------- /pdal/02_pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipeline":[ 3 | "data/part_00.laz", 4 | { 5 | "type":"filters.smrf", 6 | "window":33, 7 | "slope":1.0, 8 | "threshold":0.15, 9 | "cell":1.0 10 | }, 11 | { 12 | "type":"filters.range", 13 | "limits":"Classification[2:2]" 14 | }, 15 | { 16 | "type":"writers.gdal", 17 | "filename":"data/exercise2.tif", 18 | "output_type":"min", 19 | "resolution":1.0 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /pdal/03_batch_job_gnu_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 4 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 5 | 6 | #Partition you want to submit your job to. 7 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | 9 | #Time limit for the job in hh:mm:ss, Once this amount of time has passed the job will be terminated regardless of weather it has finished. 10 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 11 | 12 | #Tells the batch job system that this is not a parallel task and only one task should be used. Note that this is one task per job, but array job will actually launch 3 simultaneous jobs. 13 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 14 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 15 | 16 | #Tells the batch job sytem to reserve 1000MB (1GB) of memory for each of the 3 jobs. 17 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. 
Default units are megabytes. 18 | 19 | #As the job is not run on the login node where we submit the job from, the necessary modules have to be loaded in the batch job script. Loading the modules on the login node will not help. 20 | module load parallel geoconda 21 | #Change to the directory where you have the files 22 | 23 | cd /scratch/project_2000599/geocomputing/pdal 24 | 25 | 26 | find data -name '*.laz' | \ 27 | parallel -I{} -j $SLURM_CPUS_PER_TASK pdal pipeline --readers.las.filename=data/{/.}.laz --writers.gdal.filename=data/{/.}.tif 02_pipeline.json {} -------------------------------------------------------------------------------- /pdal/04_batch_job_array.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #Billing project for the job 3 | #SBATCH --account=project_200xxxx # Choose the project to be billed 4 | 5 | #Output file. Everything that would normally be printed to the terminal when you run a program gets printed to this file. The %j refers to the job number so that you don't overwrite the same file for each job 6 | #SBATCH --output=output_%j.txt # File to write the standard output to. 7 | 8 | #As above but for error messages. It's however not always clear which messages go to the error file and which to the output file, so it's best to check both. 9 | #SBATCH --error=error_%j.txt # File to write the standard error to. 10 | 11 | #Partition you want to submit your job to. In this exercise we use the small partition; the test partition can be used for quick tests, but it shouldn't be used for serious work. See [CSC Docs](https://docs.csc.fi) for details about available partitions. 12 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 13 | 14 | #Time limit for the job in hh:mm:ss. Once this amount of time has passed the job will be terminated regardless of whether it has finished. 15 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 16 | 17 | #Tells the batch job system that this is an array job that should be run 4 times. During each run the $SLURM_ARRAY_TASK_ID variable will get a different value, ranging from 1 to 4. This will be used to select different input files. 18 | #SBATCH --array=1-4 # Indices to specify what array index values should be used. Multiple values may be specified using a comma separated list or a range of values separated by -. 19 | 20 | #Tells the batch job system that this is not a parallel task and only one task should be used. Note that this is one task per job, but the array job will actually launch 4 simultaneous jobs. 21 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 22 | 23 | #Tells the batch job system to reserve 1000MB (1GB) of memory for each of the 4 jobs. 24 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 25 | 26 | #As the job is not run on the login node where we submit the job from, the necessary modules have to be loaded in the batch job script. Loading the modules on the login node will not help. 27 | module load geoconda 28 | #Change to the directory where you have the files 29 | 30 | cd /scratch/project_2000599/geocomputing/pdal 31 | #Read the file to be processed from a list of input files. This is done by getting the line corresponding to $SLURM_ARRAY_TASK_ID from the input file list.
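#For example, when SLURM_ARRAY_TASK_ID is 2, the command below becomes input=$(sed -n "2"p 04_filelist.csv),
#which picks the second line of 04_filelist.csv, in this case data/part_01.laz, so every array task gets its own input file.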
32 | input=$(sed -n "$SLURM_ARRAY_TASK_ID"p 04_filelist.csv) 33 | 34 | #Create output name from input by exchanging .laz to .tif. 35 | name=$(echo "$input" | cut -f 1 -d '.') 36 | output=data/$(echo "$name" | cut -f 2 -d '/').tif 37 | 38 | 39 | #Run the pipeline as in previous exercise. Note that it is possible to override input and output files in your pipeline json from the commandline. 40 | pdal pipeline --readers.las.filename=$input --writers.gdal.filename=$output 02_pipeline.json 41 | 42 | 43 | -------------------------------------------------------------------------------- /pdal/04_filelist.csv: -------------------------------------------------------------------------------- 1 | data/part_00.laz 2 | data/part_01.laz 3 | data/part_10.laz 4 | data/part_11.laz 5 | -------------------------------------------------------------------------------- /pdal/07_batch_job_python.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 5 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 6 | #SBATCH --mem-per-cpu=4G # Minimum memory required per usable allocated CPU. Default units are megabytes. 7 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | 9 | module load geoconda 10 | srun python 07_pdal_ground.py -------------------------------------------------------------------------------- /pdal/07_pdal_ground.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from mpl_toolkits.mplot3d import Axes3D 4 | from multiprocessing import Pool 5 | from pathlib import Path 6 | import json 7 | import matplotlib.pyplot as plt 8 | import pandas as pd 9 | import pdal 10 | import time 11 | 12 | #Location of .laz files, relative to the script 13 | input_dir="data" 14 | 15 | ## How many parallel processes do we want to use 16 | parallel_processes = 4 17 | 18 | # Filter laz with SMRF and create Pandas dataframe with points in it. 19 | def pdal2df(input_file): 20 | 21 | pipe = [ 22 | input_file, 23 | { 24 | "type":"filters.smrf", 25 | "window":33, 26 | "slope":1.0, 27 | "threshold":0.15, 28 | "cell":1.0 29 | } 30 | ] 31 | 32 | pipeline = pdal.Pipeline(json.dumps(pipe)) 33 | pipeline.validate() # check if our JSON and options were good 34 | pipeline.loglevel = 8 #really noisy 35 | count = pipeline.execute() 36 | arrays = pipeline.arrays 37 | arr = pipeline.arrays[0] 38 | description = arr.dtype.descr 39 | cols = [col for col, __ in description] 40 | df = pd.DataFrame({col: arr[col] for col in cols}) 41 | 42 | return df 43 | 44 | # Plot as 3D plot, green if ground red if not. 
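# (Classification value 2 is the standard ASPRS/LAS class code for ground points, which is what
# filters.smrf assigns in pdal2df above; all other points are drawn in red.)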
45 | def plot_df(df, input_file): 46 | fig = plt.figure() 47 | ax = fig.add_subplot(111, projection='3d') 48 | df = df.sample(frac=0.05) 49 | colors=['green' if c==2 else 'red' for c in df.Classification.tolist()] 50 | ax.scatter(df.X.tolist(),df.Y.tolist(),df.Z.tolist(), c=colors) 51 | plt.savefig(input_file.replace('laz','png')) 52 | 53 | # Procesing steps for one file 54 | def process_laz(input_file): 55 | input_file = str(input_file) 56 | print(input_file) 57 | df = pdal2df(input_file) 58 | print(df) 59 | plot_df(df, input_file) 60 | 61 | # Start the script find laz files in data folder and parallelize its processing 62 | def main(): 63 | # Find laz files on local disk 64 | file_list = Path(input_dir).rglob('*.laz') 65 | 66 | ## Create a pool of workers and run the function process_laz for each filepath in the list 67 | pool = Pool(parallel_processes) 68 | pool.map(process_laz, file_list) 69 | 70 | if __name__ == '__main__': 71 | ## This part is the first to execute when script is ran. It times the execution time and rans the main function 72 | start = time.time() 73 | main() 74 | end = time.time() 75 | print("Script completed in " + str(end - start) + " seconds") -------------------------------------------------------------------------------- /pouta/README.md: -------------------------------------------------------------------------------- 1 | # Using CSC's Pouta platform for geocomputing applications 2 | A collection of instructions to setup virtual machines in [CSC's cPouta environment](https://docs.csc.fi/cloud/pouta/) for different tools: 3 | - [GeoServer or OpenDroneMap as Docker applications](./docker_geoserver_or_opendronemap) - installing other Docker applications would be very similar. 4 | - [MetaShape](./metashape_with_VNC) - installing other Desktop tools could be rather similar. 5 | - [ArcPy](./arcpy) 6 | - Earlier also **PostGIS** was recommended to be installed to cPouta, now [Pukki](https://docs.csc.fi/cloud/dbaas/) is more suitable for databases. 7 | -------------------------------------------------------------------------------- /pouta/arcpy/ArcGIS_Server_manual_installation.sh: -------------------------------------------------------------------------------- 1 | 2 | ##Installing ArcGIS for server in a cPouta instance 3 | 4 | # This is a guideline for setting up a single CentOS 7.o instance running ArcGIS for Server Enterprise 5 | 6 | #You will need an ArcGIS Linux installation package and licence that you can get from ESRI (or ask CSC personnel for a university license). 7 | 8 | ##Set up your cPouta instance 9 | 10 | # In pouta.csc.fi open Access & Security and create a security group that has a rule allowing SSH. 11 | # If you want to use your ArcGIS server with a ArcMap Desktop client you need to add rules to allow 12 | # ingress and egress for the following TCP ports: 13 | # 4000-4003, 6080(HTTP), 6443(HTTPS) 14 | # Some references about arcgis server ports: 15 | # - http://server.arcgis.com/en/server/latest/install/linux/arcgis-server-system-requirements.htm 16 | # - http://server.arcgis.com/en/server/latest/install/linux/ports-used-by-arcgis-server.htm 17 | 18 | # In pouta.csc.fi Access & Security, create a key pair and download the private key on your computer. 19 | # 20 | # Launch the instance in pouta.csc.fi with the flavor you need and set the boot source to CentOS-7. 21 | # Set the instance's keypair and security groups properly. 22 | # 23 | # In pouta.csc.fi, associate a Floating IP for the instance and establish a SSH connection with a 24 | # SSH client. 
The username is cloud-user. Use the private key of the keypair you chose for 25 | # authentication. 26 | # 27 | # Now you should be in your instance. 28 | 29 | ##### 30 | ## Prepare the installation 31 | ##### 32 | #The following command installs all the dependencies 33 | sudo yum install fontconfig mesa-libGL mesa-libGLU libXtst libXext libX11 libXi libXdmcp libXrender libXau xorg-x11-server-Xvfb libXfont -y 34 | 35 | # Get the ArcGIS installation package and unpack it. 36 | # For example if you have the file in Taito you can a command like this from the cPouta instance terminal: 37 | scp username@taito-shell.csc.fi:/homeappl/home/username/ArcGIS_for_Server_Linux_xxxx_xxxxxx.tar.gz ~/ 38 | 39 | # You need to authorize your installation with a .prvc or .ecp file. 40 | # "provision" a file from my.esri.com and save it somewhere on the virtual instance. 41 | # The provision file is quite short so you can simply copy & paste it with a text editor or 42 | # send it to the instance using echo PROVISIONFILECONTENTS > ~/_Server.prvc or whatever you 43 | # feel is the most straightforward way. 44 | 45 | # Unpack the ArcGIS installation package: 46 | 47 | tar xvzf ArcGIS_for_Server_Linux_xxxx_xxxxxx.tar.gz 48 | 49 | # ArcGIS for Server requires you to increase the maximum number of open files. 50 | # Ref: http://server.arcgis.com/en/server/10.4/install/linux/arcgis-for-server-system-requirements.htm 51 | # You can do that with: 52 | echo cloud-user - nofile 65535 | sudo tee -a /etc/security/limits.conf 53 | echo cloud-user - nproc 25059 | sudo tee -a /etc/security/limits.conf 54 | 55 | # Log in & log out for the changes to the limits to take effect. 56 | 57 | ##### 58 | ## Install ArcGIS for server 59 | ##### 60 | 61 | # Now we are ready to actually install 62 | 63 | cd ArcGISServer / 64 | ./Setup -m silent -l Yes -a /home/cloud-user/provision_file_ArcGIS_Server.prvc 65 | 66 | # The installation will take some minutes. 67 | 68 | # If you need only Python and arcpy you don't need to start the server. 69 | # Note that you have to use ArcGIS' own Python installation instead of the 70 | # default system installation. Python on ArcGIS Server for Linux runs 71 | # a Windows version of Python under Wine. 72 | 73 | # You start the ArcGIS Python console with: 74 | /home/cloud-user/arcgis/server/tools/python 75 | 76 | ##### 77 | ## Test installation with a simple ArcPy script 78 | ##### 79 | # The test_data folder includes a test elevation file dem.tif and a simple 80 | # script that makes loads some arcpy libraries and uses the FlowDirection 81 | # function (see http://pro.arcgis.com/en/pro-app/tool-reference/spatial-analyst/flow-direction.htm). 82 | # The result is store to the ./test_data/output/ directory 83 | # 84 | # Move the test_data folder to the ArcGIS Server instance 85 | # and run my_arcpy_script.py from there with: 86 | /home/cloud-user/arcgis/server/tools/python my_arcpy_script.py 87 | 88 | # You are done! 89 | -------------------------------------------------------------------------------- /pouta/arcpy/ansible_preparations.md: -------------------------------------------------------------------------------- 1 | # Ansible preparations 2 | Setting up a working Ansible environment may not be trivial, especially if you are not an experienced Linux user. 3 | 4 | Below you will find some information and hints on how to preapre a working environment for Ansible. 
5 | 6 | ## cPouta account 7 | These are the minimum requirements before you can start using example Ansible playbooks: 8 | 9 | - A pouta project with key-pairs and security groups to make connecting from your local machine possible. Instructions: https://docs.csc.fi/cloud/pouta/launch-vm-from-web-gui/ 10 | 11 | - cPouta project's API access file, see [Configure your terminal environment for OpenStack](https://docs.csc.fi/cloud/pouta/install-client/#configure-your-terminal-environment-for-openstack) 12 | 13 | ## Computer environment 14 | 15 | It is recommended to use a computer with a Linux operating system. Most of the instructions you will find here assume that you are working with a Linux computer. Note, that you can create a Linux virtual machine in cPouta and install the necessary tools and settings into it as necessary OR use Windows Linux Subsystem. 16 | 17 | ### Ansible tools 18 | 19 | You need to have an environment with the necessary tools to run an ansible script: python, [openstack-client](https://docs.csc.fi/cloud/pouta/install-client), ansible and shade. 20 | 21 | 22 | ### Setting up automatic access to keypairs and servers 23 | 24 | In order for the ansible scripts to run smoothly, you will need to make sure that 25 | the processes don't need interaction from the user and that the necessary keypairs 26 | are loaded. 27 | 28 | Some hints that may help: 29 | - make sure that the key pair Ansible is using to contact the remote server is 30 | available. For example with: 31 | ````bash 32 | # Start a ssh agent to automatically manage your keypairs 33 | eval $(ssh-agent -s) 34 | # Add your keypair to the ssh agent 35 | ssh-add ~/.ssh/your_private_key.pem 36 | # this private key is the one corresponding to the key pair name specified in 37 | # the Ansible script. 38 | ```` 39 | - by default, new remote connections have to be confirmed manually, which would interrupt the workflow of an Ansible script. To avoid new servers' fingerprint interactive checks, set the following environment variable: 40 | ````bash 41 | export ANSIBLE_HOST_KEY_CHECKING=False 42 | ```` 43 | -------------------------------------------------------------------------------- /pouta/arcpy/ansible_run_arcpy.yml: -------------------------------------------------------------------------------- 1 | # Ansible demo for reusing an existing ArcGIS Server volume 2 | # (see ansible_install_arcpy.yml playbook) to restart a remote virtual machine 3 | # on CSC's cPouta and running simple ArcPy script. 4 | # 5 | # You'll need to have your Ansible environment properly setup and modify 6 | # the Ansible variables to your own: 7 | # - NAME_OF_YOUR_KEY: the name of the key pair as seen in your cPouta project 8 | # - NAME_OF_YOUR_SECURITY_GROUP: the name of a security group as seen in your cPouta project 9 | # 10 | # This Ansible script assumes that you have the "test_data" folder in the same 11 | # directory where this Ansible script is. This is how those are defined in the code: 12 | # test_data_dir: test_data 13 | # demo_script: my_arcpy_script.py 14 | # 15 | # The script copies the "test_data" folder to the remote virtual machine, 16 | # then executes the example "my_arcpy_script.py" Python script and copies the 17 | # results to the local machine to the subfolder "test_data/results". 18 | #Then the remote "test_data" folder is removed and the virtual machine is deleted. 19 | --- 20 | - name: Create virtual machine from existing ArcGIS Server on cPouta 21 | hosts: localhost # The OpenStack modules run on your local machine. 
22 | connection: local 23 | vars: 24 | demo_key: NAME_OF_YOUR_KEY 25 | demo_sg: NAME_OF_YOUR_SECURITY_GROUP 26 | demo_security_groups: default, {{ demo_sg }} 27 | arcgis_server_vol: arcpy-volume 28 | demo_instance: test-arcpy 29 | cpouta_flavor: standard.tiny 30 | 31 | tasks: 32 | - name: Create a virtual machine from existing ArcGIS Server volume 33 | register: result 34 | os_server: 35 | name: "{{ demo_instance }}" 36 | flavor: "{{ cpouta_flavor }}" 37 | key_name: "{{ demo_key }}" 38 | security_groups: "{{ demo_security_groups }}" 39 | boot_volume: "{{ arcgis_server_vol }}" 40 | 41 | - name: Add new host to inventory 42 | add_host: name={{ result.server.public_v4 }} groups=arcpy_nodes 43 | 44 | - name: clear ssh known_hosts 45 | known_hosts: name={{ result.server.public_v4 }} state=absent 46 | when: result | changed 47 | 48 | - name: Wait for instance to be ready 49 | wait_for: host={{ result.server.public_v4 }} port=22 search_regex=OpenSSH delay=3 50 | 51 | - name: Run example script in cPouta instance 52 | hosts: arcpy_nodes 53 | remote_user: cloud-user 54 | vars: 55 | test_data_dir: test_data 56 | demo_script: my_arcpy_script.py 57 | 58 | tasks: 59 | - synchronize: 60 | mode: push 61 | src: ./{{ test_data_dir }}/ 62 | dest: ~/{{ test_data_dir }}/ 63 | 64 | - name: run ArcPy script 65 | shell: ~/arcgis/server/tools/python ~/{{ test_data_dir }}/{{ demo_script }} 66 | 67 | - synchronize: 68 | mode: pull 69 | src: ~/{{ test_data_dir }}/ 70 | dest: ./{{ test_data_dir }}/results 71 | 72 | - name: Delete remote directory 73 | file: 74 | state: absent 75 | path: ~/{{ test_data_dir }}/ 76 | 77 | - name: Destroy the computing ArcGIS Server instance (the ArcGIS Server volume remains) 78 | hosts: localhost 79 | vars: 80 | demo_instance: test-arcpy 81 | 82 | tasks: 83 | - name: Destroy a VM 84 | os_server: 85 | name: "{{ demo_instance }}" 86 | state: absent 87 | ... 
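# Usage sketch: with the cPouta API access (openrc) file sourced and the key pair loaded into ssh-agent
# (see ansible_preparations.md), this playbook can be run with: ansible-playbook ansible_run_arcpy.yml
# (the variable values above, e.g. NAME_OF_YOUR_KEY, still need to be filled in first).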
88 | -------------------------------------------------------------------------------- /pouta/arcpy/test_data/my_arcpy_script.py: -------------------------------------------------------------------------------- 1 | import arcpy 2 | from arcpy.sa import * 3 | import os 4 | 5 | arcpy.env.overwriteOutput = True 6 | 7 | directory = "./output/" 8 | 9 | if not os.path.exists(directory): 10 | os.makedirs(directory) 11 | 12 | outFlowDirection = FlowDirection("./dem.tif", "NORMAL") 13 | outFlowDirection.save(directory+"flowdir.tif") 14 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/ansible.cfg: -------------------------------------------------------------------------------- 1 | [ssh_connection] 2 | ssh_args = -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=10 -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/group_vars/all.yml: -------------------------------------------------------------------------------- 1 | --- 2 | key_name: kylli_pouta_2024 3 | os_image: "Ubuntu-22.04" 4 | instance_flavor: "standard.small" # This will affect your billing, select one suitable for you https://docs.csc.fi/cloud/pouta/vm-flavors-and-billing/#cpouta-flavors 5 | instance_name: "ubuntu-docker-kylli" 6 | 7 | internal_ips: # Please change this to your own, you can use https://apps.csc.fi/myip to check your IP 8 | - 0.0.0.0/0 9 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/install-geoserver.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Deploy VM 4 | hosts: localhost 5 | collections: 6 | - openstack.cloud 7 | vars: 8 | installed_packages: "geoserver" 9 | 10 | roles: 11 | - openstack 12 | 13 | - name: Install Docker and GeoServer 14 | hosts: created_instances 15 | collections: 16 | - openstack.cloud 17 | vars: 18 | geoserver_version: "2.25.2" # If you want to install a different version of GeoServer, change this 19 | geoserver_datadir: "/geoserver_data" 20 | 21 | roles: 22 | - docker 23 | - geoserver 24 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/install-odm.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Deploy VM 4 | hosts: localhost 5 | collections: 6 | - openstack.cloud 7 | vars: 8 | installed_packages: "opendronemap" 9 | 10 | roles: 11 | - openstack 12 | 13 | - name: Install Docker and OpenDroneMap 14 | hosts: created_instances 15 | gather_facts: false 16 | collections: 17 | - openstack.cloud 18 | vars: 19 | images_dir: "/data/images" 20 | docker_name: "odm" 21 | 22 | roles: 23 | - docker 24 | - opendronemap -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/requirements.yml: -------------------------------------------------------------------------------- 1 | --- 2 | collections: 3 | - name: openstack.cloud 4 | - name: community.docker 5 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/roles/docker/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # This tutorial is for Ubuntu 4 | - name: Create Docker group 5 | become: true 6 | group: 7 | name: docker 8 | 
state: present 9 | 10 | - name: Add user to Docker group 11 | become: true 12 | user: 13 | name: ubuntu 14 | groups: docker 15 | append: yes 16 | 17 | - name: Update all packages 18 | become: true 19 | apt: 20 | name: "*" 21 | update_cache: yes 22 | state: latest 23 | 24 | - name: Add signing key 25 | become: true 26 | apt_key: 27 | url: "https://download.docker.com/linux/ubuntu/gpg" 28 | state: present 29 | 30 | - name: Add Docker repository into sources list 31 | become: true 32 | apt_repository: 33 | repo: deb https://download.docker.com/linux/ubuntu/ jammy stable 34 | state: present 35 | 36 | - name: Install Docker 37 | become: true 38 | apt: 39 | name: 40 | - docker-ce 41 | - docker-ce-cli 42 | - containerd.io 43 | - docker-compose-plugin 44 | state: latest 45 | update_cache: True 46 | 47 | - name: Start Docker 48 | service: 49 | name: docker 50 | enabled: True 51 | state: started -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/roles/geoserver/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Get GeoServer container info 4 | community.docker.docker_container_info: 5 | name: geoserver 6 | register: geoserver_status 7 | 8 | - name: Pull GeoServer image 9 | community.docker.docker_image: 10 | name: docker.osgeo.org/geoserver:{{ geoserver_version }} 11 | source: pull 12 | pull: 13 | platform: amd64 14 | when: not geoserver_status.exists 15 | 16 | - name: Create Geoserver data directory 17 | file: 18 | path: "{{geoserver_datadir}}" 19 | state: directory 20 | mode: '0770' 21 | owner: ubuntu 22 | become: yes 23 | 24 | - name: Start GeoServer 25 | community.docker.docker_container: 26 | name: geoserver 27 | image: docker.osgeo.org/geoserver:{{ geoserver_version }} 28 | state: started 29 | restart: true 30 | ports: 31 | - "8080:8080" 32 | mounts: 33 | - type: "bind" 34 | target: /opt/geoserver_data 35 | source: "{{geoserver_datadir}}" 36 | env: # These are only needed when using extensions, here ysld is used as an example 37 | INSTALL_EXTENSIONS: "true" 38 | STABLE_EXTENSIONS: "ysld" 39 | # If you want to use Community modules, add COMMUNITY_EXTENSIONS followed by the modules you want to use 40 | # e.g. 
COMMUNITY_EXTENSIONS: "ogcapi-features,ogcapi-images" 41 | 42 | - name: GeoServer info 43 | debug: 44 | msg: 45 | - "You now have GeoServer available at: http://{{ hostvars['localhost']['server_facts']['servers'][0]['access_ipv4'] }}:8080/geoserver/" 46 | - "To login in to the virtual machine, connect with: ssh ubuntu@{{ hostvars['localhost']['server_facts']['servers'][0]['access_ipv4'] }}" 47 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/roles/opendronemap/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Get OpenDroneMap info 4 | community.docker.docker_container_info: 5 | name: "{{ docker_name }}" 6 | register: opendronemap_status 7 | 8 | - name: Create Geoserver data directory 9 | file: 10 | path: "{{images_dir}}" 11 | state: directory 12 | mode: '0770' 13 | owner: ubuntu 14 | become: yes 15 | 16 | - name: Pull OpenDroneMap Image 17 | community.docker.docker_image: 18 | name: opendronemap/odm 19 | source: pull 20 | pull: 21 | platform: amd64 22 | when: not opendronemap_status.exists 23 | 24 | - name: OpenDroneMap info 25 | debug: 26 | msg: 27 | - "To login in to the virtual machine, connect with: ssh ubuntu@{{ hostvars['localhost']['server_facts']['servers'][0]['access_ipv4'] }}" 28 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/roles/openstack/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - set_fact: 4 | security_group: "{{ instance_name }}-security-group" 5 | 6 | - name: Create security group 7 | openstack.cloud.security_group: 8 | state: present 9 | name: "{{ security_group }}" 10 | description: "Security group for {{ instance_name }}" 11 | 12 | - name: Add port 22 opening to ips {{ internal_ips }} to rule {{ instance_name }} 13 | openstack.cloud.security_group_rule: 14 | state: present 15 | security_group: "{{ security_group }}" 16 | protocol: tcp 17 | port_range_min: 22 18 | port_range_max: 22 19 | remote_ip_prefix: "{{ item }}" 20 | with_items: 21 | - "{{ internal_ips }}" 22 | 23 | - name: Add port 5432 opening to ips {{ internal_ips }} to rule {{ instance_name }} 24 | openstack.cloud.security_group_rule: 25 | state: present 26 | security_group: "{{ instance_name }}-security-group" 27 | protocol: tcp 28 | port_range_min: 5432 29 | port_range_max: 5432 30 | remote_ip_prefix: "{{ item }}" 31 | with_items: 32 | - "{{ internal_ips }}" 33 | when: installed_packages == "postgis" 34 | 35 | - name: Add port 8080 opening to ips {{ internal_ips }} to rule {{ instance_name }} 36 | openstack.cloud.security_group_rule: 37 | state: present 38 | security_group: "{{ instance_name }}-security-group" 39 | protocol: tcp 40 | port_range_min: 8080 41 | port_range_max: 8080 42 | remote_ip_prefix: "0.0.0.0/0" 43 | when: installed_packages == "geoserver" 44 | 45 | - name: Add port 8082 opening to ips {{ internal_ips }} to rule {{ instance_name }} 46 | openstack.cloud.security_group_rule: 47 | state: present 48 | security_group: "{{ instance_name }}-security-group" 49 | protocol: tcp 50 | port_range_min: 8082 51 | port_range_max: 8082 52 | remote_ip_prefix: "0.0.0.0/0" 53 | when: installed_packages == "geoserver" 54 | 55 | - name: Create instance 56 | openstack.cloud.server: 57 | name: "{{ instance_name }}" 58 | state: present 59 | key_name: "{{ key_name }}" 60 | image: "{{ os_image }}" 61 | flavor: "{{ instance_flavor 
}}" 62 | security_groups: "default,{{security_group}}" 63 | metadata: 64 | group: "created_instances" 65 | register: servers 66 | 67 | - name: Acquire Floating IP 68 | openstack.cloud.floating_ip: 69 | server: "{{ item }}" 70 | network: "public" 71 | reuse: true 72 | wait: true 73 | timeout: 60 74 | delay: 1 75 | retries: 3 76 | async: 60 77 | poll: 0 78 | with_items: 79 | - "{{ servers.server }}" 80 | 81 | - name: Register openstack servers facts 82 | openstack.cloud.server_info: 83 | name: "{{ instance_name }}" 84 | register: server_facts 85 | 86 | - name: Add hosts to inventory 87 | add_host: 88 | hostname: "{{ item.name }}" 89 | group: "{{ item.metadata.group }}" 90 | ansible_host: "{{ item.access_ipv4 }}" 91 | ansible_user: ubuntu 92 | with_items: 93 | - "{{ server_facts.servers }}" 94 | 95 | - name: Wait for SSH connection 96 | wait_for: 97 | host: "{{ item.access_ipv4 }}" 98 | state: started 99 | port: 22 100 | delay: 0 101 | with_items: 102 | - "{{ server_facts.servers }}" 103 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # CSC geocomputing Python examples 2 | 3 | * [Puhti](./puhti/README.md) - serial/array/parallel processing with Python. How to parallelize your Python code with different methods for running in Puhti supercomputer. 4 | * [Working with Allas data from Python](./allas). Examples with S3 and Swift APIs 5 | * [Reading NLS topographic database geopackage with Python](./geopackage/README.md) 6 | * [GRASS multiprocessing from Python](./grass_multiprocessing_with_python/README.md) 7 | * [Routing](./routing/readme.md) Examples using NetworkX and igraph, serial and parallel. 8 | * [Sentinel data download from Finhub using sentinelsat](./sentinel/README.md) 9 | * [STAC, xarray and dask for downloading and processing data](./STAC/stac_xarray_dask_example.ipynb) 10 | * [Zonal statistics in parallel](./zonal_stats/README.md) using rasterstats, serial and parallel. 11 | * [Python Dask-geopandas](./dask_geopandas/README.md) using Dask-geopandas in spatial analysis. 12 | 13 | 14 | > **_NOTE:_** If you are using [Jupyter lab](https://jupyter.org/) on your own computer, the [Jupyter-github](https://github.com/jupyterlab/jupyterlab-github) extension provides you with the possibility to browse public github repositories within Jupyter. Install the extension, click on the little github/cat icon in the left bar and fill `csc-training/geocomputing` into the search field and press enter. This lets you open and run all python files and notebooks within this repository on your own computer. 15 | -------------------------------------------------------------------------------- /python/STAC/csc_stac_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # This example shows how to use STAC, dask and xarray via Python script. 5 | # If new to STAC, see the STAC_CSC_example.ipynb for longer explanation how STAC works. 6 | # In this example, we will search and download data through a STAC Catalog and process it using Dask and Xarray. 7 | # We will use Sentinel-1 data stored at FMI to compute a mean value of vv_mean for one month. The result will be saved to a new GeoTiff file. 
8 | 9 | import requests 10 | import stackstac 11 | from dask.distributed import Client, Lock 12 | import pystac_client 13 | import rioxarray 14 | import sys 15 | import os 16 | 17 | # Settings 18 | STAC_URL = "https://paituli.csc.fi/geoserver/ogc/stac/v1" 19 | collection = 'sentinel_1_11_days_mosaics_at_fmi' 20 | time_filter="2021-03-01/2021-03-31" 21 | asset = 'mean_vv' 22 | output_file = os.path.join(os.getcwd(), "sentinel1_mean_vv.tif") 23 | 24 | # Use as many workers as you have available cores 25 | no_of_workers = len(os.sched_getaffinity(0)) 26 | 27 | def find_items_from_stac(): 28 | catalog = pystac_client.Client.open(STAC_URL) 29 | search_bbox = catalog.search( 30 | collections=[collection], 31 | datetime=time_filter 32 | ) 33 | return search_bbox.item_collection() 34 | 35 | def main(): 36 | 37 | # Create Dask client 38 | # Because STAC+xarray analysis is usually slowed down by data download speed, then it is good to use 1 core per worker. 39 | # If you have computationally heavy analysis, this could be changed to several cores per worker. 40 | client = Client(n_workers=no_of_workers) 41 | 42 | item_collection = find_items_from_stac() 43 | 44 | # Use the `stackstac` library to convert item collection to Xarray DataArray. 45 | cube = stackstac.stack( 46 | items=item_collection, 47 | assets=[asset], 48 | #chunksize=(-1,1,2046,2046), 49 | epsg=3067 50 | ).squeeze() 51 | 52 | # Create new data cube for the mean value. 53 | mean = cube.mean("time", keep_attrs=True) 54 | 55 | # Compute and save the result 56 | mean_ndvi_tiff = mean.rio.to_raster( 57 | output_file, 58 | lock=Lock(name="rio", client=client), 59 | tiled=True, 60 | ) 61 | 62 | # Close Dask cluster 63 | client.close() 64 | 65 | # With Dask, it is important to use the main function 66 | if __name__ == "__main__": 67 | main() 68 | print("Analysis ready") 69 | -------------------------------------------------------------------------------- /python/STAC/csc_stac_example_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_2000599 # Choose the project to be billed 3 | #SBATCH --time=00:20:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 5 | #SBATCH --cpus-per-task=10 # How many processors work on one task. Upper limit depends on number of CPUs per node. 6 | #SBATCH --mem-per-cpu=10G # Minimum memory required per allocated CPU. Default units are megabytes. 7 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | 9 | # Load the geoconda module which has Python with Dask, Xarray and STAC libraries 10 | module load geoconda 11 | 12 | # Run the Python code. 
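# (The whole batch job is submitted from the command line with: sbatch csc_stac_example_batch_job.sh)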
13 | python csc_stac_example.py 14 | -------------------------------------------------------------------------------- /python/STAC/environment.yml: -------------------------------------------------------------------------------- 1 | name: stac 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - gdal 7 | - geopandas 8 | - dask 9 | - jupyterlab 10 | - pyproj 11 | - pystac-client 12 | - pystac 13 | - requests 14 | - rioxarray 15 | - stackstac 16 | - libgdal-jp2openjpeg 17 | - python-graphviz 18 | - jupyter-resource-usage 19 | - dask-labextension 20 | -------------------------------------------------------------------------------- /python/STAC/img/DEM_data_source_cpu_walltime.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/python/STAC/img/DEM_data_source_cpu_walltime.gif -------------------------------------------------------------------------------- /python/STAC/img/DEM_tile_size_cpu_walltime.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/python/STAC/img/DEM_tile_size_cpu_walltime.gif -------------------------------------------------------------------------------- /python/STAC/img/S1_data_source_cpu_walltime.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/python/STAC/img/S1_data_source_cpu_walltime.gif -------------------------------------------------------------------------------- /python/STAC/img/S1_tile_size_cpu_walltime.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/python/STAC/img/S1_tile_size_cpu_walltime.gif -------------------------------------------------------------------------------- /python/allas/working_with_allas_from_Python_S3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Dec 4 13:57:52 2019 5 | @author: ekkylli 6 | Updated 7.7.2025 7 | """ 8 | 9 | # Example script for using Allas directly from an Python script: 10 | # - Reading raster and vector files 11 | # - Writing raster and vector files 12 | # - Looping over all files of certain type in a bucket 13 | 14 | # Please notice that this example works ONLY with GDAL-based libraries for spatial data: rasterio, geopandas etc. 15 | 16 | # The required packages depend on the task 17 | # For working with rasters 18 | import rasterio 19 | # For working with vectors 20 | import geopandas as gpd 21 | # For listing files and writing to Allas 22 | import boto3 23 | import os 24 | 25 | 26 | # Before starting to use S3, set up your credentials and endpoint. 27 | # This example here applies for using Allas from CSC Puhti or Mahti supercomputers. 28 | # To use some other S3 stroage or from some other computer, 29 | # See https://docs.csc.fi/support/tutorials/gis/gdal_cloud/#s3-connection-details 30 | # 31 | # 1) Set up your credentials to Allas: 32 | # module load allas 33 | # allas-conf --mode s3cmd 34 | # This is needed only once, as long as you are using the same CSC project. 
35 | # This also sets S3 endopoint to .aws/config file in a way understandable for boto3 library, but not for GDAL. 36 | # 37 | # 2) Set S3-endpoint for GDAL-library: 38 | # module load allas 39 | # OR 40 | os.environ["AWS_S3_ENDPOINT"] = "a3s.fi" 41 | # This sets AWS_S3_ENDPOINT environment variable to "a3s.fi". 42 | # Environment variables are cleaned after session end, so it must be set again in each new session. 43 | 44 | # If you want to WRITE files with rasterio/geopandas directly to Allas, set also this. 45 | os.environ["CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE"] = "YES" 46 | 47 | # Reading raster file 48 | r = rasterio.open('/vsis3/name_of_your_Allas_bucket/name_of_your_input_raster_file.tif') 49 | input_data = r.read() 50 | 51 | # Writing raster file 52 | with rasterio.open('/vsis3/name_of_your_Allas_bucket/name_of_your_output_raster_file.tif', 'w', **r.profile) as dst: 53 | dst.write(input_data) 54 | 55 | # Reading vector file 56 | v = gpd.read_file('/vsis3/name_of_your_Allas_bucket/name_of_your_input_vector_file.gpkg') 57 | 58 | # Writing vector file 59 | v.to_file('/vsis3/name_of_your_Allas_bucket/name_of_your_output_vector_file.gpkg', layer='layername', driver="GPKG") 60 | 61 | # Looping through all files in a bucket, find ones that are tifs. 62 | # Then print the extent of each file as example. 63 | 64 | # Create connection to S3 storage 65 | os.environ["AWS_REQUEST_CHECKSUM_CALCULATION"] = "when_required" 66 | os.environ["AWS_RESPONSE_CHECKSUM_VALIDATION"] = "when_required" 67 | s3_resource = boto3.resource('s3') 68 | 69 | # By default boto3 is connecting to Amazon S3, to use custom endpoint, define it in .aws/config file as done by allas-conf --mode s3cmd 70 | # OR define it in the Python code 71 | # s3_resource = boto3.resource("s3", endpoint_url='https://a3s.fi') 72 | 73 | my_bucket = s3_resource.Bucket('name_of_your_Allas_bucket') 74 | 75 | for my_bucket_object in my_bucket.objects.all(): 76 | if (my_bucket_object.key.endswith('.tif')): 77 | filePath = '/vsis3/gis-int2/' + my_bucket_object.key 78 | print(filePath) 79 | r = rasterio.open(filePath) 80 | print(r.bounds) 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /python/allas/working_with_allas_from_Python_Swift.py: -------------------------------------------------------------------------------- 1 | import swiftclient 2 | import rasterio 3 | import geopandas as gpd 4 | from rasterio.io import MemoryFile 5 | import tempfile 6 | import os 7 | 8 | """ 9 | Example script for using Allas directly from a Python script with swift library 10 | Created on 27.01.2020 by Johannes Nyman 11 | """ 12 | 13 | ### 1. 
Establishing the Swift connection to Allas 14 | 15 | # You need to run the following commands in Puhti to get the authentication to Allas active 16 | 17 | """ 18 | module load allas 19 | allas-conf 20 | """ 21 | 22 | # These exist after running allas-conf 23 | _authurl = os.environ['OS_STORAGE_URL'] 24 | _auth_token = os.environ['OS_AUTH_TOKEN'] 25 | _project_name = os.environ['OS_PROJECT_NAME'] 26 | _user = os.environ['OS_USERNAME'] 27 | 28 | 29 | # Various settings for connecting to Puhti 30 | _auth_version = '3' 31 | _os_options = { 32 | 'user_domain_name': 'Default', 33 | 'project_domain_name': 'Default', 34 | 'project_name': _project_name 35 | } 36 | 37 | # Creating the connection client 38 | conn = swiftclient.Connection( 39 | user=_user, 40 | preauthurl=_authurl, 41 | preauthtoken=_auth_token, 42 | os_options=_os_options, 43 | auth_version=_auth_version 44 | ) 45 | 46 | ### 1. Download a file from Allas to local filesystem 47 | obj = '' 48 | container = '' 49 | file_output = '' 50 | headers, raster = conn.get_object(container, obj) 51 | with open(file_output, 'bw') as f: 52 | f.write(raster) 53 | 54 | ### 2. Writing a raster file to Allas using the Swift library 55 | fp = "" 56 | bucket_name = '' 57 | raster = rasterio.open(fp) 58 | input_data = raster.read() 59 | 60 | # The file is written to memory first and then uploaded to Allas 61 | with MemoryFile() as mem_file: 62 | with mem_file.open(**raster.profile) as dataset: 63 | dataset.write(input_data) 64 | conn.put_object(bucket_name, os.path.basename(fp), contents=mem_file) 65 | 66 | 67 | ### 3. Writing a vector file to Allas using the Swift library 68 | fp = "" 69 | bucket_name = '' 70 | vector = gpd.read_file(fp) 71 | 72 | # The file is written to memory first and then uploaded to Allas 73 | tmp = tempfile.NamedTemporaryFile() 74 | vector.to_file(tmp, layer='test', driver="GPKG") 75 | tmp.seek(0) # Moving pointer to the beginning of temp file. 76 | conn.put_object(bucket_name, os.path.basename(fp) ,contents=tmp) 77 | 78 | 79 | ### 5. Looping through buckets and files inside your project 80 | resp_headers, containers = conn.get_account() 81 | for container in containers: 82 | print(container['name']) 83 | for data in conn.get_container(container['name'])[1]: 84 | print("\t" + container['name'] + "/" + data['name']) 85 | 86 | -------------------------------------------------------------------------------- /python/dask_geopandas/README.md: -------------------------------------------------------------------------------- 1 | # Dask geopandas example 2 | > Dask-GeoPandas is a project merging the geospatial capabilities of GeoPandas and the scalability of Dask. GeoPandas is an open source project designed to make working with geospatial data in Python easier. GeoPandas extends the datatypes used by pandas to allow spatial operations on geometric types. Dask provides advanced parallelism and distributed out-of-core computation with a dask.dataframe module designed to scale pandas. 3 | 4 | In general, one can work with Dask-GeoDataFrames as they are regular GeoDataFrames. A good approach would be to start solving a problem using plain GeoPandas, because for small data problems, Dask-GeoPandas generates a significant overhead. Only after one would run into memory or performance issues with GeoPandas, they should switch to Dask-GeoPandas with one partition having less than 1GB of data in it. 5 | 6 | Unfortunately, Dask-GeoPandas provides only a limited number of operations. 
Before using dask-geopandas, check if the method that you need is available in [Dask-GeoPandas](https://dask-geopandas.readthedocs.io/en/stable/api.html). 7 | 8 | In this example, we will use Finnish addresses (osoitteet), and based on post code data, we will assign each address its post code. To do that, we will load two shapefiles into GeoDataFrames and perform a spatial join. In the end, we compare the execution times of both dask-geopandas and plain geopandas. 9 | 10 | To launch this notebook in Puhti, you need JupyterLab with at least 5GB of memory and 4 cores. 11 | 12 | ### Documentation 13 | - [Dask-geopandas documentation](https://dask-geopandas.readthedocs.io/en/stable/) 14 | - [CSC Dask tutorial](https://docs.csc.fi/support/tutorials/dask-python/) 15 | - [Jupyter in Puhti supercomputer](https://docs.csc.fi/computing/webinterface/jupyter/) 16 | - [Dask batch jobs with Puhti](https://github.com/csc-training/geocomputing/tree/master/python/puhti/05_parallel_dask) 17 | -------------------------------------------------------------------------------- /python/geopackage/README.md: -------------------------------------------------------------------------------- 1 | ## Reading NLS topographic database geopackage with Python 2 | The NLS topographic database has been saved into several geopackage files in Puhti at /appl/data/geo/mml/maastotietokanta/20XX/gpkg. The larger layers are in their own gpkg files and the smaller layers have been bundled into a single file. Reading the large layers in full takes some time, but if the whole layer is not needed, Geopandas can be used to read only the desired parts of the files. The examples can be found in the [read_gpkg.py](read_gpkg.py) script. 3 | 4 | In Puhti the [geoconda](https://docs.csc.fi/apps/geoconda/) module can be used. 5 | 6 | Similar examples for reading the geopackage with R using the sf package can be found [here](https://github.com/csc-training/geocomputing/tree/master/R/geopackage). 7 | -------------------------------------------------------------------------------- /python/geopackage/list_layers_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jun 15 08:35:45 2018 5 | 6 | @author: ekkylli 7 | 8 | This file lists the layers of a GeoPackage, the number of features in each layer and their type.
9 | """ 10 | 11 | from osgeo import ogr 12 | 13 | data = ogr.Open('/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-vakavesi_20-02-06.gpkg') 14 | 15 | print('Data Name:', data.GetName()) 16 | 17 | # get a layer with GetLayer('layername'/layerindex) 18 | for layer in data: 19 | print('Layer Name:', layer.GetName()) 20 | print('Layer Feature Count:', len(layer)) 21 | 22 | layer_defn = layer.GetLayerDefn() 23 | for i in range(layer_defn.GetGeomFieldCount()): 24 | # some times the name doesn't appear 25 | # but the type codes are well defined 26 | print(layer_defn.GetGeomFieldDefn(i).GetName(), layer_defn.GetGeomFieldDefn(i).GetType()) 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /python/geopackage/make_each_layer_a_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 19 15:38:52 2018 5 | 6 | @author: ekkylli 7 | 8 | Code for saving all layers of GeoPackage as separate files 9 | """ 10 | import os 11 | from osgeo import gdal, ogr 12 | 13 | #OutputFolder 14 | outFolder='layers' 15 | 16 | #Check that the folder exists 17 | if not os.path.exists(outFolder): 18 | os.makedirs(outFolder) 19 | 20 | #Make error messages visible 21 | gdal.UseExceptions() #Fail when can't open! 22 | def gdal_error_handler(err_class, err_num, err_msg): 23 | errtype = { 24 | gdal.CE_None:'None', 25 | gdal.CE_Debug:'Debug', 26 | gdal.CE_Warning:'Warning', 27 | gdal.CE_Failure:'Failure', 28 | gdal.CE_Fatal:'Fatal' 29 | } 30 | err_msg = err_msg.replace('\n',' ') 31 | err_class = errtype.get(err_class, 'None') 32 | print ('Error Number: %s' % (err_num)) 33 | print ('Error Type: %s' % (err_class)) 34 | print ('Error Message: %s' % (err_msg)) 35 | 36 | #Enable error handler USE THIS FIRST TO SEE THE ERRORS, remove late for faster throughput 37 | #It seems that some field width warnings are given when actual data is ok. 38 | #gdal.PushErrorHandler(gdal_error_handler) 39 | 40 | #Disable error handler 41 | #gdal.PopErrorHandler() 42 | 43 | # Note, the original GeoPackage is opened with both ogr and gdal. 44 | # TODO, it might not be necessary actually 45 | ogrDS = ogr.Open('/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-vakavesi_20-02-06.gpkg') 46 | gdalDS = gdal.OpenEx('/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-vakavesi_20-02-06.gpkg', gdal.OF_VECTOR) 47 | 48 | # get a layer with GetLayer('layername'/layerindex) 49 | for layer in ogrDS: 50 | 51 | # Generate the name for new file 52 | layerName = layer.GetName() 53 | print("Saving layer " + layerName) 54 | outFile=os.path.join(outFolder,layerName +'.gpkg') 55 | 56 | # Remove output shapefile if it already exists 57 | outDriver = ogr.GetDriverByName('GPKG') 58 | if os.path.exists(outFile): 59 | outDriver.DeleteDataSource(outFile) 60 | 61 | #Save file with gdal, only one layer per file 62 | ds1 = gdal.VectorTranslate(outFile, gdalDS, layers = [layerName] , format = 'GPKG') 63 | #Important, this is the way to save the file! 64 | del ds1 65 | -------------------------------------------------------------------------------- /python/geopackage/read_gpkg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | Examples for reading data from NLS geopackage with geopandas, fiona and sqlite3. 4 | The geopackges are rather big, so reading the whole file might not be optimal. 
5 | We can however read parts of it quickly without having to inspect each row as shown in examples below: 6 | """ 7 | import geopandas as gpd 8 | import fiona 9 | fn_muut= "/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-muut_20-02-06.gpkg" 10 | fn_suo = "/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-suo_20-02-06.gpkg" 11 | 12 | """ 13 | Reading a layer into a dataframe. Some layers are large, but for smaller layers this can be quick enough. 14 | """ 15 | def read_whole_layer(): 16 | df=gpd.read_file(fn_muut, layer="hylky") 17 | print("Hylky:\n",df.head()) 18 | 19 | """ 20 | Reading an area specified by a boundingbox from a single layer into a dataframe. 21 | Geopandas takes advantage of geopackage's spatial indexing and this is a fast operation even on large layers. 22 | For line and polygon geometries all features that at least intersect bounding box are selected. 23 | """ 24 | def read_area(): 25 | 26 | bb=(374692, 6671989, 379750, 6676677) 27 | df=gpd.read_file(fn_suo, layer="suo", bbox=bb) 28 | print("\n\nSuo:\n",df.head()) 29 | 30 | 31 | """ 32 | Reading rows in range 10-20. Again only the rows that we want will be read. 33 | """ 34 | def read_rows_in_range(): 35 | c = fiona.open(fn_suo,layer="suo") 36 | start=10 37 | end=20 38 | df=gpd.GeoDataFrame.from_features(c[start:end]) 39 | print(df) 40 | 41 | """ 42 | Reading specific rows. As above but for specific row numbers rather than a range of rows. 43 | """ 44 | def read_specific_rows(): 45 | with fiona.open(fn_suo,layer="suo") as c: 46 | rows = (1,5,100) 47 | df=gpd.GeoDataFrame.from_features([c[i] for i in rows]) 48 | print(df) 49 | 50 | """ 51 | Reading rows where an attribute has a certain value (or based on any SQL query). 52 | Fiona and thus geopandas don't support reading only specifc rows based on an attribute. However if you really need to be able to do this fast you can do it by first creating an index for the column you want to use and then using sqlite to get numbers of rows you need. After this you can create dataframe as above. This method can of course be used to run any SQL query to first select IDs of the rows that we want before reading the data into memory. Weather these queries need to actually inspect each row or if faster execution is possible depends on the query and the indexes available. The main benefit here is that you can take advantage of additional indexes and you don't need to first read all the rows into geopandas dataframe. 53 | Geopandas also specifies read_postgis() method that you can use to accomplish the same end result, but using this with geopackage creates a need for some geometry type conversions that can be problematic. 
54 | """ 55 | 56 | import sqlite3 57 | def create_index(): 58 | table="suo" 59 | col="mtk_id" 60 | conn = sqlite3.connect(fn) 61 | c = conn.cursor() 62 | sql="CREATE INDEX index_{}_{} ON {} ({})".format(table, col, table, col) 63 | c.execute(sql) 64 | conn.commit() 65 | conn.close() 66 | 67 | def read_by_attribute(): 68 | conn = sqlite3.connect(fn) 69 | c = conn.cursor() 70 | layer="suo" 71 | attribute_col="mtk_id" 72 | attribute_val=219920480 73 | id_col = 'fid' 74 | sql="select {} from {} where {}={}".format(id_col, layer, attribute_col, attribute_val) 75 | c.execute(sql) 76 | rows = c.fetchall() 77 | rows = [r[0] for r in rows] 78 | print(rows) 79 | with fiona.open(fn,layer="suo") as c: 80 | df=gpd.GeoDataFrame.from_features([c[i] for i in rows]) 81 | print(df) 82 | 83 | if __name__=='__main__': 84 | read_whole_layer() 85 | read_specific_rows() 86 | read_rows_in_range() 87 | read_area() 88 | 89 | -------------------------------------------------------------------------------- /python/puhti/00_interactive/interactive_single_core_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example Python script how to calculate NDVI for one Sentinel satellite images 3 | using just 1 process. 4 | 5 | Author: Johannes Nyman, Kylli Ek, Samantha Wittke, Elias Annila CSC 6 | 7 | """ 8 | import os 9 | import sys 10 | import time 11 | import rasterio 12 | 13 | ### The filepath to one Sentinel image 14 | sentinel_image_path = "/appl/data/geo/sentinel/s2_example_data/L2A/S2B_MSIL2A_20190530T094039_N0212_R036_T36VUR_20190530T113343.SAFE" 15 | 16 | 17 | def readImage(image_folder_fp): 18 | print(f"Reading Sentinel image from: {image_folder_fp}") 19 | ### Rather than figuring out what the filepath inside SAFE folder is, this is just finding the red and nir files with correct endings 20 | for subdir, dirs, files in os.walk(image_folder_fp): 21 | for file in files: 22 | if file.endswith("_B04_10m.jp2"): 23 | red_fp = os.path.join(subdir, file) 24 | if file.endswith("_B08_10m.jp2"): 25 | nir_fp = os.path.join(subdir, file) 26 | ### Read the red and nir (near-infrared) band files with Rasterio 27 | red = rasterio.open(red_fp) 28 | nir = rasterio.open(nir_fp) 29 | ### Return the rasterio objects as a list 30 | return red, nir 31 | 32 | 33 | def calculateNDVI(red, nir): 34 | print("Computing NDVI") 35 | ### This function calculates NDVI from the red and nir bands 36 | ## Read the rasterio objects pixel information to numpy arrays 37 | red = red.read(1) 38 | nir = nir.read(1) 39 | ### Scale the image values back to real reflectance values (sentinel pixel values have been multiplied by 10000) 40 | red = red / 10000 41 | nir = nir / 10000 42 | ### the NDVI formula 43 | ndvi = (nir - red) / (nir + red) 44 | return ndvi 45 | 46 | 47 | def saveImage(ndvi, sentinel_image_path, input_image): 48 | ## Create an output folder to this location, if it does not exist 49 | outputdir = "output" 50 | if not os.path.exists(outputdir): 51 | os.makedirs(outputdir) 52 | ## Create output filepath for the image. We use the input name with _NDVI end 53 | output_file = os.path.join( 54 | outputdir, os.path.basename(sentinel_image_path).replace(".SAFE", "_NDVI.tif") 55 | ) 56 | print(f"Saving image: {output_file}") 57 | ## Copy the metadata (extent, coordinate system etc.) 
from one of the input bands (red) 58 | metadata = input_image.profile 59 | ## Change the data type from integer to float and file type from jp2 to GeoTiff 60 | metadata.update(dtype=rasterio.float64, driver="GTiff") 61 | ## Write the ndvi numpy array to a GeoTiff with the updated metadata 62 | with rasterio.open(output_file, "w", **metadata) as dst: 63 | dst.write(ndvi, 1) 64 | 65 | 66 | def processImage(sentinel_image_path): 67 | ### This function processes one image (read, compute, save) 68 | ## Read the image and get rasterio objects from the red nir bands 69 | red, nir = readImage(sentinel_image_path) 70 | ## Calculate NDVI and get the resulting numpy array 71 | ndvi = calculateNDVI(red, nir) 72 | ## Write the NDVI numpy array to file to the same extent as the red input band 73 | saveImage(ndvi, sentinel_image_path, red) 74 | 75 | 76 | def main(): 77 | ## run the process on input dir if it is a directory 78 | if os.path.isdir(sentinel_image_path): 79 | print(f"\nProcess of {sentinel_image_path} started") 80 | processImage(sentinel_image_path) 81 | print(f"Processing of {sentinel_image_path} done\n") 82 | 83 | 84 | if __name__ == "__main__": 85 | ## This part is the first to execute when script is ran. It times the execution time and runs the main function 86 | start = time.time() 87 | main() 88 | end = time.time() 89 | print(f"Script completed in {str(end - start)} seconds") 90 | -------------------------------------------------------------------------------- /python/puhti/01_serial/single_core_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example Python script how to calculate NDVI for one Sentinel satellite image 3 | using just 1 process. 4 | 5 | Author: Johannes Nyman, Kylli Ek, Samantha Wittke, Elias Annila CSC 6 | 7 | """ 8 | import os 9 | import sys 10 | import time 11 | import rasterio 12 | 13 | 14 | ### The filepath for the input Sentinel image that is given as input parameter 15 | sentinel_image_path = sys.argv[1] 16 | 17 | 18 | def readImage(image_folder_fp): 19 | print(f"Reading Sentinel image from: {image_folder_fp}") 20 | ### Rather than figuring out what the filepath inside SAFE folder is, this is just finding the red and nir files with correct endings 21 | for subdir, dirs, files in os.walk(image_folder_fp): 22 | for file in files: 23 | if file.endswith("_B04_10m.jp2"): 24 | red_fp = os.path.join(subdir, file) 25 | if file.endswith("_B08_10m.jp2"): 26 | nir_fp = os.path.join(subdir, file) 27 | ### Read the red and nir (near-infrared) band files with Rasterio 28 | red = rasterio.open(red_fp) 29 | nir = rasterio.open(nir_fp) 30 | ### Return the rasterio objects as a list 31 | return red, nir 32 | 33 | 34 | def calculateNDVI(red, nir): 35 | print("Computing NDVI") 36 | ### This function calculates NDVI from the red and nir bands 37 | ## Read the rasterio objects pixel information to numpy arrays 38 | red = red.read(1) 39 | nir = nir.read(1) 40 | ### Scale the image values back to real reflectance values (sentinel pixel values have been multiplied by 10000) 41 | red = red / 10000 42 | nir = nir / 10000 43 | ### the NDVI formula 44 | ndvi = (nir - red) / (nir + red) 45 | return ndvi 46 | 47 | 48 | def saveImage(ndvi, sentinel_image_path, input_image): 49 | ## Create an output folder to this location, if it does not exist 50 | outputdir = "output" 51 | if not os.path.exists(outputdir): 52 | os.makedirs(outputdir) 53 | ## Create output filepath for the image. 
We use the input name with _NDVI end 54 | output_file = os.path.join( 55 | outputdir, os.path.basename(sentinel_image_path).replace(".SAFE", "_NDVI.tif") 56 | ) 57 | print(f"Saving image: {output_file}") 58 | ## Copy the metadata (extent, coordinate system etc.) from one of the input bands (red) 59 | metadata = input_image.profile 60 | ## Change the data type from integer to float and file type from jp2 to GeoTiff 61 | metadata.update(dtype=rasterio.float64, driver="GTiff") 62 | ## Write the ndvi numpy array to a GeoTiff with the updated metadata 63 | with rasterio.open(output_file, "w", **metadata) as dst: 64 | dst.write(ndvi, 1) 65 | 66 | 67 | def processImage(sentinel_image_path): 68 | ### This function processes one image (read, compute, save) 69 | ## Read the image and get rasterio objects from the red nir bands 70 | red, nir = readImage(sentinel_image_path) 71 | ## Calculate NDVI and get the resulting numpy array 72 | ndvi = calculateNDVI(red, nir) 73 | ## Write the NDVI numpy array to file to the same extent as the red input band 74 | saveImage(ndvi, sentinel_image_path, red) 75 | 76 | 77 | def main(): 78 | print(f"\nProcess of {sentinel_image_path} started") 79 | processImage(sentinel_image_path) 80 | print(f"Processing of {sentinel_image_path} done\n") 81 | 82 | 83 | if __name__ == "__main__": 84 | ## This part is the first to execute when script is ran. It times the execution time and rans the main function 85 | start = time.time() 86 | main() 87 | end = time.time() 88 | print(f"Script completed in {str(end - start)} seconds") 89 | -------------------------------------------------------------------------------- /python/puhti/01_serial/single_core_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 5 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --cpus-per-task=1 # How many processors work on one task. Upper limit depends on number of CPUs per node. 9 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | 12 | module load geoconda 13 | 14 | ones2file=/appl/data/geo/sentinel/s2_example_data/L2A/S2B_MSIL2A_20190530T094039_N0212_R036_T36VUR_20190530T113343.SAFE 15 | 16 | srun python single_core_example.py $ones2file 17 | -------------------------------------------------------------------------------- /python/puhti/01_serial/single_core_example_folder.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example Python script how to calculate NDVI for three Sentinel satellite images 3 | using just 1 process. 
4 | For going through all the files, a for-loop is used in the main()- function 5 | 6 | Author: Johannes Nyman, Kylli Ek, Samantha Wittke, Elias Annila CSC 7 | 8 | """ 9 | import os 10 | import sys 11 | import time 12 | import rasterio 13 | 14 | 15 | ### The filepath for the input Sentinel image folder is an input argument to the script 16 | image_folder = sys.argv[1] 17 | 18 | 19 | def readImage(image_folder_fp): 20 | print(f"Reading Sentinel image from: {image_folder_fp}") 21 | ### Rather than figuring out what the filepath inside SAFE folder is, this is just finding the red and nir files with correct endings 22 | for subdir, dirs, files in os.walk(image_folder_fp): 23 | for file in files: 24 | if file.endswith("_B04_10m.jp2"): 25 | red_fp = os.path.join(subdir, file) 26 | if file.endswith("_B08_10m.jp2"): 27 | nir_fp = os.path.join(subdir, file) 28 | ### Read the red and nir (near-infrared) band files with Rasterio 29 | red = rasterio.open(red_fp) 30 | nir = rasterio.open(nir_fp) 31 | ### Return the rasterio objects as a list 32 | return red, nir 33 | 34 | 35 | def calculateNDVI(red, nir): 36 | print("Computing NDVI") 37 | ### This function calculates NDVI from the red and nir bands 38 | ## Read the rasterio objects pixel information to numpy arrays 39 | red = red.read(1) 40 | nir = nir.read(1) 41 | ### Scale the image values back to real reflectance values (sentinel pixel values have been multiplied by 10000) 42 | red = red / 10000 43 | nir = nir / 10000 44 | ### the NDVI formula 45 | ndvi = (nir - red) / (nir + red) 46 | return ndvi 47 | 48 | 49 | def saveImage(ndvi, sentinel_image_path, input_image): 50 | ## Create an output folder to this location, if it does not exist 51 | outputdir = "output" 52 | if not os.path.exists(outputdir): 53 | os.makedirs(outputdir) 54 | ## Create output filepath for the image. We use the input name with _NDVI end 55 | output_file = os.path.join( 56 | outputdir, os.path.basename(sentinel_image_path).replace(".SAFE", "_NDVI.tif") 57 | ) 58 | print(f"Saving image: {output_file}") 59 | ## Copy the metadata (extent, coordinate system etc.) from one of the input bands (red) 60 | metadata = input_image.profile 61 | ## Change the data type from integer to float and file type from jp2 to GeoTiff 62 | metadata.update(dtype=rasterio.float64, driver="GTiff") 63 | ## Write the ndvi numpy array to a GeoTiff with the updated metadata 64 | with rasterio.open(output_file, "w", **metadata) as dst: 65 | dst.write(ndvi, 1) 66 | 67 | 68 | def processImage(sentinel_image_path): 69 | ### This function processes one image (read, compute, save) 70 | ## Read the image and get rasterio objects from the red nir bands 71 | red, nir = readImage(sentinel_image_path) 72 | ## Calculate NDVI and get the resulting numpy array 73 | ndvi = calculateNDVI(red, nir) 74 | ## Write the NDVI numpy array to file to the same extent as the red input band 75 | saveImage(ndvi, sentinel_image_path, red) 76 | 77 | 78 | def main(): 79 | ## Loop the directory where all sentinel image folders are and run processImage function to them one by one 80 | for directory in os.listdir(image_folder): 81 | sentinel_image_path = os.path.join(image_folder, directory) 82 | if os.path.isdir(sentinel_image_path): 83 | print(f"\nProcess of {sentinel_image_path} started") 84 | processImage(sentinel_image_path) 85 | print(f"Processing of {sentinel_image_path} done\n") 86 | 87 | 88 | if __name__ == "__main__": 89 | ## This part is the first to execute when script is ran. 
It times the execution time and rans the main function 90 | start = time.time() 91 | main() 92 | end = time.time() 93 | print(f"Script completed in {str(end - start)} seconds") 94 | -------------------------------------------------------------------------------- /python/puhti/01_serial/single_core_example_folder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 5 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --cpus-per-task=1 # How many processors work on one task. Upper limit depends on number of CPUs per node. 9 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | 12 | module load geoconda 13 | 14 | data_folder=/appl/data/geo/sentinel/s2_example_data/L2A/ 15 | 16 | srun python single_core_example_folder.py $data_folder 17 | -------------------------------------------------------------------------------- /python/puhti/01_serial/single_core_example_list.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 5 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --cpus-per-task=1 # How many processors work on one task. Upper limit depends on number of CPUs per node. 9 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | 12 | module load geoconda 13 | 14 | #collect all filepaths in a text file 15 | readlink -f /appl/data/geo/sentinel/s2_example_data/L2A/S2* > image_path_list.txt 16 | 17 | #loop through files in txtfile 18 | while read ones2file; do 19 | srun python single_core_example.py $ones2file 20 | done image_path_list.txt 19 | 20 | parallel -a image_path_list.txt python gnu_parallel_example.py 21 | -------------------------------------------------------------------------------- /python/puhti/03_array/array_job_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example Python script how to calculate NDVI for three Sentinel satellite images 3 | with an array job. 4 | This script handles only ONE file, which is given as parameter to the script. 
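The matching batch script (array_job_example.sh) builds image_path_list.txt with readlink, picks the line matching $SLURM_ARRAY_TASK_ID with sed, and passes that path to this script as sys.argv[1].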
5 | 6 | Author: Johannes Nyman, Kylli Ek, Samantha Wittke, Elias Annila CSC 7 | 8 | """ 9 | 10 | import os 11 | import sys 12 | import time 13 | import rasterio 14 | 15 | ### The filepath for the input Sentinel image that is given as input parameter 16 | sentinel_image_path = sys.argv[1] 17 | 18 | 19 | def readImage(image_folder_fp): 20 | print("Reading Sentinel image from: %s" % (image_folder_fp)) 21 | 22 | ### Rather than figuring out what the filepath inside SAFE folder is, this is just finding the red and nir files with correct endings 23 | for subdir, dirs, files in os.walk(image_folder_fp): 24 | for file in files: 25 | if file.endswith("_B04_10m.jp2"): 26 | red_fp = os.path.join(subdir, file) 27 | if file.endswith("_B08_10m.jp2"): 28 | nir_fp = os.path.join(subdir, file) 29 | 30 | ### Read the red and nir (near-infrared) band files with Rasterio 31 | red = rasterio.open(red_fp) 32 | nir = rasterio.open(nir_fp) 33 | 34 | ### Return the rasterio objects as a list 35 | return red, nir 36 | 37 | 38 | def calculateNDVI(red, nir): 39 | print("Computing NDVI") 40 | ### This function calculates NDVI from the red and nir bands 41 | 42 | ## Read the rasterio objects pixel information to numpy arrays 43 | red = red.read(1) 44 | nir = nir.read(1) 45 | 46 | ### Scale the image values back to real reflectance values (sentinel pixel values have been multiplied by 10000) 47 | red = red / 10000 48 | nir = nir / 10000 49 | 50 | ### the NDVI formula 51 | ndvi = (nir - red) / (nir + red) 52 | return ndvi 53 | 54 | 55 | def saveImage(ndvi, sentinel_image_path, input_image): 56 | ## Create an output folder to this location, if it does not exist 57 | outputdir = "output" 58 | if not os.path.exists(outputdir): 59 | os.makedirs(outputdir) 60 | ## Create output filepath for the image. We use the input name with _NDVI end 61 | output_file = os.path.join( 62 | outputdir, os.path.basename(sentinel_image_path).replace(".SAFE", "_NDVI.tif") 63 | ) 64 | print(f"Saving image: {output_file}") 65 | ## Copy the metadata (extent, coordinate system etc.) from one of the input bands (red) 66 | metadata = input_image.profile 67 | ## Change the data type from integer to float and file type from jp2 to GeoTiff 68 | metadata.update(dtype=rasterio.float64, driver="GTiff") 69 | ## Write the ndvi numpy array to a GeoTiff with the updated metadata 70 | with rasterio.open(output_file, "w", **metadata) as dst: 71 | dst.write(ndvi, 1) 72 | 73 | 74 | def processImage(sentinel_image_path): 75 | ### This function processes one image (read, compute, save) 76 | 77 | ## Read the image and get rasterio objects from the red nir bands 78 | red, nir = readImage(sentinel_image_path) 79 | 80 | ## Calculate NDVI and get the resulting numpy array 81 | ndvi = calculateNDVI(red, nir) 82 | 83 | ## Write the NDVI numpy array to file to the same extent as the red input band 84 | saveImage(ndvi, sentinel_image_path, red) 85 | 86 | 87 | def main(): 88 | print("\nProcess started") 89 | processImage(sentinel_image_path) 90 | print("Processing done\n") 91 | 92 | 93 | if __name__ == "__main__": 94 | ## This part is the first to execute when script is ran. 
It times the execution time and runs the main function 95 | start = time.time() 96 | main() 97 | end = time.time() 98 | print("Script completed in " + str(end - start) + " seconds") 99 | -------------------------------------------------------------------------------- /python/puhti/03_array/array_job_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --output=slurm-%A_%a.out # File to write the standard output to. %A is replaced by the job ID and %a with the array index. 3 | #SBATCH --error=slurm-%A_%a.err # File to write the standard error to. %A is replaced by the job ID and %a with the array index. Defaults to slurm-%A_%a.out if not provided. 4 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 5 | # SBATCH --reservation=geocomputing_thu # Only available during the course 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --cpus-per-task=1 # How many processors work on one task. Upper limit depends on number of CPUs per node. 9 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | #SBATCH --array=1-3 # Indices to specify what array index values should be used. Multiple values may be specified using a comma separated list or a range of values separated by -. 12 | 13 | module load geoconda 14 | 15 | # For looping through all the files: 16 | 17 | # Make a list of input files 18 | readlink -f /appl/data/geo/sentinel/s2_example_data/L2A/S2* > image_path_list.txt 19 | 20 | # Select the inputfile from row n to the array job n. 21 | image_path=$(sed -n ${SLURM_ARRAY_TASK_ID}p image_path_list.txt) 22 | 23 | # Feed the filename to the Python script 24 | srun python array_job_example.py $image_path 25 | -------------------------------------------------------------------------------- /python/puhti/04_parallel_multiprocessing/multiprocessing_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=3 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | 10 | module load geoconda 11 | 12 | datadir=/appl/data/geo/sentinel/s2_example_data/L2A 13 | 14 | srun python multiprocessing_example.py $datadir 15 | -------------------------------------------------------------------------------- /python/puhti/05_parallel_joblib/joblib_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --time=00:05:00 # Maximum duration of the job. 
Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=3 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | 10 | module load geoconda 11 | 12 | datadir=/appl/data/geo/sentinel/s2_example_data/L2A 13 | 14 | srun python joblib_example.py $datadir 15 | -------------------------------------------------------------------------------- /python/puhti/06_parallel_dask/multi_node/dask_multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=1 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem-per-cpu=6G # Memory required per allocated CPU. Default units are megabytes. 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | 10 | # The resources reserved here are only for the master job, so 1 core and moderate memory should be enough. 11 | # The resources for workers are reservd in the Python file. 12 | 13 | ### Load the geoconda module which has Python and Dask installed 14 | module load geoconda 15 | 16 | datadir=/appl/data/geo/sentinel/s2_example_data/L2A 17 | 18 | ### Run the Dask example. The directory given to the script has 3 Sentinel images 19 | ### We also give our project name so the master job is able to launch worker jobs 20 | 21 | srun python dask_multinode.py $datadir $SLURM_JOB_ACCOUNT 22 | -------------------------------------------------------------------------------- /python/puhti/06_parallel_dask/single_node/dask_singlenode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=3 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem-per-cpu=6G # Memory required per usable allocated CPU. Default units are megabytes. 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | 10 | ### Load the geoconda module which has Python and Dask installed 11 | module load geoconda 12 | 13 | datadir=/appl/data/geo/sentinel/s2_example_data/L2A 14 | 15 | ### Run the Dask example. 
The directory given to the script hosts 3 Sentinel images 16 | srun python dask_singlenode.py $datadir 17 | -------------------------------------------------------------------------------- /python/routing/batch_igraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -A 3 | #SBATCH --output=out_%J.txt # File to write the standard output to. 4 | #SBATCH --error=err_%J.txt # File to write the standard error to. 5 | #SBATCH --time=00:15:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #Number of reserved cores, this number can be later accessed with $SLURM_CPUS_PER_TASK 7 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 8 | #We're operating with shared memory so reserve total amount of memory, not per cpu 9 | #SBATCH --mem=12000 # Real memory required per node. 10 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | 12 | # load needed modules 13 | module load geoconda 14 | 15 | #Pass number of cores reserved to python script as argument, so that correct number of processes can be started 16 | python igraph_parallel.py $SLURM_CPUS_PER_TASK 17 | -------------------------------------------------------------------------------- /python/routing/batch_nx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -A 3 | #SBATCH --output=out_%J.txt # File to write the standard output to. 4 | #SBATCH --error=err_%J.txt # File to write the standard error to. 5 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #Number of reserved cores, this number can be later accessed with $SLURM_CPUS_PER_TASK 7 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 8 | #We're operating with shared memory so reserve total amount of memory, not per cpu 9 | #SBATCH --mem=6000 # Real memory required per node. 10 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | 12 | # load needed modules 13 | module load geoconda 14 | #Pass number of cores reserved to python script as argument, so that correct number of processes can be started 15 | python nx_parallel.py $SLURM_CPUS_PER_TASK 16 | -------------------------------------------------------------------------------- /python/routing/igraph_parallel.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | import multiprocessing as mp 3 | import time 4 | import sys 5 | 6 | #Graphml file containing street network from Hanko, Finland. 7 | graph_file="data/hanko.graphml" 8 | g = igraph.read(graph_file) 9 | 10 | #Create edge weights based on length 11 | for e in g.es: 12 | e['weight']=float(e['length']) 13 | 14 | #Create "size" amount of (start, end) pairs in list like such: [(s1,e1),(s2,e2)...] where start and end are vertex indices 15 | size=500 16 | args= [(int(len(g.vs)*(1/size)*i),int(len(g.vs)*(1/size)*(i+1)-1)) for i in range(0,size)] 17 | 18 | 19 | #Function to calculate shortest path and in this case just return sum of edge weights. (You can also get path length directly from igraph with igraph.shortest_paths(), but usually you'd probably also want the actual path. 
20 | def sp(start,end): 21 |     path=g.get_shortest_paths(start, to=end, weights='weight',output="epath") 22 |     path_len=[g.es[e]['weight'] for e in path[0]] 23 |     return sum(path_len) 24 | 25 | #Get number of cores from batch job script as argument, so number of processes used here matches the reserved number of cores 26 | print(sys.argv[1], " cores") 27 | 28 | #Create multiprocessing pool and map shortest path calculations for each start, end pair to the pool. 29 | with mp.Pool(processes=int(sys.argv[1])) as pool: 30 |     t0 = time.time() 31 |     results = pool.starmap(sp, args) 32 |     print("Time spent on path calculations", time.time()-t0, " seconds") 33 |     print(sum(results)) 34 | -------------------------------------------------------------------------------- /python/routing/nx_parallel.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import multiprocessing as mp 3 | import time 4 | import numpy as np 5 | import sys 6 | import osmnx 7 | #Graphml file containing the street network of Hanko, Finland. 8 | graph_file="data/hanko.graphml" 9 | g=nx.read_graphml(graph_file, node_type=int) 10 | 11 | print("graph read") 12 | 13 | #Add edge weights based on length 14 | for e in g.edges(data=True): 15 |     e[2]['w']=float(e[2]['length']) 16 | 17 | #Function to calculate the shortest path between two random points on the map. A seed is used to keep the paths the same for each run. If specific nodes were wanted as start and end points, these could be accessed with their OSM id numbers. 18 | def sp(seed): 19 |     np.random.seed(seed) 20 |     route = nx.shortest_path(g, source=np.random.choice(g.nodes),target=np.random.choice(g.nodes), weight='w') 21 |     return route 22 | 23 | #Get the number of available cores as an argument from the batch job script, so that the number of processes used in the pool matches the number of cores reserved in the batch job script. 24 | print(sys.argv[1], " cores") 25 | with mp.Pool(processes=int(sys.argv[1])) as pool: 26 |     t0 = time.time() 27 |     #Map the shortest path function to seeds ranging from 0 to 99 to create 100 paths. 28 |     results = pool.map(sp,range(100)) 29 |     print("Time spent on path calculations", time.time()-t0, " seconds") 30 | 31 | -------------------------------------------------------------------------------- /python/routing/osmnx-graphml.py: -------------------------------------------------------------------------------- 1 | import osmnx as ox 2 | import time 3 | import os 4 | 5 | 6 | #Get graphml from the Overpass API 7 | def place_to_graphml(place, graphml_file): 8 |     G = ox.graph_from_place(place,network_type="drive") 9 |     ox.save_load.save_graphml(G, graphml_file) 10 | 11 | #Get graphml from a local .osm file 12 | def osm_to_graphml(osm_file, graphml_file): 13 |     G = ox.graph_from_file(osm_file,network_type="drive") 14 |     ox.save_load.save_graphml(G, graphml_file) 15 | 16 | place_to_graphml("Helsinki, Finland", "helsinki.graphml") 17 | osm_to_graphml("finland-latest.osm", "finland.graphml") 18 | -------------------------------------------------------------------------------- /python/routing/readme.md: -------------------------------------------------------------------------------- 1 | ## Routing using Python igraph or networkx package with multiprocessing 2 | 3 | Here you can find example code for doing network routing in Puhti with the Python igraph or networkx package. For reading the data from OpenStreetMap, the osmnx package is used. Osmnx can create the network data needed by networkx directly.
For igraph the data has to be saved in GraphML format first, and then igraph can read from GraphML. In general igraph is faster and requires less memory, while networkx might be easier to use. 4 | 5 | Files: 6 | * osmnx-graphml.py - how to create a graph in GraphML format from OpenStreetMap data. 7 | * igraph_parallel.py and nx_parallel.py - how to run shortest path analysis with igraph and networkx in parallel with the multiprocessing module. 8 | * batch_igraph.sh and batch_nx.sh - batch job files for submitting the shortest path scripts to the batch job system in Taito. 9 | 10 | Notes: 11 | * Creating a GraphML file from OpenStreetMap data with osmnx requires a lot of memory, as osmnx uses networkx graphs. Downloading from the Overpass API is suitable only for smaller areas. 12 | * Memory consumption increases with parallelisation, but not by much. 13 | * Parallelisation is done within one node; in Puhti up to 40 cores and 382GB memory (or 1.5TB in the hugemem queue) can be used. 14 | 15 | ### Test results (from Taito Supercomputer 2019) 16 | Time and memory consumption for shortest paths analysis on the whole *Finland* street network from OSM using igraph. 17 | 18 | | Cores |Wall clock (min:s)|Time on pathfinding (min:s)|Mem (GB)| 19 | | ------ |------------------|-----------------------|--------| 20 | | 1|14:01|11:48|7.35| 21 | | 4|5:39|3:27|8.27| 22 | | 10|3:43|1:31|10.05| 23 | 24 | Time and memory consumption for shortest paths analysis on the *Helsinki* street network using networkx. 25 | 26 | | Cores |Wall clock (min:s)|Time on pathfinding (min:s)|Mem (GB)| 27 | | ------ |------------------|-----------------------|--------| 28 | | 1|5:01|3:51|2.39| 29 | | 4|4:35|1:08|3.11| 30 | | 10|1:25|0:30|4.38| 31 | 32 | 33 | -------------------------------------------------------------------------------- /python/sentinel/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Sentinel download script 3 | 4 | This script is an example of how to find and download large quantities of Sentinel-2 images using Python and the [sentinelsat library](https://sentinelsat.readthedocs.io). 5 | 6 | The script works only for the Finnish **Finhub API**, or other national mirrors that use the same API. It does, however, not work for the Copernicus Data Space Ecosystem. Check the [CSC Earth Observation guide](https://docs.csc.fi/support/tutorials/gis/eo_guide/) for alternative ways of downloading Sentinel data from the CDSE. 7 | 8 | Another option for a similar task is to use [STAC](../STAC). 9 | 10 | ### Running 11 | On a local computer, just install the sentinelsat library first. 12 | 13 | In Puhti, sentinelsat is included in the [geoconda module](https://docs.csc.fi/apps/geoconda/), which must be loaded before running the script. 14 | 15 | ``` 16 | module load geoconda 17 | python sentinelsat_download_from_finhub.py 18 | ``` 19 | 20 | You can run the script simply on the login node for smaller amounts of data. 21 | 22 | For bigger amounts of data you can use [screen](https://linuxize.com/post/how-to-use-linux-screen/) or an [interactive session](https://docs.csc.fi/computing/running/interactive-usage/). 23 | 24 | ### Unzipping 25 | 26 | As the Python unzipping is a little complicated with files over 1GB, we recommend using bash commands to unzip the files. 27 | 28 | Unzip all files to the current directory 29 | `unzip '*.zip'` 30 | 31 | Unzip all files to the current directory and delete them at the same time 32 | `find . 
-depth -name '*.zip' -execdir unzip -n {} \; -delete` 33 | 34 | ## Things to consider 35 | 36 | * Finhub API has data limited to Nordics only, as well as only some chosen data products. For more options, check the Copernicus Data Space ecosystem 37 | * If the area of interest is in the middle of two UTM zones, the script often downloads the same image in two different projections. You can specify the UTM zone if you do not want to download duplicates 38 | -------------------------------------------------------------------------------- /python/sentinel/helsinki.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "properties": {}, 7 | "geometry": { 8 | "type": "Polygon", 9 | "coordinates": [ 10 | [ 11 | [ 12 | 24.792, 13 | 60.136 14 | ], 15 | [ 16 | 24.792, 17 | 60.231 18 | ], 19 | [ 20 | 25.064, 21 | 60.231 22 | ], 23 | [ 24 | 25.064, 25 | 60.136 26 | ], 27 | [ 28 | 24.792, 29 | 60.136 30 | ] 31 | ] 32 | ] 33 | } 34 | } 35 | ] 36 | } -------------------------------------------------------------------------------- /python/sentinel/sentinelsat_download_from_finhub.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | An example script for downloading Sentinel data from FinHub with sentinelsat Python library. 4 | 5 | https://finhub.nsdc.fmi.fi 6 | """ 7 | 8 | import sentinelsat 9 | from datetime import date 10 | import pandas as pd 11 | import time 12 | 13 | ### Set your credentials 14 | finhub_user = 'a' 15 | finhub_pwd = 'a' 16 | finhub_url = 'https://finhub.nsdc.fmi.fi' 17 | 18 | ### Open API connection 19 | finhub_api = sentinelsat.SentinelAPI(finhub_user, finhub_pwd, finhub_url) 20 | 21 | ### Search by polygon (WGS84), time, and query keywords 22 | footprint = sentinelsat.geojson_to_wkt(sentinelsat.read_geojson(r'helsinki.geojson')) 23 | startDate = date(2020,1,1) 24 | endDate = date(2020,7,30) 25 | cloudcoverage = (0, 20) 26 | platformname = 'Sentinel-2' 27 | producttype = 'S2MSI1C' 28 | # producttype='S2MSI2A' #for L2 images 29 | area_relation = "Contains" # footprint has to be fully inside the image. Other options "Intersects", "IsWithin" 30 | 31 | ### If your area is between two UTM zones, this script often downloads two versions of the same image 32 | ### Uncomment and add e.g "T35" to only focus on one UTM zone 33 | utm_zone = "" 34 | 35 | ### Image output directory 36 | directory_path = r'sentinel_temp' 37 | 38 | ### Help setting to see product names in full lenghth 39 | pd.set_option('display.max_colwidth', None) 40 | pd.set_option('display.expand_frame_repr', False) 41 | 42 | def calculateTotalSize(size_column): 43 | total_size = 0 44 | for i in size_column: 45 | if "MB" in i: 46 | total_size += float(i.replace(" MB",""))/1000 47 | if "GB" in i: 48 | total_size += float(i.replace(" GB","")) 49 | return round(total_size,2) 50 | 51 | def queryAndDownload(): 52 | 53 | finhub_products = finhub_api.query(footprint, date=(startDate, endDate), platformname=platformname, 54 | cloudcoverpercentage=cloudcoverage, producttype=producttype, 55 | area_relation=area_relation) 56 | 57 | ### Checking, if any results were found 58 | if (len(finhub_products) == 0): 59 | finhub_hasresults = False 60 | print('No products found from Finhub. 
Terminating') 61 | else: 62 | finhub_hasresults = True 63 | 64 | if finhub_hasresults: 65 | finhub_df = finhub_api.to_dataframe(finhub_products) 66 | if utm_zone: 67 | finhub_df = finhub_df[finhub_df['title'].str.contains(utm_zone)] 68 | 69 | finhub_id_to_download = finhub_df.uuid.tolist() 70 | print(f'{len(finhub_id_to_download)} image(s) will be downloaded from Finhub repository') 71 | print(finhub_df.title.to_string(index=False)) 72 | 73 | print(f'All together {calculateTotalSize(finhub_df["size"])} GB will be downloaded') 74 | 75 | ### Download files 76 | if (finhub_hasresults): 77 | finhub_api.download_all(finhub_id_to_download, directory_path=directory_path) 78 | 79 | 80 | def main(): 81 | queryAndDownload() 82 | 83 | 84 | if __name__ == '__main__': 85 | start_time = time.time() 86 | main() 87 | print("The Download script finished in " + str((time.time() - start_time) / 60) + " minutes") 88 | 89 | -------------------------------------------------------------------------------- /python/sentinel_without_credentials/README.md: -------------------------------------------------------------------------------- 1 | ## Allas Sentinel bucket content without AWS credentials 2 | 3 | This is an example script of how to get contents from public buckets in Allas containing Sentinel-2 data (without credentials). 4 | 5 | ### Running 6 | 7 | On CSC's supercomputer Puhti, you can use the [geoconda module](https://docs.csc.fi/apps/geoconda/) which includes the boto3 library: 8 | ``` 9 | module load geoconda 10 | python get_open_sentinel_buckets.py 11 | ``` 12 | 13 | On local machine install the required library: boto3 14 | 15 | ``` 16 | pip install boto3 17 | ``` 18 | 19 | 20 | ### Results 21 | 22 | The script prints out the first bucket's name, and the contents of the first 5 SAFEs in that bucket. The bucket's contents are accessible in the get_contents function. 23 | The image URLs could also be used directly with e.g. `rasterio` package. 24 | -------------------------------------------------------------------------------- /python/zonal_stats/raster-stats/README.md: -------------------------------------------------------------------------------- 1 | * `zonal_stats_serial.py` is the basic version, here the work is done on one core in serial mode. 2 | * `zonal_stats_parallel.py` is the parallel version, where processing of polygons is split to several cores. For parallelization `multiprocessing` library is used. 3 | * `zonal-stats-stac-parallel.py`is the parallel version, where statistics is calculated for several rastes found via STAC. For parallelization `dask` delayed functions are used. 4 | 5 | Additionally batch job scripts are provided, for running this script on CSC's Puhti supercluster. For submitting the job to Puhti: 6 | `sbatch batch_job_XX.sh` 7 | -------------------------------------------------------------------------------- /python/zonal_stats/raster-stats/batch_job-stac-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_2000599 # Choose the project to be billed 3 | #SBATCH --time=02:00:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 5 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 6 | #SBATCH --mem-per-cpu=3G # Minimum memory required per allocated CPU. Default units are megabytes. 7 | #SBATCH --partition=small # Which queue to use. 
Defines maximum time, memory, tasks, nodes and local storage for job 8 | 9 | # Load the geoconda module which has Python with Dask, Xarray and STAC libraries 10 | module load geoconda/.3.12.9_conda_conda 11 | 12 | # Run the Python code. 13 | python zonal-stats-stac-parallel.py -------------------------------------------------------------------------------- /python/zonal_stats/raster-stats/batch_job_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -A 3 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 5 | #SBATCH --mem=1000 # Real memory required per node. 6 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 7 | 8 | module load geoconda 9 | python zonal_stats_parallel.py 10 | -------------------------------------------------------------------------------- /python/zonal_stats/raster-stats/batch_job_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -A 3 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 5 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 6 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 7 | 8 | module load geoconda 9 | srun python zonal_stats_serial.py 10 | -------------------------------------------------------------------------------- /python/zonal_stats/raster-stats/zonal_stats_serial.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple example Python script for zonal_stats Python function. 3 | https://pythonhosted.org/rasterstats/ 4 | 5 | Some notes about input datasets: 6 | Raster: 7 | * If all zones together cover almost all raster data and the raster data is not too big, then the fastest is to read raster dataset to memory 8 | in the beginning of the script. Just make sure to reserve enogh memory. This causes also least disk readings and is in general the preferred way. 9 | * If the zones cover only some part raster data or if the raster is too big for memory, then direct read from disk might be better. 10 | See the comments in script, how to modify the code to read directly from disk. If reading data from disk, make sure that the raster has a format that can be paritally read (for example GeoTiff) and that it is has inner tiling (https://gdal.org/drivers/raster/gtiff.html -> TILED) for optimal reading. 
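A tiled copy can be prepared with GDAL, for example (a sketch, assuming an untiled input.tif): gdal_translate -co TILED=YES -co COMPRESS=DEFLATE input.tif input_tiled.tif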
11 | In this case, consider also moving the raster to local disk on the computing node: 12 | https://docs.csc.fi/computing/disk/#compute-nodes 13 | 14 | Author: Elias Annila, Kylli Ek, CSC 15 | Date: 27.01.2022, updated 13.3.2025 16 | """ 17 | 18 | from rasterstats import zonal_stats 19 | import geopandas as gpd 20 | import rasterio 21 | import time 22 | 23 | # Set the processing area, leave out if you want to process the whole file 24 | x_min = 350000.0 25 | y_min = 6700000.0 26 | buffer = 200000 27 | x_max = x_min + buffer 28 | y_max = y_min + buffer 29 | 30 | bbox_3067 = (x_min, y_min, x_max, y_max) 31 | 32 | # File paths: 33 | # Raster you want to use to compute zonal stastics from 34 | raster_file = '/appl/data/geo/mml/dem10m/dem10m_direct.vrt' 35 | # If running the code outside Puhti, get the data from Paituli. 36 | # https://www.nic.funet.fi/index/geodata/mml/dem10m/dem10m_direct.vrt 37 | 38 | # Polygons file 39 | polygons_file = '/appl/data/geo/ruokavirasto/kasvulohkot/2020/LandUse_ExistingLandUse_GSAAAgriculturalParcel.gpkg' 40 | # If running the code outside Puhti, get the data from Paituli. 41 | # polygons_file = 'https://www.nic.funet.fi/index/geodata/ruokavirasto/kasvulohkot/2020/LandUse_ExistingLandUse_GSAAAgriculturalParcel.gpkg' 42 | 43 | # Statistics calculated for each zone 44 | statistics = ['mean'] 45 | #statistics = ['count', 'min' ,'mean', 'max','majority'] 46 | 47 | def main(): 48 | 49 | # Read the vector polygons, leave out bbox, if you want to process the whole file 50 | zones = gpd.read_file(polygons_file , layer="KASVULOHKO", bbox=bbox_3067) 51 | 52 | # zonal_stats does not directly work with rasterio opened file, but needs data and transformation variables 53 | with rasterio.open(raster_file) as src: 54 | # If you want to use the whole raster file, leave out the window part. 55 | raster = src.read(indexes=1, window=rasterio.windows.from_bounds(x_min, y_min, x_max, y_max, src.transform)) 56 | results = zonal_stats(zones.geometry, raster, affine=src.transform, stats=statistics) 57 | 58 | # If you need to read the file from disk. 59 | # results = zonal_stats(zones.geometry, raster_file, stats=statistics) 60 | 61 | #Join the results back to geopandas dataframe 62 | for stat in statistics: 63 | results_as_list = [d[stat] for d in results] 64 | zones[stat] = results_as_list 65 | 66 | if __name__ == '__main__': 67 | t0 = time.time() 68 | main() 69 | t1 = time.time() 70 | total = t1-t0 71 | print("Everything done, took: " + str(round(total, 0))+"s") 72 | -------------------------------------------------------------------------------- /python/zonal_stats/xarray-spatial/xarray-spatial-batch-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --time=00:15:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 5 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 6 | #SBATCH --mem-per-cpu=3G # Minimum memory required per allocated CPU. Default units are megabytes. 7 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | 9 | # Load the geoconda module which has Python with Dask, Xarray and STAC libraries 10 | module load geoconda 11 | 12 | # Run the Python code. 
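# Note: the Python script below sizes its Dask client from this reservation; it reads the
# number of usable CPUs with os.sched_getaffinity(0), so --cpus-per-task above determines
# the number of Dask workers.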
13 | python xarray-spatial-zonal-stats.py 14 | -------------------------------------------------------------------------------- /python/zonal_stats/xarray-spatial/xarray-spatial-zonal-stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # Created 13.3.2025, by Kylli Ek, CSC 4 | 5 | import geopandas as gpd 6 | from geocube.api.core import make_geocube 7 | from xrspatial import zonal_stats 8 | import rioxarray 9 | from dask.distributed import Client, Lock 10 | import os, time 11 | 12 | # Set the processing area, leave out if you want to process the whole file 13 | x_min = 350000.0 14 | y_min = 6700000.0 15 | buffer = 200000 16 | x_max = x_min + buffer 17 | y_max = y_min + buffer 18 | 19 | bbox_3067 = (x_min, y_min, x_max, y_max) 20 | 21 | # File paths: 22 | # Raster you want to use to compute zonal stastics from 23 | raster_file = '/appl/data/geo/mml/dem10m/dem10m_direct.vrt' 24 | # If running the code outside Puhti, get the data from Paituli. 25 | # raster_file = 'https://www.nic.funet.fi/index/geodata/mml/dem10m/dem10m_direct.vrt' 26 | 27 | # Polygons file 28 | polygons_file = '/appl/data/geo/ruokavirasto/kasvulohkot/2020/LandUse_ExistingLandUse_GSAAAgriculturalParcel.gpkg' 29 | # If running the code outside Puhti, get the data from Paituli. 30 | # polygons_file = 'https://www.nic.funet.fi/index/geodata/ruokavirasto/kasvulohkot/2020/LandUse_ExistingLandUse_GSAAAgriculturalParcel.gpkg' 31 | 32 | def main(): 33 | # Get the number of workers 34 | no_of_workers = len(os.sched_getaffinity(0)) 35 | # Create Dask Client for parallel processing 36 | client = Client(n_workers=no_of_workers) 37 | 38 | # Read the raster file, to read as Dasked backed Xarray DataArray 39 | # Notice the use of `chunks=True`. 40 | dem10m = rioxarray.open_rasterio(raster_file, chunks=True) 41 | # Crop the raster file to processing area, leave out if you want to process the whole file 42 | dem10m_clip = dem10m.rio.clip_box(minx=x_min, miny=y_min, maxx=x_max, maxy=y_max) 43 | 44 | # Read the vector polygons, leave out bbox, if you want to process the whole file 45 | polygons = gpd.read_file(polygons_file , layer="KASVULOHKO", bbox=bbox_3067) 46 | polygons['ID'] = polygons.PERUSLOHKOTUNNUS.astype(int) 47 | 48 | # Create Xarray DataArray similar to the raster data 49 | out_grid = make_geocube( 50 | vector_data=polygons, 51 | measurements=["ID"], 52 | like=dem10m_clip 53 | ) 54 | 55 | # Write the rasterized polygons to the disk, so that Dasked backed Xarray DataArray could be created of them. 
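# Passing lock=Lock(name="rio") makes rioxarray funnel any parallel (Dask) writes
# through a single named distributed lock, so workers do not write to the GeoTIFF concurrently.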
56 | out_grid["ID"].rio.to_raster("fields.tif", lock=Lock(name="rio")) 57 | 58 | # Read the rasterized polygons back in as Dasked backed Xarray DataArray 59 | fields = rioxarray.open_rasterio("fields.tif", chunks=True) 60 | 61 | # Caclulate the zonal statistics 62 | zonal_stats_values = zonal_stats(fields[0], dem10m_clip[0], stats_funcs=['mean']).compute() 63 | 64 | # Join the results back to the original zones data 65 | polygons_result = polygons.merge(zonal_stats_values.compute(), left_on='ID', right_on='zone', how='left') 66 | 67 | # With Dask, it is important to use the main function 68 | if __name__ == "__main__": 69 | start = time.time() 70 | main() 71 | end = time.time() 72 | print("Script completed in " + str(end - start) + " seconds") 73 | -------------------------------------------------------------------------------- /snap/01_simple_job/CreateStackGraph.xml: -------------------------------------------------------------------------------- 1 | 2 | 1.0 3 | 4 | CreateStack 5 | 6 | 7 | 8 | 9 | 10 | 11 | NEAREST_NEIGHBOUR 12 | Master 13 | Product Geolocation 14 | 15 | 16 | 17 | Write 18 | 19 | 20 | 21 | 22 | /scratch/project_200XXXX/snap/output/S1A_IW_GRDH_1SDH_20171002T154536_20171002T154601_018636_01F6C8_90C0_Stack.tif 23 | GeoTIFF-BigTIFF 24 | 25 | 26 | 27 | ProductSet-Reader 28 | 29 | 30 | /scratch/project_200XXXX/snap/input/S1A_IW_GRDH_1SDH_20171002T154536_20171002T154601_018636_01F6C8_90C0.tif,/scratch/project_200XXXX/snap/input/S1A_IW_GRDH_1SDH_20171014T154537_20171014T154602_018811_01FC18_2318.tif 31 | 32 | 33 | -------------------------------------------------------------------------------- /snap/01_simple_job/snap_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 4 | #SBATCH --time=04:00:00 # Maximum duration of the job. Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=2 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem=60000 # Real memory required per node. 8 | #SBATCH --gres=nvme:50 # How much local disk to reserve. Default units are gigabytes. 9 | 10 | #The last row resrves 50G of local fast disk on the compute node, it will be used for SNAP and JAVA cache, set by snap_add_userdir. 11 | 12 | module load snap 13 | source snap_add_userdir $LOCAL_SCRATCH 14 | gpt /scratch/project_200XXXX/scripts/CreateStackGraph.xml -q 2 -c 40G -J-Xmx55G -e 15 | 16 | # Match values in gpt command with job reservation: 17 | # -q 2 with --cpus-per-task=2 18 | # -J-Xmx55G with --mem=60000, use for job a few Gb less than reserved 19 | # -c 40G with -J-Xmx55G, use ~75 % of available memory for data cache, depends on task.. 
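#
# As a hypothetical illustration (the values below are assumptions, not from this repository),
# a smaller reservation would be matched the same way: with --cpus-per-task=4 and
# --mem=16000 one could use roughly:
# gpt CreateStackGraph.xml -q 4 -c 10G -J-Xmx14G -e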
-------------------------------------------------------------------------------- /snap/02_array_job/resample_and_lai.xml: -------------------------------------------------------------------------------- 1 | 2 | 1.0 3 | 4 | Resample 5 | 6 | ${sourceProduct} 7 | 8 | 9 | 10 | 11 | 12 | 10 13 | Nearest 14 | First 15 | First 16 | true 17 | 18 | 19 | 20 | BiophysicalOp 21 | 22 | 23 | 24 | 25 | true 26 | false 27 | false 28 | false 29 | false 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /snap/02_array_job/snap_array_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --output=out_%A_%a.txt # File to write the standard output to. 3 | #SBATCH --error=err_%A_%a.txt # File to write the standard error to. 4 | #SBATCH --account=project_200xxxx # Choose the project to be billed 5 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 6 | #SBATCH --time=02:00:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 9 | #SBATCH --mem=8000 # Real memory required per node. 10 | #SBATCH --array=1-3 # Indices to specify what array index values should be used. Multiple values may be specified using a comma separated list or a range of values separated by -. 11 | #SBATCH --gres=nvme:10 # How much local disk to reserve. Default units are gigabytes. 12 | 13 | ### Load SNAP module 14 | module load snap 15 | 16 | ### For looping through all the files: 17 | 18 | ### Make a list of input files. This folder has 3 S2L2 images 19 | readlink -f /appl/data/geo/sentinel/s2_example_data/L2A/S2* > image_path_list.txt 20 | 21 | ### Select the inputfile row by row 22 | image_path=$(sed -n ${SLURM_ARRAY_TASK_ID}p image_path_list.txt) 23 | 24 | ### Parse image basename to be used in output filename 25 | image_filename="$(basename -- $image_path)" 26 | 27 | ### Assign an output_folder 28 | output_folder=/scratch/project_2000599/snap/output/ 29 | 30 | # Set custom SNAP user dir 31 | source snap_add_userdir $LOCAL_SCRATCH/cache_"$SLURM_ARRAY_TASK_ID" 32 | 33 | ### -q is num of cores, -t is target file, -SsourceProduct is the xml inside each SAFE folder 34 | gpt resample_and_lai.xml -q 4 -c 5G -J-Xmx7G -t ${output_folder}/${image_filename}_LAI.tif -SsourceProduct=${image_path}/MTD_MSIL2A.xml -e 35 | 36 | # Match values in gpt command with job reservation: 37 | # -q 4 with --cpus-per-task=4 38 | # -J-Xmx7G with --mem=8000, use for job a few Gb less than reserved 39 | # -c 5G with -J-Xmx7G, use ~75 % of available memory for data cache, depends on task.. -------------------------------------------------------------------------------- /snap/README.md: -------------------------------------------------------------------------------- 1 | # SNAP GPT in Puhti 2 | 3 | Examples: 4 | 5 | * [Simple job with one GPT graph](01_simple_job). This example is S1 stacking with several input files. 6 | * [Array job with one GPT graph for 3 images](02_array_job). This is an example of running the SNAP graph for multiple Sentinel-2 Level 3 images with an [array job](https://docs.csc.fi/computing/running/array-jobs/). It resamples the bands and calculates LAI (Leaf area index) for one image. 
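Before editing the graph XML files, the parameters of the operators they use can be checked with gpt's built-in help, for example:

```
module load snap
gpt -h CreateStack
gpt -h Resample
gpt -h BiophysicalOp
```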
7 | 8 | 9 | Both examples include 2 files: 10 | 11 | * .xml-file - the SNAP Graph that defines the processing workflow. 12 | * .sh-file - the batch job script that makes resource (time, memory, cores) reservations to Puhti and starts the gpt command. The batch job file [is submitted to the Puhti queuing system](https://docs.csc.fi/computing/running/submitting-jobs/) 13 | 14 | ``` 15 | sbatch snap_batch_job.sh 16 | OR 17 | sbatch snap_array_job.sh 18 | ``` -------------------------------------------------------------------------------- /supercomputer_installations/ames-stereo_3.2.0.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - nasa-ames-stereo-pipeline 3 | - usgs-astrogeology 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - stereo-pipeline 8 | 9 | -------------------------------------------------------------------------------- /supercomputer_installations/arcgis-python-api-2.1.0.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - esri 3 | dependencies: 4 | - arcgis_learn=2.1.0 5 | - python=3.9 6 | 7 | -------------------------------------------------------------------------------- /supercomputer_installations/geoconda/geoconda_3.10.6.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | # - esri 4 | dependencies: 5 | - python>=3.10 6 | - gdal 7 | # - arcgis 8 | - boto3 9 | - bottleneck 10 | - cartopy 11 | - cfgrib 12 | # - copc-lib 13 | - dask 14 | - dask-geopandas 15 | - dask-jobqueue 16 | - dask-labextension 17 | - dask-ml 18 | - descartes 19 | - earthengine-api 20 | - geoalchemy2 21 | - geopandas 22 | - graphviz 23 | - gstools 24 | - hvplot 25 | - imbalanced-learn 26 | - jinja2 27 | - jupyterlab 28 | - jupyter-git 29 | - laspy 30 | - laxpy 31 | - lidar 32 | - lxml 33 | - metpy 34 | - natsort 35 | - ncview 36 | - netcdf4 37 | - networkx 38 | - osmnx 39 | - owslib 40 | - pdal 41 | - pip 42 | - plotly 43 | - py6S 44 | - pygmt 45 | - pyicu 46 | - pyntcloud 47 | - pyogrio 48 | - pyproj 49 | - pysal 50 | - pystac-client 51 | - python-cdo 52 | - python-graphviz 53 | - python-igraph 54 | - python-keystoneclient 55 | - python-pdal 56 | - python-swiftclient 57 | - python-wget 58 | - rasterio 59 | - rasterstats 60 | - rioxarray 61 | - rtree 62 | - satpy 63 | - scikit-image 64 | - scikit-learn 65 | - sentinelsat 66 | - shapely 67 | - spyder 68 | - stackstac 69 | - wrf-python 70 | - xarray 71 | - xarray-spatial 72 | - xesmf 73 | - pip: 74 | - laszip 75 | # - open3d 76 | 77 | 78 | -------------------------------------------------------------------------------- /supercomputer_installations/geoconda/geoconda_3.10.9.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | # - esri 4 | dependencies: 5 | - python>=3.10 6 | - gdal 7 | # - arcgis 8 | - boto3 9 | - bottleneck 10 | - cartopy 11 | - cfgrib 12 | - copc-lib 13 | - dask 14 | - dask-geopandas 15 | - dask-jobqueue 16 | - dask-labextension 17 | - dask-ml 18 | - descartes 19 | - earthengine-api 20 | - geoalchemy2 21 | - geopandas 22 | - geopy 23 | - graphviz 24 | - gstools 25 | - hvplot 26 | - imageio 27 | - imbalanced-learn 28 | - jinja2 29 | - jupyterlab 30 | - jupyterlab-git 31 | - keplergl 32 | - laspy 33 | - laxpy 34 | - leafmap 35 | - lidar 36 | - lxml 37 | - metpy 38 | - natsort 39 | - ncview 40 | - netcdf4 41 | - networkx 42 | - osmnx 43 | - owslib 44 | - pdal 45 | - pip 46 | - plotly 47 | - 
py6S 48 | - pydeck 49 | - pygmt 50 | - pyicu 51 | - pyntcloud 52 | - pyogrio 53 | - pyproj 54 | - pysal 55 | - pystac-client 56 | - python-cdo 57 | - python-graphviz 58 | - python-igraph 59 | - python-keystoneclient 60 | - python-pdal 61 | - python-swiftclient 62 | - python-wget 63 | - rasterio 64 | - rasterstats 65 | - rioxarray 66 | - rtree 67 | - satpy 68 | - scikit-image 69 | - scikit-learn 70 | - sentinelsat 71 | - shapely 72 | - spyder 73 | - stackstac 74 | - tifffile 75 | - wrf-python 76 | - xarray 77 | - xarray-spatial 78 | - xarray_leaflet 79 | - xesmf 80 | - pip: 81 | - laszip 82 | - open3d 83 | - git+https://github.com/ernstste/landsatlinks.git 84 | 85 | -------------------------------------------------------------------------------- /supercomputer_installations/geoconda/geoconda_3.11.0.yml: -------------------------------------------------------------------------------- 1 | name: geoconda-3.11 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.11 6 | - boto3 7 | - bottleneck 8 | - cartopy 9 | - cfgrib 10 | # - copc-lib Not available for Python 3.12 nor 3.11 11 | - dask 12 | - dask-geopandas 13 | - dask-jobqueue 14 | - dask-labextension 15 | - dask-ml 16 | - descartes 17 | - earthaccess 18 | - earthengine-api 19 | - geoalchemy2 20 | - geopandas 21 | - geopy 22 | - git 23 | - graphviz 24 | - gstools 25 | - hvplot 26 | - h3pandas 27 | - imageio 28 | - imbalanced-learn 29 | - jinja2 30 | - jupyterlab 31 | - jupyterlab-git 32 | - keplergl 33 | - laspy 34 | - lazrs-python 35 | - laxpy 36 | - leafmap 37 | - lxml 38 | - metpy 39 | - movingpandas 40 | - nbclassic 41 | - natsort 42 | - ncview 43 | - netcdf4 44 | - networkx 45 | - openeo 46 | # - osmnx New version coming later in the year, now would require big downgrade of geopandas 47 | - owslib 48 | - pcraster 49 | - pdal 50 | - pip 51 | - plotly 52 | - psy-view 53 | - py6s 54 | - pydeck 55 | - pygmt 56 | - pyicu 57 | - pyntcloud 58 | - pyogrio 59 | - pyproj 60 | - pysal 61 | - pystac-client 62 | - python-cdo 63 | - python-graphviz 64 | - python-igraph 65 | - python-keystoneclient 66 | - python-pdal 67 | - python-swiftclient 68 | - python-wget 69 | - rasterio 70 | - rasterstats 71 | - richdem 72 | - rioxarray 73 | - r5py 74 | - satpy 75 | - scalene 76 | - scikit-image 77 | - scikit-learn 78 | - sentinelsat 79 | - sentinelhub 80 | - shapely 81 | - spyder 82 | - stackstac 83 | - wrf-python 84 | - xarray 85 | - xarray-spatial 86 | - xarray_leaflet 87 | - xesmf 88 | - pip: 89 | - lidar 90 | - laszip 91 | - open3d 92 | - git+https://github.com/ernstste/landsatlinks.git 93 | # - git+https://github.com/pangeo-data/xesmf.git 94 | - git+https://github.com/mayrajeo/geo2ml.git 95 | -------------------------------------------------------------------------------- /supercomputer_installations/geoconda/whitebox_tools_postinstall/download_wbt: -------------------------------------------------------------------------------- 1 | python start_wbt.py 2 | -------------------------------------------------------------------------------- /supercomputer_installations/geoconda/whitebox_tools_postinstall/start_wbt.py: -------------------------------------------------------------------------------- 1 | import whitebox 2 | whitebox.WhiteboxTools() 3 | 4 | --------------------------------------------------------------------------------