├── .gitattributes ├── Copernicus_data_download ├── CDSE_S3_download_with_polygon.sh ├── CDSE_S3_download_with_tile_names.sh └── README.md ├── NLS_geopackage_tests.md ├── R ├── README.md ├── R_LiDAR │ ├── README.md │ ├── R_lidar_course_exercises │ │ ├── README.md │ │ ├── R_lidar.Rproj │ │ ├── las_files.txt │ │ ├── session3.1.-rlas.Rmd │ │ ├── session3.2-lidR-LAS-class.Rmd │ │ ├── session3.3-lidR-LAScatalog-class.Rmd │ │ ├── simple_catalog_lidR_batchjob_cluster.sh │ │ ├── simple_catalog_lidR_batchjob_multicore.sh │ │ ├── simple_catalog_lidR_cluster.R │ │ ├── simple_catalog_lidR_multicore.R │ │ ├── simple_lidR.R │ │ └── simple_lidR_arrayjob.sh │ └── rlas-DEM_example │ │ ├── README.md │ │ ├── area_of_interest.dbf │ │ ├── area_of_interest.prj │ │ ├── area_of_interest.qpj │ │ ├── area_of_interest.shp │ │ ├── area_of_interest.shx │ │ ├── basic_rlas.R │ │ ├── batchjob_rlas_basics.sh │ │ └── get_lidar_files_function.R ├── STAC │ ├── Readme.md │ └── STAC_CSC_example.Rmd ├── allas │ └── working_with_allas_from_R_S3.R ├── geopackage │ ├── README.md │ └── read_gpkg.R ├── puhti │ ├── 01_serial │ │ ├── Contours_simple.R │ │ └── serial_batch_job.sh │ ├── 02_parallel_future │ │ ├── Calc_contours_future_cluster.R │ │ ├── Calc_contours_future_multicore.R │ │ ├── parallel_batch_job_future_cluster.sh │ │ └── parallel_batch_job_future_multicore.sh │ ├── 03_parallel_snow │ │ ├── Calc_contours_snow.R │ │ └── parallel_batch_job_snow.sh │ ├── 04_parallel_foreach │ │ ├── Calc_contours_foreach.R │ │ └── parallel_batch_job_foreach.sh │ ├── 05_array │ │ ├── Contours_array.R │ │ └── array_batch_job.sh │ ├── README.md │ ├── mapsheets.txt │ └── mapsheets_URLs.txt ├── raster_predict │ ├── README.md │ ├── Rplots.pdf │ ├── r_run.sh │ ├── rlogo.grd │ ├── rlogo.gri │ └── rtest.R └── virtual_rasters.R ├── README.md ├── force ├── LEVEL2_parameters.prm ├── README.md ├── file_queue.txt └── force_batch_job.sh ├── gdal ├── gdal_batch_job_parallel.sh ├── gdal_batch_job_serial.sh ├── gdal_parallel.sh ├── gdal_serial.sh └── readme.md ├── grass ├── 01_serial_cli │ ├── grass_cli.sh │ └── grass_cli_serial.sh ├── 02_python_scripting_serial │ ├── python_scripting_serial.py │ └── python_scripting_serial.sh ├── 03_pygrass_serial │ ├── pygrass_serial.py │ └── pygrass_serial.sh ├── 04_pygrass_parallel │ ├── pygrass_parallel_with_gridmodule.py │ └── pygrass_parallel_with_gridmodule.sh └── readme.md ├── machineLearning └── README.md ├── noppe └── Readme.md ├── pdal ├── 01_crop_pipeline.json ├── 01_split_laz.sh ├── 02_pipeline.json ├── 03_batch_job_gnu_parallel.sh ├── 04_batch_job_array.sh ├── 04_filelist.csv ├── 07_batch_job_python.sh ├── 07_pdal_ground.py └── README.md ├── pouta ├── README.md ├── arcpy │ ├── ArcGIS_Server_manual_installation.sh │ ├── README.md │ ├── ansible_install_arcpy.yml │ ├── ansible_preparations.md │ ├── ansible_run_arcpy.yml │ └── test_data │ │ └── my_arcpy_script.py ├── docker_geoserver_or_opendronemap │ ├── README.md │ ├── ansible.cfg │ ├── group_vars │ │ └── all.yml │ ├── install-geoserver.yml │ ├── install-odm.yml │ ├── requirements.yml │ └── roles │ │ ├── docker │ │ └── tasks │ │ │ └── main.yml │ │ ├── geoserver │ │ └── tasks │ │ │ └── main.yml │ │ ├── opendronemap │ │ └── tasks │ │ │ └── main.yml │ │ └── openstack │ │ └── tasks │ │ └── main.yml └── metashape_with_VNC │ └── readme.md ├── python ├── README.md ├── STAC │ ├── Readme.md │ ├── STAC_CSC_example.ipynb │ ├── csc_stac_example.py │ ├── csc_stac_example_batch_job.sh │ ├── environment.yml │ ├── img │ │ ├── DEM_data_source_cpu_walltime.gif │ │ ├── 
DEM_tile_size_cpu_walltime.gif │ │ ├── S1_data_source_cpu_walltime.gif │ │ └── S1_tile_size_cpu_walltime.gif │ ├── stac_xarray_dask_example.ipynb │ └── static_stac.ipynb ├── allas │ ├── working_with_allas_from_Python_S3.py │ └── working_with_allas_from_Python_Swift.py ├── dask_geopandas │ ├── README.md │ └── dask-geopandas.ipynb ├── geopackage │ ├── README.md │ ├── list_layers_info.py │ ├── make_each_layer_a_file.py │ └── read_gpkg.py ├── puhti │ ├── 00_interactive │ │ └── interactive_single_core_example.py │ ├── 01_serial │ │ ├── single_core_example.py │ │ ├── single_core_example.sh │ │ ├── single_core_example_folder.py │ │ ├── single_core_example_folder.sh │ │ └── single_core_example_list.sh │ ├── 02_gnu_parallel │ │ ├── gnu_parallel_example.py │ │ └── gnu_parallel_example.sh │ ├── 03_array │ │ ├── array_job_example.py │ │ └── array_job_example.sh │ ├── 04_parallel_multiprocessing │ │ ├── multiprocessing_example.py │ │ └── multiprocessing_example.sh │ ├── 05_parallel_joblib │ │ ├── joblib_example.py │ │ └── joblib_example.sh │ ├── 06_parallel_dask │ │ ├── multi_node │ │ │ ├── dask_multinode.py │ │ │ └── dask_multinode.sh │ │ └── single_node │ │ │ ├── dask_singlenode.py │ │ │ └── dask_singlenode.sh │ ├── README.md │ └── interactive.ipynb ├── routing │ ├── batch_igraph.sh │ ├── batch_nx.sh │ ├── data │ │ └── hanko.graphml │ ├── igraph_parallel.py │ ├── nx_parallel.py │ ├── osmnx-graphml.py │ └── readme.md ├── sentinel │ ├── README.md │ ├── helsinki.geojson │ └── sentinelsat_download_from_finhub.py ├── sentinel_without_credentials │ ├── README.md │ └── get_open_sentinel_buckets.py └── zonal_stats │ ├── README.md │ ├── raster-stats │ ├── README.md │ ├── batch_job-stac-parallel.sh │ ├── batch_job_parallel.sh │ ├── batch_job_serial.sh │ ├── zonal-stats-stac-parallel.py │ ├── zonal_stats_parallel.py │ └── zonal_stats_serial.py │ └── xarray-spatial │ ├── xarray-spatial-batch-job.sh │ └── xarray-spatial-zonal-stats.py ├── routing.md ├── snap ├── 01_simple_job │ ├── CreateStackGraph.xml │ └── snap_batch_job.sh ├── 02_array_job │ ├── resample_and_lai.xml │ └── snap_array_job.sh └── README.md └── supercomputer_installations ├── ames-stereo_3.2.0.yml ├── arcgis-python-api-2.1.0.yml └── geoconda ├── geoconda_3.10.6.yml ├── geoconda_3.10.9.yml ├── geoconda_3.11.0.yml └── whitebox_tools_postinstall ├── download_wbt └── start_wbt.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Keep linux line endings always. 2 | * text eol=lf -------------------------------------------------------------------------------- /Copernicus_data_download/CDSE_S3_download_with_polygon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ############### 4 | # Example script to query and download data from Copernicus Data Space Ecosystem WITH POLYGON. 5 | # See readme for connection set up details. 6 | # 7 | # Based on script provided by Maria Yli-Heikkilä (LUKE) - adapted to CDSE by Samantha Wittke and Kylli Ek, CSC - IT center for Science 8 | 9 | 10 | # If you suitable polygon ready, then save it as CSV with geometry in WKT 11 | # https://geojson.io/ can be used for quick creation of the input file in GeoJson format - draw a polygon on map and save the text to a .json file. 12 | # The input file could be also .shp, .gpkg or some other format supported by GDAL. 
13 | ogr2ogr -f CSV area.csv input.json -lco GEOMETRY=AS_WKT 14 | 15 | # Alternatively, if you do not have the polygon ready, but would like to calculate based on some exsisting vector file: 16 | # Note that besides changing the name of the file, you have to change also the layer name, in case of Shape file, it is the same as file name. 17 | # 18 | # ogr2ogr -f CSV area.csv /appl/data/geo/tilastokeskus/tieliikenne/2022/tieliikenne_2022.shp -dialect SQLite -sql "select st_concavehull(st_collect(geometry)) from tieliikenne_2022" -lco GEOMETRY=AS_WKT -t_srs EPSG:4326 19 | 20 | # Get the WKT polygon from file and remove quotes. 21 | # Only the first polygon of the the file is used. 22 | wkt=$(sed '2q;d' area.csv) 23 | wkt2=${wkt//\"/} 24 | 25 | # Provide the timedelta - start date and time 26 | STARTDATE=2023-05-01T00:00:00 27 | 28 | # end date and time ; fixed or set to current date 29 | #CURRENTDATE=$(date +"%Y-%m-%dT%T") 30 | #ENDDATE=$CURRENTDATE 31 | ENDDATE=2023-05-06T23:59:59 32 | 33 | CLOUDCOVER="[0,95]" 34 | 35 | # Baseurl to reach the CDSE catalog with json output 36 | BASEURL="http://catalogue.dataspace.copernicus.eu/resto/api/collections/Sentinel2/search.json?" 37 | 38 | # Query the catalog with previously defined variables, 20 is the default max record number, which you can adapt to your needs 39 | # See https://documentation.dataspace.copernicus.eu/APIs/OpenSearch.html#output-sorting-and-limiting for further options for sorting 40 | QUERY="productType=S2MSI2A&startDate=${STARTDATE}.000Z&completionDate=${ENDDATE}.000Z&cloudCover=${CLOUDCOVER}&geometry=${wkt2}&maxRecords=1" 41 | # echo $BASEURL$QUERY 42 | 43 | wget --output-document=query.json "$BASEURL$QUERY" 44 | 45 | # JSON includes much more information than only product paths -> extract product path from the JSON and safe to 46 | jq -r '.. | .productIdentifier? | select( . != null ) ' query.json | grep "/eodata/" > safe_files.txt 47 | 48 | # Read the file with product paths and download each file from CDSE 49 | while IFS="" read -r FILE || [ -n "$FILE" ] 50 | do 51 | echo $FILE 52 | # Define folder name for each .SAFE file 53 | SAFENAME="$(basename -- $FILE)" 54 | 55 | # Download to local disk 56 | rclone copy -P -v cdse:$FILE /scratch/project_2000599/cdse/$SAFENAME 57 | 58 | # OR Download to Allas 59 | #rclone copy -P -v cdse:$FILE s3allas:yourBucketName/$SAFENAME 60 | done < safe_files.txt 61 | 62 | # Delete temporary files 63 | rm area.csv 64 | rm query.json 65 | rm safe_files.txt -------------------------------------------------------------------------------- /Copernicus_data_download/CDSE_S3_download_with_tile_names.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ############### 4 | # Example script to query and download data from Copernicus Data Space Ecosystem WITH SENTINEL-2 TILE NAMES. 5 | # See readme for connection set up details. 
6 | # 7 | # Based on script provided by Maria Yli-Heikkilä (LUKE) - adapted to CDSE by Samantha Wittke and Kylli Ek, CSC - IT center for Science 8 | 9 | # Provide Sentinel-2 tilenames that you want to download 10 | TILES=("T34VDM" "T34VEM" "T34VEN") 11 | 12 | # Provide the timedelta - start date and time 13 | STARTDATE=2023-05-01T00:00:00 14 | 15 | # end date and time ; fixed or set to current date 16 | #CURRENTDATE=$(date +"%Y-%m-%dT%T") 17 | #ENDDATE=$CURRENTDATE 18 | ENDDATE=2023-05-06T23:59:59 19 | 20 | CLOUDCOVER="[0,95]" 21 | 22 | # Baseurl to reach the CDSE catalog with json output 23 | BASEURL="http://catalogue.dataspace.copernicus.eu/resto/api/collections/Sentinel2/search.json?" 24 | 25 | YEAR=${ENDDATE:0:4} 26 | 27 | for TILE in ${TILES[@]} 28 | 29 | do 30 | # Query the catalog with previously defined variables, 20 is the default max record number, which you can adapt to your needs 31 | # See https://documentation.dataspace.copernicus.eu/APIs/OpenSearch.html#output-sorting-and-limiting for further options for sorting 32 | QUERY="productType=S2MSI2A&startDate=${STARTDATE}.000Z&completionDate=${ENDDATE}.000Z&cloudCover=${CLOUDCOVER}&productIdentifier=${TILE}&maxRecords=20" 33 | # echo $BASEURL$QUERY 34 | wget --output-document=query_${YEAR}_${TILE}.json $BASEURL$QUERY 35 | 36 | # JSON includes much more information than only product paths -> extract product path from the JSON and safe to 37 | jq -r '.. | .productIdentifier? | select( . != null ) ' query_${YEAR}_${TILE}.json | grep "/eodata/" > name_${YEAR}_${TILE}.txt 38 | 39 | # Read the file with product paths and download each file from CDSE to Allas bucket defined above 40 | while IFS="" read -r FILE || [ -n "$FILE" ] 41 | do 42 | echo $FILE 43 | # Define folder name for each .SAFE file 44 | SAFENAME="$(basename -- $FILE)" 45 | 46 | # Download to local disk 47 | rclone copy -P -v cdse:$FILE /scratch/project_2000599/cdse/$SAFENAME 48 | 49 | # OR Download to Allas 50 | #rclone copy -P -v cdse:$FILE s3allas:yourBucketName/$SAFENAME 51 | done < name_${YEAR}_${TILE}.txt 52 | 53 | # Delete temporary files 54 | rm query_${YEAR}_${TILE}.json 55 | rm name_${YEAR}_${TILE}.txt 56 | done 57 | 58 | -------------------------------------------------------------------------------- /Copernicus_data_download/README.md: -------------------------------------------------------------------------------- 1 | # Downloading data from Copernicus Data Space Ecosystem 2 | 3 | The [Copernicus Data Space Ecosystem](https://dataspace.copernicus.eu/) (CDSE) provides multiple ways of querying and downloading data. Check out the [CSC Earth Observation guide](https://docs.csc.fi/support/tutorials/gis/eo_guide) for further information about the CDSE. 4 | 5 | ## CDSE S3 download with rclone 6 | 7 | These examples show how find and to copy data from CDSE S3 object storage using `rclone`: 8 | 9 | 1. Query CDSE Sentinel-2 catalog based on startdate, enddate, cloudcover using [openSearch API](https://documentation.dataspace.copernicus.eu/APIs/OpenSearch.html) 10 | 2. Download the found data from CDSE object storage via `s3` using `rclone`. Data can be downloaded to local disk or directly to CSC's object storage Allas. 11 | 12 | The scripts are otherwise very similar, but the area of interest is defined in 2 different ways: 13 | 14 | * Area is defined by Sentinel-2 tile names: [CDSE_S3_download_with_tile_names.sh](CDSE_S3_download_with_tile_names.sh) 15 | * Area is defined by polygon in a file: [CDSE_S3_download_with_polygon.sh](CDSE_S3_download_with_polygon.sh). 
The polygon many also be calculated as convex hull of a vector layer. 16 | 17 | 18 | To run the script, first **connection details** to must be set up. 19 | 20 | 1. [Get secret and access keys for CDSE](https://documentation.dataspace.copernicus.eu/APIs/S3.html#generate-secrets). 21 | 2. Configure rclone to use CDSE. 22 | * [General rclone configuration instructions](https://rclone.org/docs/#configure) 23 | * For most settings CDSE can use the same as Allas, see [Allas rclone settings](https://docs.csc.fi/data/Allas/using_allas/rclone_local/#configuring-s3-connection-in-windows) 24 | * CDSE endpoint: `eodata.dataspace.copernicus.eu`. 25 | * Name of the remote: `cdse` 26 | 3. If you want to download files to your local disk, you are ready to go. 27 | 4. If you want to copy files to another object storage, for example CSC Allas, then set up `rclone` connection details also for the second service. Follow [Allas: Copying files directly between object storages](https://docs.csc.fi/data/Allas/accessing_allas/#copying-files-directly-between-object-storages) instructions. In this example `s3allas` is the name of the second remote connection. 28 | 29 | The script should work on any Linux/Mac machine that has `rclone` installed. In CSC supercomputers, `rclone` is included in the `allas`-module. The search with polygon versioni requires `GDAL`, which is available in [several modules in Puhti](https://docs.csc.fi/apps/gdal/), for example `geoconda`. 30 | 31 | ## Direct use of S3 data with GDAL, Python and R 32 | CDSE S3 data can also be used directly with [GDAL-based tools, inc Python, R and QGIS](https://docs.csc.fi/support/tutorials/gis/gdal_cloud/#vsis3-reading-and-writing-files-fromto-s3-services). 33 | -------------------------------------------------------------------------------- /NLS_geopackage_tests.md: -------------------------------------------------------------------------------- 1 | ## NLS Geopackage tests 2 | 3 | Main findings: 4 | * Tested using with geopandas, R sf library and ogr2ogr. Geopandas and ogr2ogr both have some good advantages, sf not so much. 5 | * Memory usage: Ogr2ogr is clearly more memory efficient as it seems that it's memory consumption is more or less constant regardless of input size. It probably handles the files in some sort of pieces. Geopanadas and sf on the other hand have to read the whole file into memory at once, but sf memory consumption is about 2x that of geopandas. Ogr2ogr 0.5GB, geopandas up to 15GB for single layer, sf up to 39 GB single layer. 6 | * Computation time: Extracting the largest layers takes 21 and 24 mins with sf and geopandas (only read to memory), 49 mnins for ogr2ogr (includes saving to another file). Interestingly there wasn't significant time performance differences in smaller layers. 7 | * Extracting a small area efficiently is possible. At least geopandas can make use of spatial indexes built into the geopackage format and read a small area defined by a bounding box fast even from largest layers. For the other two this should also be possible as there's an option to use SQL queries, but I couldn't make them perform any faster than reading the whole layer. 8 | * Buffering & similar analysis using SQL queries is possible with ogr2ogr. With sf and geopandas you'll have to use different libraries after reading the file into a dataframe. 9 | * Geopackage standard doesn't enforce a specific name for the geometry column, you can see it with ogrinfo. 10 | * Example code for [R](R/geopackage) and [Python](python/geopackage). 
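As a minimal sketch of the partial reads discussed above (assuming the NLS geopackage path and layer name used in the [R](R/geopackage) example script), `sf` can pull a small row range out of a large layer with an SQL query instead of reading the whole file:

```r
library(sf)

# Path and layer as in the R/geopackage example - adjust to the file you are testing
gpkg  <- "/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-suo_20-02-06.gpkg"
layer <- "suo"

# Read only rows 10-19 instead of the whole multi-gigabyte layer
sql <- sprintf("select * from %s where rowid >= 10 and rowid < 20", layer)
df  <- read_sf(gpkg, query = sql)
print(df)
```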
11 | 12 | ### Testing results 13 | 14 | Geopandas (read) 15 | 16 | Layer |Time |Max rss| 17 | --------------| ----- | ----- | 18 | Hylky |1s | | 19 | Kallioalue (4,5GB)|10:31|6,6GB| 20 | Suo 14GB|24:09 |15GB| 21 | Suo small bbox (7 features)*|0:03|0.8GB| 22 | 23 | *~`geopandas.read_file(file, layer, bbox=)` Takes features that are at least partially within bbox. 24 | 25 | R sf (read) 26 | 27 | Layer |Time |Max rss| 28 | -------------- | ----- | ----- | 29 | Hylky |2s | | 30 | Kallioalue (4,5GB)|8:03|12GB| 31 | Suo 14GB|21:26 |39GB| 32 | 33 | ogr2ogr (read & write) 34 | `ogr2ogr kallio.gpkg MTK-geopackage-test-18-06-07.gpkg -sql "select * from kallioalueet"` 35 | 36 | Layer |Time |Max rss| 37 | -------------- | ----- | ----- | 38 | Kallioalue (4,5GB)|11:09|0.38GB| 39 | Suo 14GB|49:09 |0.33GB| 40 | -------------------------------------------------------------------------------- /R/README.md: -------------------------------------------------------------------------------- 1 | Examples for doing spatial analysis with **R** in CSC computing environment: 2 | 3 | * [puhti](puhti) - examples for different job types in Puhti: simple serial, array and parallel: `snow`, `parallel` and `future`. 4 | * [Working with Allas data from R](allas) 5 | * [Reading NLS topographic database geopackage with R](geopackage) 6 | * [R for LiDAR data](R_LiDAR) - examples and exercises 7 | * Some R packages have in-built support for parallization, for example `raster`, `terra` and `lidR`. 8 | * [raster](raster_predict) - includes also the batch job (parallel) file for Puhti. 9 | * [lidR](R_LiDAR/R_lidar_course_exercises) 10 | * `terra` - just follow [terra manual](https://cran.r-project.org/web/packages/terra/terra.pdf), see for example predict example. For batch job file see `raster` package example. 11 | 12 | References for CSC's R spatial tools: 13 | * [Puhti's R for GIS documentation](https://docs.csc.fi/apps/r-env-for-gis/), at the end of page are several links to good learning materials about R for spatial data analysis. 14 | * [Puhti's R documentation](https://docs.csc.fi/apps/r-env-singularity/) 15 | -------------------------------------------------------------------------------- /R/R_LiDAR/README.md: -------------------------------------------------------------------------------- 1 | Examples using R for LiDAR datasets management: 2 | * [loading Puhti's LiDAR datasets with `rlas`](rlas-DEM_example) - load Puhti's LiDAR datasets for an area of interest and do basic reading and merging operations with the `rlas` package 3 | * [R LiDAR Course exercises](R_lidar_course_exercises) - using `rlas` and `lidR` in Puhti supercluster 4 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/README.md: -------------------------------------------------------------------------------- 1 | ## R lidar exercise CSC's R lidar course 2 | In these exercises the most basic functions of the `rlas` and `lidR` R packages by Jean-Romain are demonstrated. See the documentation for these packages from [rlas](https://cran.r-project.org/web/packages/rlas/index.html) and [lidR](https://github.com/Jean-Romain/lidR/wiki). 3 | 4 | The materials in this repository are based mostly on the above mentioned documentation with some edits and new parts to adapt them to using these libraries in CSC's Puhti supercluster. 
5 | 6 | ## Course related info 7 | The original 2019 CSC's course page: [Lidar data analysis in Taito, with PDAL and R](https://www.csc.fi/web/training/-/lidar-data-analysis-in-taito-with-pdal-and-r) The same material works in Puhti supercomputer as well. 8 | 9 | The data for these excerises is basic NLS lidar data. In Puhti the lidar data can be found from /appl/data/geo/mml/laserkeilaus/2008_latest/ 10 | 11 | The original NLS lidar files might not workd with `lidR`, because of scale errors. For fixing this, use e.g. las2las 12 | 13 | fix one file with: 14 | ``` 15 | las2las -i /scratch//mml/laserkeilaus/2008_17/2017/T522/1/T5224F1.laz -rescale 0.01 0.01 0.01 -auto_reoffset -o ~/outfolder/T5224F1.laz 16 | ``` 17 | 18 | all with (in same directory): 19 | ``` 20 | las2las -i ~/original_las_dir/*.laz -rescale 0.01 0.01 0.01 -auto_reoffset -olaz -odir ~/outdir/ 21 | ``` 22 | 23 | The most up-to-date version of the exercises is in this repository. Download the contents of this repository as a zip file to your project's folders or your home folder in Puhti and unzip it with `unzip R_lidar_2019.zip`. Then connect to Puhti using NoMachine and start RStudio. Open the `R_lidar.Rproj` project from the `r_exercise` folder you just unziped. 24 | 25 | Note that all the necessary software packages are already installed in the Puhti supercluster and thus their installation is not covered in these exercises. To see a description of the installed R spatial packages ready installed see: https://docs.csc.fi/apps/r-env/ 26 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/R_lidar.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/las_files.txt: -------------------------------------------------------------------------------- 1 | /appl/data/geo/mml/laserkeilaus/2008_latest/2019/U442/1/U4422H1.laz 2 | /appl/data/geo/mml/laserkeilaus/2008_latest/2019/U442/1/U4422H2.laz -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/session3.1.-rlas.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The lidR package - LAS formal class" 3 | author: "Eduardo González" 4 | date: "23. February 2019" 5 | output: 6 | html_document: 7 | df_print: paged 8 | theme: flatly 9 | --- 10 | 11 | This materials have been adapted from the official [lidR wiki documentation](https://github.com/Jean-Romain/lidR/wiki) and [A Brief Introduction of lidR](http://xzsunbest.tk/2018/07/30/ABriefIntroductionOfLidR/). 12 | 13 | 14 | # The `rlas` package 15 | 16 | The `rlas` package relies on a modified version of `LASlib` and `LASzip` libraries (by Martin Isenburg) that were modified to be compatible with `R`. See the official documentation of [the `rlas` package](https://cran.r-project.org/web/packages/rlas/index.html) for more information. 17 | 18 | The main use of the `rlas` package is to read and write `.las` and `.laz` binary files used to store LiDAR data. LAS version 1.0 to 1.4 are supported. Point data record format 0,1,2,3,6,7,8 are supported. 
19 | 20 | The tools offer some basic functionality to directly read and manipulate LAS data. 21 | 22 | ```{r} 23 | library(rlas) 24 | file_name <- "/appl/data/geo/mml/laserkeilaus/2008_latest/2019/L331/1/L3313F3.laz" 25 | lasdata <- read.las(file_name) 26 | lasheader <- read.lasheader(file_name) 27 | ``` 28 | 29 | 30 | ## Basic structure of an rlas data object 31 | 32 | You can verify that the an `rlas` object is simply a data table with the data for each lidar point as a row: 33 | ```{r } 34 | class(lasdata) 35 | names(lasdata) 36 | ``` 37 | 38 | See a subset of the lidar data table: 39 | ```{r} 40 | print(lasdata[c(1:10),]) 41 | ``` 42 | 43 | The original laz file size is 190Mb, but its size when loaded in memory to R is 3.3Gb. To see the size of the data table in memory: 44 | ```{r} 45 | size <- object.size(lasdata) 46 | print(size, units = "auto") 47 | ``` 48 | 49 | You may apply filters to columns and what points to load based on their attributes when loading LAS files: 50 | ```{r} 51 | filtered_las <- read.las(file_name, select = "ia", filter = "-keep_first -drop_intensity_below 95") 52 | size <- object.size(filtered_las) 53 | print(size, units = "auto") 54 | ``` 55 | 56 | The example above loads the coordinate columns x, y and z which are always loaded plus the intensity (i) column and the san angle (a) column. Then only the rows representing a first return point are loaded. Check the documentation for this function with `?read.las` or from the [rlas reference manual](https://cran.r-project.org/web/packages/rlas/rlas.pdf) to see all the available values for selection and filtering. Note that the filter values are the same as those in LAStools and can be checked with: 57 | ```{r} 58 | rlas:::lasfilterusage() 59 | ``` 60 | 61 | To write the fildered LAS data as a las or compressed laz file you ned to create the header first and then use `write.las()` function A las or laz file is created following the extension you inditate in the command: 62 | ```{r} 63 | filtered_header <- header_create(filtered_las) 64 | write.las("./outputs/out_las.las", filtered_header, filtered_las) 65 | write.las("./outputs/out_las.laz", filtered_header, filtered_las) 66 | ``` 67 | 68 | It is recommendable to also create an index for LAS file, you can do that with the `writelax()` function. 69 | ```{r} 70 | writelax("./outputs/out_las.laz") 71 | ``` 72 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_catalog_lidR_batchjob_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # SBATCH --account 3 | #SBATCH --account project_2001659 4 | #Name of the job, this makes it easier to identify your job 5 | #output_%j.txt - Everything that would normally be printed into to the terminal when you run a program gets printed to this file. The %j refers to job number so that you don't overwrite the same file for each job 6 | #SBATCH --output=batch_output_%j.txt # File to write the standard output to. 7 | #error_%j.txt - As above but for error messages. It's however always not so clear what messages go to errors and what to output so it's always best to check both. 8 | #SBATCH --error=batch_error_%j.txt # File to write the standard error to. 9 | #Partition you want to submit your job to. 10 | #SBATCH --partition=test # Which queue to use. 
Defines maximum time, memory, tasks, nodes and local storage for job 11 | #Time limit for the job in hh:mm:ss, Once this amount of time has passed the job will be terminated regardless of weather it has finished. 12 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 13 | #Reserve 5 cores: 1 for master and 4 for workers 14 | #Compared to multicore version, use --ntasks setting, not --cpus-per-task 15 | #SBATCH --ntasks=5 # Number of tasks. Upper limit depends on partition. 16 | #Reserve 10000MB (10GB) of memory per node 17 | #SBATCH --mem=10000 # Real memory required per node. 18 | 19 | module load r-env-singularity 20 | 21 | # If you have installed packages this helps resolve problems related to those 22 | if test -f ~/.Renviron; then 23 | sed -i '/TMPDIR/d' ~/.Renviron 24 | sed -i '/OMP_NUM_THREADS/d' ~/.Renviron 25 | fi 26 | 27 | # Specify a temp folder path 28 | echo "TMPDIR=/scratch/" >> ~/.Renviron 29 | 30 | # Remove and creates new output folder 31 | rm -rf batch_output 32 | mkdir batch_output 33 | # Use RMPISNOW instead of Rscript 34 | srun singularity_wrapper exec RMPISNOW --no-save --slave -f simple_catalog_lidR_cluster.R 35 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_catalog_lidR_batchjob_multicore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | # SBATCH --account 3 | #SBATCH --account project_2001659 4 | #Name of the job, this makes it easier to identify your job 5 | #output_%j.txt - Everything that would normally be printed into to the terminal when you run a program gets printed to this file. The %j refers to job number so that you don't overwrite the same file for each job 6 | #SBATCH --output=batch_output_%j.txt # File to write the standard output to. 7 | #error_%j.txt - As above but for error messages. It's however always not so clear what messages go to errors and what to output so it's always best to check both. 8 | #SBATCH --error=batch_error_%j.txt # File to write the standard error to. 9 | #Partition you want to submit your job to. 10 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | #Time limit for the job in hh:mm:ss, Once this amount of time has passed the job will be terminated regardless of weather it has finished. 12 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 13 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 14 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 15 | #The number of cpus used by the lidR task, equal to number of workers. Max 40 in Puhti. 16 | #SBATCH --cpus-per-task=5 # How many processors work on one task. Upper limit depends on number of CPUs per node. 17 | 18 | #Tells the batch job sytem to reserve 8000MB (8GB) of memory for core 19 | #SBATCH --mem-per-cpu=8000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 
20 | 21 | module load r-env-singularity 22 | 23 | # If you have installed packages this helps resolve problems related to those 24 | if test -f ~/.Renviron; then 25 | sed -i '/TMPDIR/d' ~/.Renviron 26 | sed -i '/OMP_NUM_THREADS/d' ~/.Renviron 27 | fi 28 | 29 | # Specify a temp folder path 30 | echo "TMPDIR=/scratch/" >> ~/.Renviron 31 | 32 | # Match thread number with set_lidr_threads(n) setting in the R-script 33 | # echo "OMP_NUM_THREADS=2" >> ~/.Renviron 34 | 35 | # Remove and creates new output folder 36 | rm -rf batch_output_multicore 37 | mkdir batch_output_multicore 38 | srun singularity_wrapper exec Rscript --no-save simple_catalog_lidR_multicore.R 39 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_catalog_lidR_cluster.R: -------------------------------------------------------------------------------- 1 | # lidR supports parallel computation of project/catalog. 2 | # https://rdrr.io/cran/lidR/man/lidR-parallelism.html 3 | # Parallelization is based on chunks. By default 1 chunk = 1 file. 4 | # It is possible to define also chunks smaller than the file. 5 | # This increases the complexity of calculation and calculation time a little bit. 6 | # But reduces the required memory significantly. 7 | # The number of workers should be equal or smaller than the number of chunks. 8 | # In this example 4 workers are used. 9 | # All lidR functions support using workers. 10 | 11 | library(future) 12 | library("lidR") 13 | #Enabling this will print out a little bit more info about the parallelization plan used. 14 | options(lidR.verbose = TRUE) 15 | options(future.availableCores.methods = "Slurm") 16 | 17 | # With plan(cluster) the number of workers is based on batch job reservation details: ntasks or ntasks-per-node. 18 | cl<-getMPIcluster() 19 | fcn <- plan(cluster, workers = cl) 20 | 21 | # In Puhti R snow clusters do not seem to support OpenMP parallelization, so do not use this option in Puhti. 22 | # set_lidr_threads(2) 23 | 24 | # load catalog 25 | project <- readLAScatalog("/appl/data/geo/mml/laserkeilaus/2008_latest/2019/U442/1/") 26 | 27 | # lascheck(ctg_subset) 28 | 29 | # output file naming options 30 | opt_output_files(project) <- "batch_output/dtm_ctg_{XLEFT}_{YBOTTOM}_{ID}" 31 | 32 | # NLS lidar files cover 3 x 3 km, so here 1500 x 1500 m chunk size is used -> 4 chunks per file -> ~16 chunks 33 | # (with some overlapping it will be 20 chunks in practice). 34 | opt_chunk_size(project) <- 1500 35 | 36 | # summary(ctg_subset) 37 | 38 | # Calculate DTM for the catalog, note that the files are written by the catalog itself 39 | # https://rdrr.io/cran/lidR/man/catalog_apply.html 40 | output <- catalog_sapply(project, grid_terrain, algorithm = tin()) 41 | 42 | #Last, stop the snow cluster 43 | stopCluster(cl) 44 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_catalog_lidR_multicore.R: -------------------------------------------------------------------------------- 1 | # lidR supports parallel computation of project/catalog. 2 | # https://rdrr.io/cran/lidR/man/lidR-parallelism.html 3 | # Parallelization is based on chunks. By default 1 chunk = 1 file. 4 | # It is possible to define also chunks smaller than the file. 5 | # This increases the complexity of calculation and calculation time a little bit. 6 | # But reduces the required memory significantly. 7 | # The number of workers should be equal or smaller than the number of chunks. 
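# For a concrete example (worked out from the chunk settings used below): one 3 x 3 km
# NLS tile split into 1.5 x 1.5 km chunks gives (3000/1500)^2 = 4 chunks per file, so a
# catalog of several files provides plenty of chunks for a handful of workers.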
8 | # In this example 4 workers are used. 9 | # All lidR functions support using workers. 10 | 11 | library(future) 12 | library("lidR") 13 | #Enabling this will print out a little bit more info about the parallelization plan used. 14 | options(lidR.verbose = TRUE) 15 | options(future.availableCores.methods = "Slurm") 16 | 17 | # With plan(multicore) the number of workers is based on batch job reservation details. 18 | plan("multicore") 19 | 20 | # In Puhti R OpenMP parallelization does not seem to work with lidR, so do not use this option in Puhti. 21 | # set_lidr_threads(2) 22 | 23 | # load catalog 24 | project <- readLAScatalog("/appl/data/geo/mml/laserkeilaus/2008_latest/2019/U442/1/") 25 | 26 | # lascheck(ctg_subset) 27 | 28 | # output file naming options 29 | opt_output_files(project) <- "batch_output_multicore/dtm_ctg_{XLEFT}_{YBOTTOM}_{ID}" 30 | 31 | # NLS lidar files cover 3 x 3 km, so here 1500 x 1500 m chunk size is used -> 4 chunks per file. 32 | opt_chunk_size(project) <- 1500 33 | 34 | # summary(ctg_subset) 35 | 36 | # Calculate DTM for the catalog, note that the files are written by the catalog itself 37 | # https://rdrr.io/cran/lidR/man/catalog_apply.html 38 | output <- catalog_sapply(project, grid_terrain, algorithm = tin()) 39 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_lidR.R: -------------------------------------------------------------------------------- 1 | #Read the command line argument, which is the path to a las file. 2 | args = commandArgs(trailingOnly=TRUE) 3 | #Gets las_name from argument 4 | if (length(args)==0) { 5 | stop("Please give a las file name.", call.=FALSE) 6 | } else if (length(args)==1) { 7 | # input file 8 | las_name <- args[1] 9 | } 10 | 11 | 12 | library("lidR") 13 | print(las_name) 14 | 15 | # name with tif extension 16 | out_name <- paste0("dtm_", tools::file_path_sans_ext(basename(las_name)),".tif") 17 | 18 | # Open las file 19 | las <- readLAS(las_name) 20 | print(las) 21 | # Calculate DTM and save to disk 22 | dtm <- grid_terrain(las, algorithm = tin()) 23 | writeRaster(dtm, paste0("array_output/", out_name), format="GTiff", overwrite=TRUE) 24 | -------------------------------------------------------------------------------- /R/R_LiDAR/R_lidar_course_exercises/simple_lidR_arrayjob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #Name of the job, this makes it easier to identify your job 4 | #output_%j.txt - Everything that would normally be printed into to the terminal when you run a program gets printed to this file. The %j refers to job number so that you don't overwrite the same file for each job 5 | #SBATCH --output=array_output/array_output_%j.txt # File to write the standard output to. 6 | #error_%j.txt - As above but for error messages. It's however always not so clear what messages go to errors and what to output so it's always best to check both. 7 | #SBATCH --error=array_output/array_error_%j.txt # File to write the standard error to. 8 | #Partition you want to submit your job to. Possible values are serial, parallel, longrun, hugemem and test. In this excerecise we use test as it is for testing, but it shouldn't be used for serious work. See for details. 9 | #SBATCH --partition=small # Which queue to use. 
Defines maximum time, memory, tasks, nodes and local storage for job 10 | #Time limit for the job in hh:mm:ss, Once this amount of time has passed the job will be terminated regardless of weather it has finished. 11 | #SBATCH --time=00:15:00 # Maximum duration of the job. Upper limit depends on partition. 12 | #--array - Tells the batch job system that this is an array job that should be run 3 times. It creates a variable named $SLURM_ARRAY_TASK_ID which will get a different value ranging from 1 to 3 for each task. 13 | #SBATCH --array=1-6 # Indices to specify what array index values should be used. Multiple values may be specified using a comma separated list or a range of values separated by -. 14 | #Tells the batch job system that this is not a parallel task and only one task should be used. Note that this is one task per job, but array job will actually launch 3 simultaneous jobs. 15 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 16 | #Tells the batch job sytem to reserve 1000MB (1GB) of memory for each of the 3 jobs. 17 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 18 | 19 | #As the job is not run on the login node where we submit the job from, it's necessary to load necessary modules in the batch job script. Loading the modules on the login node before sending the batch job will not help. 20 | module load r-env-singularity 21 | 22 | # If you have installed packages this helps resolve problems related to those 23 | if test -f ~/.Renviron; then 24 | sed -i '/TMPDIR/d' ~/.Renviron 25 | fi 26 | 27 | #Read the file to be processed from a list of input files. This is done by getting the line corresponding to the $SLURM_ARRAY_TASK_ID from the input file list. 28 | input=$(sed -n "$SLURM_ARRAY_TASK_ID"p las_files.txt) 29 | 30 | srun singularity_wrapper exec Rscript --no-save simple_lidR.R $input 31 | -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/README.md: -------------------------------------------------------------------------------- 1 | # Basic Lidar data management in Puhti 2 | 3 | For convinience, the full open Lidar datasets from National Land Survey of Finland (Maanmittauslaitos) have been made available in Puhti. This allows researchers to directly use these datasets in their work in Puhti without the need to download particular files. See more information about the LiDAR and other datasets available in Puhti from the [GIS Data in Puhti](https://research.csc.fi/gis_data_in_csc_computing_env) page. 4 | 5 | The following are some simple R scripts demonstrating how the LiDAR dataset can be queried in your R scripts. 
6 | 7 | - [Get LiDAR files intersecting a given polygon](get_lidar_files_function.R) 8 | - [Basic LiDAR Puhti files management with `rlas` package](basic_rlas.R) 9 | - [Example batch job to run basic_rlas.R script](batchjob_rlas_basics.sh) 10 | 11 | See more LiDAR tools and examples in the [R LiDAR Course exercises](../R_lidar_course_exercises) 12 | -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/area_of_interest.dbf: -------------------------------------------------------------------------------- 1 | vA idN 2 | 1 -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/area_of_interest.prj: -------------------------------------------------------------------------------- 1 | PROJCS["ETRS89_TM35FIN_E_N",GEOGCS["GCS_ETRS_1989",DATUM["D_ETRS_1989",SPHEROID["GRS_1980",6378137,298.257222101]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",27],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["Meter",1]] -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/area_of_interest.qpj: -------------------------------------------------------------------------------- 1 | PROJCS["ETRS89 / TM35FIN(E,N)",GEOGCS["ETRS89",DATUM["European_Terrestrial_Reference_System_1989",SPHEROID["GRS 1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],TOWGS84[0,0,0,0,0,0,0],AUTHORITY["EPSG","6258"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4258"]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",27],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["EPSG","3067"]] 2 | -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/area_of_interest.shp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/R/R_LiDAR/rlas-DEM_example/area_of_interest.shp -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/area_of_interest.shx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/R/R_LiDAR/rlas-DEM_example/area_of_interest.shx -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/basic_rlas.R: -------------------------------------------------------------------------------- 1 | ## Use get_lidar_files_function to get LiDAR files for a area of interest 2 | ## and rlas to read and merge the LiDAR data 3 | 4 | library(rlas) 5 | library(foreach) 6 | source("get_lidar_files_function.R") 7 | 8 | # Get lidar file names 9 | lidar_files <- lidar_files_puhti("area_of_interest.shp") 10 | print(lidar_files) 11 | 12 | # Get basic information from the LiDAR files 13 | headers <- vector("list") 14 | for (file in lidar_files){ 15 | headers[[file]] <- read.lasheader(file) 16 | } 17 | names(headers) 18 | 19 | # Preview the LiDAR data for the first 
file in the list 20 | lidar_data <- read.las(lidar_files[1]) # this is a data.frame 21 | tail(lidar_data) 22 | summary(lidar_data) 23 | 24 | # Merge the LiDAR files into a single R data frame 25 | i <- 1:length(lidar_files) 26 | merged_lidar_data <- foreach(i, .combine='rbind') %do% { 27 | read.las(lidar_files[i])} 28 | -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/batchjob_rlas_basics.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=out_R.txt # File to write the standard output to. 4 | #SBATCH --error=err_R.txt # File to write the standard error to. 5 | #SBATCH --time=00:15:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --mem-per-cpu=4000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 7 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 9 | 10 | module load r-env-singularity 11 | 12 | # If you have installed packages this helps resolve problems related to those 13 | if test -f ~/.Renviron; then 14 | sed -i '/TMPDIR/d' ~/.Renviron 15 | fi 16 | 17 | 18 | srun singularity_wrapper exec Rscript --no-save --slave basic_rlas.R 19 | -------------------------------------------------------------------------------- /R/R_LiDAR/rlas-DEM_example/get_lidar_files_function.R: -------------------------------------------------------------------------------- 1 | # Function to get LiDAR files from Puhti's GIS datasets for a 2 | # given area of interest 3 | 4 | # Make sure that you have Puhti's R spatial environment loaded by using module load r-env or by using interactively 5 | # GIS RStudio in NoMachine. 
6 | 7 | lidar_files_puhti <- function(f_poly){ 8 | 9 | # f_poly - the path to a polygon layer with polygons covering an area from 10 | # you want LiDAR files collected 11 | 12 | library(raster) 13 | 14 | # The LiDAR dataset in Puhti is located at /appl/data/geo/mml/laserkeilaus/2008_latest/2008_latest.shp 15 | # You should use the lidar_auto_all.shp index file to spatially look for LiDAR files 16 | f_lidar_index <- "/appl/data/geo/mml/laserkeilaus/2008_latest/2008_latest.shp" 17 | lidar_index <<- shapefile(f_lidar_index) 18 | 19 | # Polygon to get lidar datasets from an area 20 | f_poly <<- "area_of_interest.shp" 21 | poly <- shapefile(f_poly) 22 | 23 | # What lidar index tiles intersect with our polygon 24 | inters <- intersect(lidar_index, poly) 25 | if (is.null(inters)) { 26 | print("ERROR: LiDAR tiles do not cover the study area!!!!") 27 | next 28 | } 29 | 30 | # From the index file, use the "path" attribute to get the file names of 31 | # the LiDAR tiles intersecting your area 32 | files <- vector() 33 | for (i in 1:length(inters@polygons)){ 34 | file <- paste0("/appl/data/geo/",inters$path[i]) 35 | files <- c(files, file) 36 | } 37 | print(files) 38 | return(files) 39 | } 40 | 41 | # Test the function with the example area_of_interest.shp file 42 | # lidar_files <- lidar_files_puhti("./area_of_interest.shp") 43 | # print(lidar_files) 44 | -------------------------------------------------------------------------------- /R/STAC/Readme.md: -------------------------------------------------------------------------------- 1 | # STAC R examples 2 | 3 | The [STAC](https://stacspec.org/en/) is a specification to describe geospatial information, so it can easily **searched and downloaded**. 4 | STAC includes metadata of datasets and links to actual files, the data files are usually stored in the cloud. See [Paituli STAC page](https://paituli.csc.fi/stac.html) for general introduction about STAC and what Collections (=datasets) are included in Paituli STAC. 5 | 6 | In this repository we provide examples to work with: 7 | 8 | * [Paituli STAC API](STAC_CSC_example.R) 9 | * See, also similar [Python STAC examples](../../python/STAC) 10 | 11 | The examples mainly cover data search and download, using [rstac](https://cran.r-project.org/web/packages/rstac/index.html). For analyzing data [gdalcubes](https://gdalcubes.github.io/) or [terra](https://cran.r-project.org/web/packages/terra/index.html) can be used. When working with bigger datasts, gdalcubes supports also parallelization. 12 | 13 | The examples can be run on any computer with R installation. The required R packages can be seen from the beginning of the example scripts. The examples download all data from cloud storage, so relatively good internet connection is needed. For using data in JP2000 format, GDAL must be installed with JP2000-support. 14 | 15 | It is possible to try this script also in CSC Puhti supercomputer. The easiest option is to start RStudio in Puhti web interface. For learning STAC, it is recommended to reserve 1 core and 8 Gb memory. Data analysis part with gdalcubes would benefit from more cores. Currently Puhti r-env does not support JP2000 format, so sentinel2-l2a can not be used with gdalcubes. Searching all collections works. 16 | 17 | In CSC Puhti supercomputer, the examples can be run with [r-env module](https://docs.csc.fi/apps/r-env/), which includes all necessary R packages. 
The easiest is to use RStudio with Puhti web interface: 18 | 19 | * Open [Puhti web interface](https://www.puhti.csc.fi/) 20 | * Click "RStudio" on dashboard 21 | * Select following settings: 22 | * Project: project_2002044 during course, own project otherwise 23 | * Partition: interactive 24 | * Number of CPU cores: 1 25 | * Memory (Gb): 8 26 | * Local disk (GB): 0 27 | * Time: 1:00:00 (or adjust to reasonable) 28 | * Click launch and wait until granted resources 29 | * Click "Connect to RStudio Server" 30 | 31 | 32 | If you want to run the example on your own computer, you'll need RStudio installed, along with the following packages: ```geojsonsf, sf, terra, tidyverse, rstac, httr, gdalcubes```. 33 | 34 | Download the example code to your computer (either by copying the whole repository to your computer with git (`git clone https://github.com/csc-training/geocomputing.git`) or by downloading only the needed file via github webinterface (find `STAC_CSC_example.Rmd` and in the upper right corner of the file "Download raw file button"). 35 | -------------------------------------------------------------------------------- /R/allas/working_with_allas_from_R_S3.R: -------------------------------------------------------------------------------- 1 | # Example script for using Allas directly from an R script: 2 | # - Reading and wrtiing raster and vector files 3 | # - Looping over all files of certain type in a bucket 4 | # - Writing raster and vector files (not working properly) Older version 5 | 6 | # Please notice that this example works ONLY with GDAL-based libraries for spatial data: sf, terra etc. 7 | 8 | # Note this does not work with R version 442 in Puhti, use some other version. 9 | 10 | library("terra") 11 | library("sf") 12 | library("aws.s3") 13 | library("tidyverse") 14 | 15 | # Before starting to use Allas with aws.s3 set up your credentials and endpoint to Allas. 16 | # This example here applies for using Allas from CSC Puhti or Mahti supercomputers. 17 | # To use some other S3 stroage or from some other computer, 18 | # See https://docs.csc.fi/support/tutorials/gis/gdal_cloud/#s3-connection-details 19 | 20 | # 1) Set up your credentials to Allas: 21 | # module load allas 22 | # allas-conf --mode s3cmd 23 | # This is needed only once, as long as you are using the same CSC project. 24 | # This also sets S3 endopoint to .aws/config file. 25 | 26 | # 2) Set S3 region for aws.s3-library. 27 | options("cloudyr.aws.default_region" = "") 28 | 29 | # If you want to WRITE files with terra/sf directly to Allas, set also this. 30 | Sys.setenv("CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE" = "YES") 31 | 32 | # Reading raster file 33 | r <- rast('/vsis3/name_of_your_Allas_bucket/name_of_your_input_raster_file.tif') 34 | 35 | # This should work, but has had some bugs in terra code, so does not work with any R version in Puhti (8.8.2023) 36 | writeRaster(r, filename='/vsis3/name_of_your_Allas_bucket/name_of_your_output_raster_file.tif') 37 | 38 | # Reading vector file 39 | v <- st_read('/vsis3/name_of_your_Allas_bucket/name_of_your_input_vector_file.gpkg') 40 | 41 | # Writing vector file 42 | st_write(v, '/vsis3/name_of_your_Allas_bucket/name_of_your_output_vector_file.gpkg') 43 | 44 | # Looping through all files in a bucket, having the same file type (tif). 
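# (get_bucket_df() below returns a data frame describing the objects in the bucket;
# the rasters themselves are then opened through GDAL's /vsis3/ virtual file system.)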
45 | # First get list of all objects in the bucket 46 | all_files_in_bucket <- get_bucket_df(name_of_your_Allas_bucket) 47 | # Filter out only .tif-files and keep only the file name information (=Key) 48 | tif_files = all_files_in_bucket %>% filter(str_detect(Key, '.tif$')) %>% select(Key) 49 | # Loop through the files, here just printing the extent of each file as example. 50 | for (row in 1:nrow(tif_files)) { 51 | filepath <- paste('/vsis3/name_of_your_Allas_bucket/', tif_files[row,], sep = "") 52 | print (filepath) 53 | r <- raster(filepath) 54 | print (extent(r)) 55 | } 56 | 57 | 58 | -------------------------------------------------------------------------------- /R/geopackage/README.md: -------------------------------------------------------------------------------- 1 | ## Reading NLS topographic database geopackage with R 2 | The NLS topographic database has been saved into several geopackage files in Puhti at /appl/data/geo/mml/maastotietokanta/20XX/gpkg. The larger layers are in their own gpkg files and the smaller layers have been bundled into a single file. The larger layers are quite large and reading them takes some time. If the whole layers are not however needed it is possible to use SF package in R to only read the desired parts of the files. The examples on how to read only certain rows can be found in the read_gpkg.R script. 3 | 4 | In Puhti use the [r-env-singularity module](https://docs.csc.fi/apps/r-env-singularity/) for R. 5 | 6 | Similar examples for reading the geopackage with Python using Geopandas can be found [here](https://github.com/csc-training/geocomputing/tree/master/python/geopackage). 7 | -------------------------------------------------------------------------------- /R/geopackage/read_gpkg.R: -------------------------------------------------------------------------------- 1 | # Examples on reading data from NLS geopackages with sf. 2 | # The geopackges are quite large files so reading the whole thing takes a while. 3 | # We can however read parts of it quickly without having to inspect each row as shown in below examples. 4 | 5 | library(sf) 6 | fn_muut <- "/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-muut_20-02-06.gpkg" 7 | fn_suo <- "/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-suo_20-02-06.gpkg" 8 | #Reading a layer into a dataframe. Some layers are large, but for smaller layers this can be quick enough. 9 | read_whole_layer <- function(){ 10 | layer="hylky" 11 | df<-read_sf(fn_muut, layer) 12 | print(df) 13 | } 14 | 15 | # Geopackage is internally an sqlite database which can be connected to and queried. 16 | # The read_sf function takes a query= parameter that allows us to specify an SQL query to select only some parts of data. 17 | # The given SQL is handled by OGR, so see https://www.gdal.org/ogr_sql.html for available further details. 18 | # Basically selection based on any attribute is possible, but selection by geometry does not seem to be possible. 19 | 20 | # SQL selections can be used in several ways: 21 | 22 | # Reading rows in range 10-20. Only the rows that we want will be read regardless of the actual number of rows in the layer. 
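# With the default values below the sprintf() call expands to:
#   select * from suo where rowid >= 10 and rowid < 20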
23 | read_rows_in_range <- function(){ 24 | layer<-"suo" 25 | start <- 10 26 | end <- 20 27 | sql <- sprintf("select * from %s where rowid >= %s and rowid < %s",layer, start, end) 28 | df<-read_sf(fn_suo, layer=layer, query=sql) 29 | print(df) 30 | } 31 | 32 | #As above but for specific rows 33 | read_specific_rows <- function(){ 34 | layer<-"suo" 35 | rows<-c(1,5,100) 36 | sql <- sprintf("select * from %s where rowid in (%s)",layer, paste(rows, collapse=", ")) 37 | print(sql) 38 | df<-read_sf(fn_suo, layer=layer, query=sql) 39 | print(df) 40 | } 41 | 42 | 43 | #We can use the query parameter to ask rows based on any attribute not just rowid. This may however be slow depending on number of rows and indexes available in the geopackage. If the column you want to use is not indexed you can create the index as follows 44 | create_index <- function(){ 45 | layer<-"suo" 46 | attr_col<-"mtk_id" 47 | attr_value<-219920480 48 | con <- dbConnect(RSQLite::SQLite(), fn) 49 | res<-dbSendQuery(con, sprintf("CREATE INDEX index_%s_%s ON %s (%s)",layer, attr_col, layer, attr_col)) 50 | dbClearResult(res) 51 | dbDisconnect(con) 52 | } 53 | 54 | #After this you can quickly query the layer based on the column. 55 | read_by_attribute <- function(){ 56 | layer<-"suo" 57 | attr_col<-"mtk_id" 58 | attr_value<-219920480 59 | 60 | sql <- sprintf("select * from %s where %s=%s", layer, attr_col, attr_value) 61 | df<-read_sf(fn,layer=layer, query=sql) 62 | print(df) 63 | 64 | } 65 | 66 | 67 | #If we want to query based on a bounding box efficiently we need to be able to take advantage of spatial indexing. The NLS Geopackage includes a spatial index for each layer already so we don't have to worry about creating it. Depending on how your version of GDAL has been compiled we still may need to enable spatilite extension to be able to take advantage of the indexing (=use RTreeIntersects function). To do this first open connection to the geopackage, enable spatialite extension and then supply that connection to read_sf rather than the filename. You can also try to just supply the filename to read_sf function and skip enabling spatialite as this may also work. 68 | 69 | read_area <- function(){ 70 | con <- dbConnect(RSQLite::SQLite(), fn) 71 | 72 | #Linux: 73 | res<-dbSendQuery(con, "select load_extension('libspatialite.so')") 74 | #Windows: 75 | #res<-dbSendQuery(con, "select load_extension('libspatialite.dll')") 76 | 77 | dbClearResult(res) 78 | bb<-c(374692, 6671989, 379750, 6676677) 79 | layer<-"suo" 80 | geom_col<-"sijainti_alue" 81 | sql <- sprintf("select * from %s where rowid in (select id from rtree_%s_%s where id match RTreeIntersects(%s,%s,%s,%s))",layer, layer, geom_col, bb[1],bb[2],bb[3],bb[4]) 82 | df<-read_sf(con, query=sql) 83 | dbDisconnect(con) 84 | print(df) 85 | } 86 | 87 | 88 | -------------------------------------------------------------------------------- /R/puhti/01_serial/Contours_simple.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This scipt can be used for serial or array jobs. 3 | # Here countours are calculated based on a DEM file and saved in GeoPackage format. 4 | # The file given as input is a 10m DEM file from Finnish NLS. 
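# Input mapsheets are read from ../mapsheets.txt (one GeoTIFF path per line);
# one GeoPackage with contour lines is written per mapsheet.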
5 | 6 | # load terra library 7 | library(terra) 8 | 9 | # Set the working directory with RStudio 10 | # mainDir <- "/scratch/project_2002044/students/ekkylli/geocomputing/R/puhti/01_serial" 11 | # setwd(mainDir) 12 | 13 | mapsheets <- readLines('../mapsheets.txt') 14 | 15 | #Calculate contours and save the results as GeoPackage 16 | for (mapsheet in mapsheets){ 17 | DEM <- rast(mapsheet) 18 | file <- gsub("tif", "gpkg", basename(mapsheet)) 19 | contours <- as.contour(DEM) 20 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 21 | } -------------------------------------------------------------------------------- /R/puhti/01_serial/serial_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 5 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 6 | #SBATCH --time=0:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 9 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 10 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 11 | 12 | module load r-env 13 | 14 | # Clean up .Renviron file in home directory 15 | if test -f ~/.Renviron; then 16 | sed -i '/TMPDIR/d' ~/.Renviron 17 | fi 18 | 19 | # Specify a temp folder path 20 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron 21 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron 22 | 23 | srun apptainer_wrapper exec Rscript --no-save Contours_simple.R 24 | -------------------------------------------------------------------------------- /R/puhti/02_parallel_future/Calc_contours_future_cluster.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This script can be used for parallel jobs. 3 | # The countours are calculated and saved in GeoPackage format. 4 | # The file given as input is a 10m DEM file from Finnish NLS. 5 | # The input files are listed in the mapsheet.txt file 6 | 7 | # For parallel tasks future package is used. 
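# Added note (not part of the original script): plan(cluster) below expects the MPI cluster
# started by RMPISNOW in parallel_batch_job_future_cluster.sh. On a single node the
# Calc_contours_future_multicore.R variant with plan(multicore) can be used instead.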
8 | 9 | # load libraries 10 | library(furrr) 11 | library(terra) 12 | 13 | # Start the snow cluster and create a plan with future package 14 | cl<-getMPIcluster() 15 | plan(cluster, workers = cl) 16 | 17 | # The function run on each core 18 | funtorun <- function(mapsheet) { 19 | DEM <- rast(mapsheet) 20 | file <- gsub("tif", "gpkg", basename(mapsheet)) 21 | contours <- as.contour(DEM) 22 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 23 | } 24 | 25 | # Read the mapsheets from external file 26 | mapsheets <- readLines('../mapsheets.txt') 27 | 28 | # Give cluster the work to be done 29 | system.time(a<-future_map(mapsheets,funtorun)) 30 | 31 | #Stop cluster 32 | stopCluster(cl) 33 | 34 | -------------------------------------------------------------------------------- /R/puhti/02_parallel_future/Calc_contours_future_multicore.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This script can be used for parallel jobs. 3 | # The countours are calculated and saved in GeoPackage format. 4 | # The file given as input is a 10m DEM file from Finnish NLS. 5 | # The input files are listed in the mapsheet.txt file 6 | 7 | # For parallel tasks future package is used. 8 | 9 | # load libraries 10 | library(furrr) 11 | library(terra) 12 | 13 | # With plan(multicore) the number of workers is based on batch job reservation details. 14 | plan("multicore") 15 | 16 | # The function run on each core 17 | funtorun <- function(mapsheet) { 18 | DEM <- rast(mapsheet) 19 | file <- gsub("tif", "gpkg", basename(mapsheet)) 20 | contours <- as.contour(DEM) 21 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 22 | } 23 | 24 | # Read the mapsheets from external file 25 | mapsheets <- readLines('../mapsheets.txt') 26 | 27 | # Give cluster the work to be done 28 | system.time(a<-future_map(mapsheets,funtorun)) 29 | -------------------------------------------------------------------------------- /R/puhti/02_parallel_future/parallel_batch_job_future_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=output.txt # File to write the standard output to. 5 | #SBATCH --error=errors.txt # File to write the standard error to. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #Reserve cores for 1 master + 3 workers 8 | #SBATCH --ntasks=4 # Number of tasks. Upper limit depends on partition. 9 | #Test partition is used for testing, for real jobs use either serial or parallel depending on how many nodes you need. 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 
12 | 13 | module load r-env 14 | 15 | # If you have installed packages this helps resolve problems related to those 16 | if test -f ~/.Renviron; then 17 | sed -i '/TMPDIR/d' ~/.Renviron 18 | sed -i '/OMP_NUM_THREADS/d' ~/.Renviron 19 | fi 20 | 21 | # Specify a temp folder path 22 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron 23 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron 24 | 25 | srun apptainer_wrapper exec RMPISNOW --no-save --slave -f Calc_contours_future_cluster.R 26 | -------------------------------------------------------------------------------- /R/puhti/02_parallel_future/parallel_batch_job_future_multicore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=output.txt # File to write the standard output to. 5 | #SBATCH --error=errors.txt # File to write the standard error to. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | #Equal to number of workers. Max 40 in Puhti. 11 | #SBATCH --cpus-per-task=3 # How many processors work on one task. Upper limit depends on number of CPUs per node. 12 | 13 | module load r-env 14 | 15 | # If you have installed packages this helps resolve problems related to those 16 | if test -f ~/.Renviron; then 17 | sed -i '/TMPDIR/d' ~/.Renviron 18 | sed -i '/OMP_NUM_THREADS/d' ~/.Renviron 19 | fi 20 | 21 | # Specify a temp folder path 22 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron 23 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron 24 | 25 | srun apptainer_wrapper exec Rscript --no-save Calc_contours_future_multicore.R 26 | -------------------------------------------------------------------------------- /R/puhti/03_parallel_snow/Calc_contours_snow.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This scipt can be used for parallel jobs. 3 | # Here countours are calculated and saved in GeoPackage format. 4 | # The file given as input is a 10m DEM file from Finnish NLS. 5 | # The input files are listed in the mapsheet.txt file 6 | 7 | # For parallel tasks the snow package is used. 8 | 9 | # Start the snow cluster 10 | cl<-getMPIcluster() 11 | 12 | # The function run on each core 13 | # The R modules need to be loaded inside the functions. 14 | # The variables from outside of this function are not visible. 
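# For example, the 'mapsheets' vector defined later in this script is not visible inside
# funtorun; it is passed in as an argument by clusterApply. If other objects were needed on
# the workers, they could be exported with e.g. clusterExport(cl, "variable_name")
# (a hedged suggestion, not used in this script).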
15 | funtorun<-function(mapsheet) { 16 | DEM <- rast(mapsheet) 17 | file <- gsub("tif", "gpkg", basename(mapsheet)) 18 | contours <- as.contour(DEM) 19 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 20 | } 21 | 22 | # load terra library 23 | clusterEvalQ(cl, library(terra)) 24 | 25 | # Read the mapsheets from external file 26 | mapsheets <- readLines('../mapsheets.txt') 27 | 28 | # Give cluster the work to be done 29 | system.time(a<-clusterApply(cl,mapsheets,funtorun)) 30 | 31 | #Stop cluster 32 | stopCluster(cl) -------------------------------------------------------------------------------- /R/puhti/03_parallel_snow/parallel_batch_job_snow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=output_%j.txt # File to write the standard output to. 5 | #SBATCH --error=errors_%j.txt # File to write the standard error to. 6 | #SBATCH --time=00:04:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #Reserve cores for 1 master + 3 workers 8 | #SBATCH --ntasks=4 # Number of tasks. Upper limit depends on partition. 9 | #Test partition is for small test jobs only. For real jobs use either serial or parallel partition dependeing on how many nodes you need 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 12 | 13 | module load r-env 14 | 15 | if test -f ~/.Renviron; then 16 | sed -i '/TMPDIR/d' ~/.Renviron 17 | fi 18 | 19 | # Specify a temp folder path 20 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron 21 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron 22 | 23 | srun apptainer_wrapper exec RMPISNOW --no-save --slave -f Calc_contours_snow.R 24 | -------------------------------------------------------------------------------- /R/puhti/04_parallel_foreach/Calc_contours_foreach.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This script can be used for parallel jobs. 3 | # The countours are calculated and saved in GeoPackage format. 4 | # The file given as input is a 10m DEM file from Finnish NLS. 5 | # The input files are listed in the mapsheet.txt file 6 | 7 | # For parallel tasks the foreach with doMPI is used. 8 | # See https://docs.csc.fi/apps/r-env-singularity/ 9 | 10 | library(doMPI,quietly=TRUE) 11 | cl<-startMPIcluster() 12 | registerDoMPI(cl) 13 | 14 | # Read the mapsheets from external file, in this case from user's workdirectory 15 | mapsheets <- readLines('../mapsheets.txt') 16 | 17 | # The function run on each core 18 | # The R modules need to be loaded inside the functions. 19 | # The variables from outside of this function are not visible. 20 | 21 | funtorun<-function(mapsheet) { 22 | DEM <- rast(mapsheet) 23 | file <- gsub("tif", "gpkg", basename(mapsheet)) 24 | contours <- as.contour(DEM) 25 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 26 | } 27 | 28 | # Run funtorun function in parallel for each mapsheet. .export passes variables and .packages the necessary packages. 29 | # If return value is used .combine can be used to specify which function to use for combining results. 
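# A hypothetical variant (not used below): to process every mapsheet instead of the first
# three, the index could be seq_along(mapsheets), e.g.
#   foreach(i = seq_along(mapsheets), .packages = c("terra"), .combine = "c") %dopar% { funtorun(mapsheets[i]) }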
30 | 31 | a<-foreach(i=1:3, .packages=c("terra"), .combine="c") %dopar% { 32 | funtorun(mapsheets[i]) 33 | } 34 | #Print combined return values. In this case names of created shapefiles. 35 | print(a) 36 | closeCluster(cl) 37 | mpi.quit() 38 | 39 | -------------------------------------------------------------------------------- /R/puhti/04_parallel_foreach/parallel_batch_job_foreach.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=output.txt # File to write the standard output to. 5 | #SBATCH --error=errors.txt # File to write the standard error to. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=3 # Number of tasks. Upper limit depends on partition. 8 | #Test partition is used for testing, for real jobs use either serial or parallel depending on how many nodes you need. 9 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 10 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 11 | 12 | module load r-env 13 | 14 | if test -f ~/.Renviron; then 15 | sed -i '/TMPDIR/d' ~/.Renviron 16 | fi 17 | 18 | # Specify a temp folder path 19 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron 20 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron 21 | 22 | srun apptainer_wrapper exec Rscript --no-save --slave Calc_contours_foreach.R 23 | -------------------------------------------------------------------------------- /R/puhti/05_array/Contours_array.R: -------------------------------------------------------------------------------- 1 | # This is an spatial analysis example script for using R in CSC Puhti 2 | # This script can be used for serial or array jobs. 3 | # Here a .tif DEM file is provided as an argument 4 | # and then countours are calculated and saved in GeoPackage format. 5 | # The file given as input is a 10m DEM file from Finnish NLS. 6 | 7 | # Load the necessary libraries 8 | library(terra) 9 | 10 | # Read the command line argument, which is the path of the .tif file. 11 | args = commandArgs(trailingOnly=TRUE) 12 | 13 | if (length(args)==0) { 14 | stop("Please give the map sheet number", call.=FALSE) 15 | } else if (length(args)==1) { 16 | # The filepath given to this script goes to variable mapsheet 17 | mapsheet <- args[1] 18 | } 19 | print(mapsheet) 20 | 21 | # Calculate contours 22 | DEM <- rast(mapsheet) 23 | file <- gsub("tif", "gpkg", basename(mapsheet)) 24 | contours <- as.contour(DEM) 25 | # Save the results as GeoPackage 26 | writeVector(contours, file, filetype="GPKG", overwrite=TRUE) 27 | -------------------------------------------------------------------------------- /R/puhti/05_array/array_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=slurm-%A_%a.out # File to write the standard output to. %A is replaced by the job ID and %a with the array index. 5 | #SBATCH --error=slurm-%A_%a.err # File to write the standard error to. %A is replaced by the job ID and %a with the array index. Defaults to slurm-%A_%a.out if not provided. 
6 | #SBATCH --time=00:02:00              # Maximum duration of the job. Upper limit depends on partition.
7 | #SBATCH --mem-per-cpu=1000           # Minimum memory required per usable allocated CPU. Default units are megabytes.
8 | #SBATCH --array=1-3                  # Indices to specify what array index values should be used. Multiple values may be specified using a comma separated list or a range of values separated by -.
9 | #SBATCH --ntasks=1                   # Number of tasks. Upper limit depends on partition.
10 | #SBATCH --partition=small           # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job
11 | 
12 | # load the Puhti module for R
13 | module load r-env
14 | 
15 | if test -f ~/.Renviron; then
16 | sed -i '/TMPDIR/d' ~/.Renviron
17 | fi
18 | 
19 | # Specify a temp folder path
20 | # echo "TMPDIR=/scratch//tmp" >> ~/.Renviron
21 | echo "TMPDIR=$PWD/tmp" >> ~/.Renviron
22 | 
23 | # read the file that has filepaths for mapsheets and pick one row according to variable $SLURM_ARRAY_TASK_ID
24 | name=$(sed -n "$SLURM_ARRAY_TASK_ID"p ../mapsheets.txt)
25 | 
26 | # run the analysis command
27 | srun apptainer_wrapper exec Rscript Contours_array.R $name
-------------------------------------------------------------------------------- /R/puhti/mapsheets.txt: --------------------------------------------------------------------------------
1 | /appl/data/geo/mml/dem10m/2019/W3/W33/W3333.tif
2 | /appl/data/geo/mml/dem10m/2019/W3/W33/W3332.tif
3 | /appl/data/geo/mml/dem10m/2019/W3/W33/W3331.tif
4 | 
-------------------------------------------------------------------------------- /R/puhti/mapsheets_URLs.txt: --------------------------------------------------------------------------------
1 | /vsicurl/https://www.nic.funet.fi/index/geodata/mml/dem10m/2019/W3/W33/W3333.tif
2 | /vsicurl/https://www.nic.funet.fi/index/geodata/mml/dem10m/2019/W3/W33/W3332.tif
3 | /vsicurl/https://www.nic.funet.fi/index/geodata/mml/dem10m/2019/W3/W33/W3331.tif
4 | 
-------------------------------------------------------------------------------- /R/raster_predict/README.md: --------------------------------------------------------------------------------
1 | > **_NOTE:_** This example applies to the [raster package](https://cran.r-project.org/web/packages/raster/index.html). The newer [terra package](https://cran.r-project.org/web/packages/terra/index.html) is replacing the raster package. For parallelization with terra, just follow the terra documentation, see for example the predict example in the [terra manual](https://cran.r-project.org/web/packages/terra/terra.pdf).
2 | 
3 | Some of the functions in the `raster` package support parallel computing.
4 | 
5 | This example includes a **parallel job** using the `predict()` function from the `raster` package. The presence/absence of the R logo is predicted in an image. This type of model is often used to predict species distributions. See the dismo package for more of that.
6 | 
7 | The example is from https://www.rdocumentation.org/packages/raster/versions/2.5-8/topics/predict and it has been
8 | simplified and adapted to be run through the batch job system in Puhti.
9 | 
10 | `r_run.sh` shows how to submit parallel R jobs to the Puhti SLURM system.
11 | 
12 | For further details see the comments in the script. For general instructions on how to use R in Puhti see https://docs.csc.fi/apps/r-env-singularity/
13 | 
14 | > **_NOTE:_** The `raster` package uses the `snow` package for parallelization, which cannot be used in Puhti in an interactive session or in RStudio.
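A rough sketch of the terra-based alternative mentioned in the note at the top (not part of this example; check the terra manual for the exact arguments). terra's `predict()` can parallelize internally via its `cores` argument, so no snow/MPI cluster is needed:

```r
# Sketch only: assumes a SpatRaster 'logo_rast' and a fitted model 'model'
# (for example the glm built in rtest.R, with the data read using terra::rast()).
library(terra)
# r1 <- predict(logo_rast, model, cores = 4, cpkgs = "terra")
```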
15 | -------------------------------------------------------------------------------- /R/raster_predict/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/R/raster_predict/Rplots.pdf -------------------------------------------------------------------------------- /R/raster_predict/r_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=output.txt # File to write the standard output to. 4 | #SBATCH --error=errors.txt # File to write the standard error to. 5 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --ntasks=4 # Number of tasks. Upper limit depends on partition. 7 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 8 | #SBATCH --mem=1000 # Real memory required per node. 9 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 10 | 11 | module load r-env-singularity 12 | 13 | if test -f ~/.Renviron; then 14 | sed -i '/TMPDIR/d' ~/.Renviron 15 | fi 16 | 17 | srun singularity_wrapper exec RMPISNOW --no-save -f rtest.R 18 | 19 | -------------------------------------------------------------------------------- /R/raster_predict/rlogo.grd: -------------------------------------------------------------------------------- 1 | [general] 2 | creator=R package 'raster' 3 | created= 2010-04-16 14:57:47 4 | [georeference] 5 | nrows= 77 6 | ncols= 101 7 | xmin= 0 8 | ymin= 0 9 | xmax= 101 10 | ymax= 77 11 | projection= +proj=merc +datum=WGS84 12 | [data] 13 | datatype= FLT4S 14 | byteorder= little 15 | nbands= 3 16 | bandorder= BIL 17 | categorical= FALSE 18 | levels= NA 19 | minvalue= 0:0:0 20 | maxvalue= 255:255:255 21 | nodatavalue= -3.4e+38 22 | [legend] 23 | legendtype= 24 | values= 25 | color= 26 | [description] 27 | layername= red:green:blue 28 | history= 29 | -------------------------------------------------------------------------------- /R/raster_predict/rlogo.gri: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/R/raster_predict/rlogo.gri -------------------------------------------------------------------------------- /R/raster_predict/rtest.R: -------------------------------------------------------------------------------- 1 | #Example from: https://www.rdocumentation.org/packages/raster/versions/2.5-8/topics/predict 2 | 3 | # A simple model to predict the location of the R in the R-logo using 20 presence points 4 | # and 50 (random) pseudo-absence points. This type of model is often used to predict 5 | # species distributions. See the dismo package for more of that. 
6 | 7 | library(raster) 8 | setwd("raster_predict") 9 | 10 | # create a RasterStack or RasterBrick with with a set of predictor layers 11 | logo <- brick("rlogo.grd") 12 | names(logo) 13 | 14 | # known presence and absence points 15 | p <- matrix(c(48, 48, 48, 53, 50, 46, 54, 70, 84, 85, 74, 84, 95, 85, 16 | 66, 42, 26, 4, 19, 17, 7, 14, 26, 29, 39, 45, 51, 56, 46, 38, 31, 17 | 22, 34, 60, 70, 73, 63, 46, 43, 28), ncol=2) 18 | 19 | a <- matrix(c(22, 33, 64, 85, 92, 94, 59, 27, 30, 64, 60, 33, 31, 9, 20 | 99, 67, 15, 5, 4, 30, 8, 37, 42, 27, 19, 69, 60, 73, 3, 5, 21, 21 | 37, 52, 70, 74, 9, 13, 4, 17, 47), ncol=2) 22 | 23 | 24 | # extract values for points 25 | xy <- rbind(cbind(1, p), cbind(0, a)) 26 | v <- data.frame(cbind(pa=xy[,1], extract(logo, xy[,2:3]))) 27 | 28 | #build a model, here an example with glm 29 | model <- glm(formula=pa~., data=v) 30 | 31 | #Serial code for making predictions: 32 | #r1 <- predict(logo, model, progress='text') 33 | 34 | #Run predict function using an mpi cluster. Note that in Puhti the cluster is already available, you shouldn't start it yourself but use the handle provided by getMPIcluster. 35 | cl<-getMPIcluster() 36 | r1 <- clusterR(logo, predict, args=list(model), cl=cl) 37 | stopCluster(cl) 38 | 39 | 40 | #Plot the original data and results 41 | plotRGB(logo) 42 | points(p, bg='blue', pch=21) 43 | points(a, bg='red', pch=21) 44 | plot(r1,col = gray.colors(10, start = 0, end = 1, gamma = 1, alpha = NULL)) 45 | quit() 46 | 47 | -------------------------------------------------------------------------------- /R/virtual_rasters.R: -------------------------------------------------------------------------------- 1 | # Example scripts for using virtual rasters. 2 | # Here contours are calculated based on a 2m DEM file and saved in GeoPackage format. 3 | # As input 4 different options are used: 4 | # 1) Paituli files, copied to Puhti local disk 5 | # 2) Paituli files, with URLs (from Espoo) 6 | # 3) GeoPortti GeoCubes file (physically in cPouta next to Puhti, in Kajaani) 7 | # 4) FMI STAC file (physically somewhere else in Finland) 8 | # All of these data sources cover all Finland. 9 | # The contours are calculated only on a subset of the data, defined by BBOX. 10 | # BBOX location can be changed to any other location in Finland. 
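# Added note (not part of the original example): a virtual raster over your own GeoTIFFs
# could be built with terra as well, roughly like this (paths are hypothetical):
#   tifs <- list.files("/scratch/project_200xxxx/dem_tiles", pattern = "\\.tif$", full.names = TRUE)
#   vrt(tifs, "/scratch/project_200xxxx/dem_tiles/all_tiles.vrt")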
11 | 
12 | library(terra)
13 | # For measuring computation time
14 | library(tictoc)
15 | 
16 | 
17 | # The extent of the data used
18 | # bbox_vrt <- ext(489000, 490000, 7333000, 7334000)
19 | # Bigger BBOX
20 | bbox_vrt <- ext(480000, 490000, 7330000, 7340000)
21 | 
22 | 
23 | # Paituli files in Puhti locally
24 | puhti_dem2m_vrt <- "/appl/data/geo/mml/dem2m/dem2m_direct.vrt"
25 | puhti_result_file <- 'Puhti_vrt_contours.gpkg'
26 | 
27 | # Paituli files (URL)
28 | # Link from here: https://www.nic.funet.fi/index/geodata/mml/dem2m/2008_latest/
29 | paituli_dem2m_vrt <- "/vsicurl/https://www.nic.funet.fi/index/geodata/mml/dem2m/2008_latest/dem2m.vrt"
30 | paituli_result_file <- 'Paituli_vrt_contours.gpkg'
31 | 
32 | 
33 | # GeoPortti GeoCubes files
34 | # Link from here: https://vm0160.kaj.pouta.csc.fi/geocubes/fileaccess/
35 | geocubes_dem2m_vrt <- "/vsicurl/https://vm0160.kaj.pouta.csc.fi/mml/korkeusmalli/km2/2022/km2_2022_2m.vrt"
36 | geocubes_result_file <- 'Geocubes_vrt_contours.gpkg'
37 | 
38 | # FMI STAC files
39 | # Link from here: https://pta.data.lit.fmi.fi/stac/items/MML-DTM-2m/MML-DTM-2m_2020.json
40 | fmi_dem2m_vrt <- '/vsicurl/https://pta.data.lit.fmi.fi/dem/etrs-tm35fin-n2000/MML-DTM-2020-2m-height.vrt'
41 | fmi_result_file <- 'FMI_vrt_contours.gpkg'
42 | 
43 | # Function to handle each dataset
44 | contours_from_vrt <- function(vrt_path, bbox, output_file){
45 | # Create a terra SpatRaster object from the virtual raster file.
46 | # Data is not read into R at this phase.
47 | vrt <- rast(vrt_path)
48 | # Crop the SpatRaster to our area of interest.
49 | DEM = crop(vrt, bbox)
50 | # Calculate contours.
51 | contours <- as.contour(DEM)
52 | # Save result to a file.
53 | writeVector(contours, output_file, filetype="GPKG", overwrite=TRUE)
54 | }
55 | 
56 | # Run the function with each data source and see how long it takes.
57 | tic("Paituli files, in Puhti locally")
58 | contours_from_vrt(puhti_dem2m_vrt, bbox_vrt, puhti_result_file)
59 | toc()
60 | 
61 | tic("Paituli files, URL")
62 | contours_from_vrt(paituli_dem2m_vrt, bbox_vrt, paituli_result_file)
63 | toc()
64 | 
65 | tic("GeoCubes in cPouta")
66 | contours_from_vrt(geocubes_dem2m_vrt, bbox_vrt, geocubes_result_file)
67 | toc()
68 | 
69 | tic("FMI")
70 | contours_from_vrt(fmi_dem2m_vrt, bbox_vrt, fmi_result_file)
71 | toc()
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Geocomputing using CSC resources
2 | 
3 | This repository contains examples for use of different geospatial applications. Many of the examples are for [CSC supercomputer Puhti](https://docs.csc.fi/computing/systems-puhti/) but may also be helpful for other systems (or your own computer). Please find a list of all geospatial software that is available on Puhti in [CSC docs](https://docs.csc.fi/apps/#geosciences).
4 | 
5 | ## Puhti
6 | 
7 | ### R
8 | * [Overview](./R/README.md)
9 | * [Puhti](./R/puhti) - serial/array/parallel processing with R.
10 | * [R for LiDAR data](./R/R_LiDAR): lidR and rlas
11 | * [Working with Allas data from R](./R/allas)
12 | * [Reading NLS topographic database geopackage with R](./R/geopackage)
13 | 
14 | 
15 | ### Python
16 | * [Overview](./python/README.md)
17 | * [Puhti](./python/puhti/README.md) - serial/array/parallel processing with Python.
18 | * [Working with Allas data from Python](./python/allas)
19 | * [Reading NLS topographic database geopackage with Python](./python/geopackage/README.md)
20 | * [GRASS multiprocessing from Python](./python/grass_multiprocessing_with_python/README.md)
21 | * [Routing](./python/routing/readme.md)
22 | * [Sentinel data download from Finhub using sentinelsat](python/sentinel/README.md)
23 | * [STAC, xarray and dask for downloading and processing data](./python/STAC/stac_xarray_dask_example.ipynb)
24 | * [Zonal statistics in parallel](./python/zonal_stats/README.md)
25 | * [Python Dask-geopandas](./python/dask_geopandas/README.md)
26 | 
27 | ### Other tools
28 | * [FORCE](./force/README.md)
29 | * [GDAL](./gdal/readme.md)
30 | * [GRASS](./grass/readme.md)
31 | * [PDAL](./pdal/README.md)
32 | * [SNAP graph processing tool gpt](./snap/README.md)
33 | 
34 | ### Use cases / longer examples
35 | * [GeoPortti share GitHub repository](https://github.com/geoporttishare?tab=repositories) includes several longer examples of HPC usage.
36 | 
37 | ## Pouta
38 | * [Instructions to set up geospatial tools](./pouta/README.md) on virtual machines in [CSC's cPouta environment](https://docs.csc.fi/cloud/pouta/), including OpenDroneMap, GeoServer, ArcPy and MetaShape.
39 | 
40 | ## CSC Notebooks
41 | * [Setting up geospatial Python Jupyter environment](./noppe/Readme.md)
42 | 
43 | ## Download
44 | 
45 | If you have Git installed, you can download these scripts to any computer with it. To do this, first navigate to the destination folder (in Puhti this could be your project's **projappl** or **scratch** folder):
46 | 
47 | `cd /projappl/`
48 | or
49 | `cd /scratch/`
50 | 
51 | Then clone this repository there:
52 | 
53 | `git clone https://github.com/csc-training/geocomputing.git`
54 | 
55 | If Git is not available, you can also download the files as a zip file from the `Code` drop-down menu above.
56 | 
57 | 
58 | ## License
59 | These examples are free to use under the CC BY 4.0 license unless marked otherwise.
60 | 
61 | ## Acknowledgement
62 | 
63 | Please acknowledge CSC and Geoportti in your publications; it is important for project continuation and funding reports.
As an example, you can write "The authors wish to thank CSC - IT Center for Science, Finland (urn:nbn:fi:research-infras-2016072531) and the Open Geospatial Information Infrastructure for Research (Geoportti, urn:nbn:fi:research-infras-2016072513) for computational resources and support 64 | -------------------------------------------------------------------------------- /force/LEVEL2_parameters.prm: -------------------------------------------------------------------------------- 1 | ++PARAM_LEVEL2_START++ 2 | 3 | # INPUT/OUTPUT DIRECTORIES 4 | # ------------------------------------------------------------------------ 5 | FILE_QUEUE = /users/johannes/force/file_queue.txt 6 | DIR_LEVEL2 = /scratch/project_2000599/force/output_L2A 7 | DIR_LOG = /scratch/project_2000599/force 8 | DIR_TEMP = /scratch/project_2000599/force/temp 9 | 10 | # DIGITAL ELEVATION MODEL 11 | # ------------------------------------------------------------------------ 12 | FILE_DEM = NULL 13 | DEM_NODATA = -32767 14 | 15 | # DATA CUBES 16 | # ------------------------------------------------------------------------ 17 | DO_REPROJ = TRUE 18 | DO_TILE = TRUE 19 | FILE_TILE = NULL 20 | TILE_SIZE = 30000 21 | BLOCK_SIZE = 3000 22 | RESOLUTION_LANDSAT = 30 23 | RESOLUTION_SENTINEL2 = 10 24 | ORIGIN_LON = -25 25 | ORIGIN_LAT = 60 26 | PROJECTION = GLANCE7 27 | RESAMPLING = CC 28 | 29 | # RADIOMETRIC CORRECTION OPTIONS 30 | # ------------------------------------------------------------------------ 31 | DO_ATMO = TRUE 32 | DO_TOPO = FALSE 33 | DO_BRDF = TRUE 34 | ADJACENCY_EFFECT = TRUE 35 | MULTI_SCATTERING = TRUE 36 | 37 | # WATER VAPOR CORRECTION OPTIONS 38 | # ------------------------------------------------------------------------ 39 | DIR_WVPLUT = NULL 40 | WATER_VAPOR = NULL 41 | 42 | # AEROSOL OPTICAL DEPTH OPTIONS 43 | # ------------------------------------------------------------------------ 44 | DO_AOD = TRUE 45 | DIR_AOD = NULL 46 | 47 | # CLOUD DETECTION OPTIONS 48 | # ------------------------------------------------------------------------ 49 | MAX_CLOUD_COVER_FRAME = 75 50 | MAX_CLOUD_COVER_TILE = 75 51 | CLOUD_THRESHOLD = 0.225 52 | SHADOW_THRESHOLD = 0.02 53 | 54 | # RESOLUTION MERGING 55 | # ------------------------------------------------------------------------ 56 | RES_MERGE = IMPROPHE 57 | 58 | # CO-REGISTRATION OPTIONS 59 | # ------------------------------------------------------------------------ 60 | DIR_COREG_BASE = NULL 61 | COREG_BASE_NODATA = -9999 62 | 63 | # MISCELLANEOUS OPTIONS 64 | # ------------------------------------------------------------------------ 65 | IMPULSE_NOISE = TRUE 66 | BUFFER_NODATA = FALSE 67 | 68 | # TIER LEVEL 69 | # ------------------------------------------------------------------------ 70 | TIER = 1 71 | 72 | # PARALLEL PROCESSING 73 | # ------------------------------------------------------------------------ 74 | # Multiprocessing options (NPROC, DELAY) only apply when using the batch 75 | # utility force-level2. They are not used by the core function force-l2ps. 
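# Added note (based on the accompanying force/README.md): set NPROC to match the number of
# CPU cores reserved in the batch job (force_batch_job.sh reserves --cpus-per-task=8).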
76 | # ------------------------------------------------------------------------ 77 | NPROC = 8 78 | NTHREAD = 2 79 | PARALLEL_READS = FALSE 80 | DELAY = 3 81 | TIMEOUT_ZIP = 30 82 | 83 | # OUTPUT OPTIONS 84 | # ------------------------------------------------------------------------ 85 | OUTPUT_FORMAT = GTiff 86 | OUTPUT_DST = FALSE 87 | OUTPUT_AOD = FALSE 88 | OUTPUT_WVP = FALSE 89 | OUTPUT_VZN = FALSE 90 | OUTPUT_HOT = FALSE 91 | OUTPUT_OVV = TRUE 92 | 93 | ++PARAM_LEVEL2_END++ 94 | -------------------------------------------------------------------------------- /force/README.md: -------------------------------------------------------------------------------- 1 | # FORCE example & benchmarks 2 | 3 | This is an example of using FORCE to process L1 Sentinel images to L2 using the **force-level2** command. FORCE documentation can be found here 4 | 5 | https://force-eo.readthedocs.io/en/latest/index.html 6 | 7 | ## Repository content 8 | 9 | * **file_queue.txt** - the queue file that has all Sentinel images to be processed 10 | * **LEVEL2_parameters.prm** - the parameter file which holds all processing related parameters. Remember to change the NPROC to number of CPUs you reserved 11 | * **force_batch_job.sh** - the batch job file used to submit the job to Puhti 12 | 13 | ## Benchmarks 14 | 15 | Processing 4 L1C Sentinel-images to L2A. Test images can be found from /appl/data/geo/sentinel/s2_example_data/L1C 16 | 17 | relevant parameters in .prm file 18 | 19 | **DO_TOPO = FALSE** 20 | **NPROC = number of CPU you reserved in the batch job file** 21 | **NTHREAD = 2** 22 | 23 | ### 4CPU 24 | 25 | * Nodes: 1 26 | * Cores per node: 4 27 | * CPU Utilized: 01:33:09 28 | * CPU Efficiency: 94.60% of 01:38:28 core-walltime 29 | * Job Wall-clock time: 00:24:37 30 | 31 | ### 8CPU 32 | 33 | * Nodes: 1 34 | * Cores per node: 8 35 | * CPU Utilized: 01:35:51 36 | * CPU Efficiency: 80.05% of 01:59:44 core-walltime 37 | * Job Wall-clock time: 00:14:58 38 | 39 | ### 16CPU 40 | 41 | * Nodes: 1 42 | * Cores per node: 16 43 | * CPU Utilized: 01:37:39 44 | * CPU Efficiency: 42.19% of 03:51:28 core-walltime 45 | * Job Wall-clock time: 00:14:28 46 | 47 | # CONCLUSION 48 | 49 | From the benchmark runs, it seems that **a good rule of thumb is that the optimal number of CPU cores is approximately twice the amount of images processed in parallel.** In this example we had 4 images and 8 CPU cores which produced **80% CPU** efficiency, but 16 CPU cores only **42%**. 50 | 51 | Maximum number of available CPU cores for FORCE in Puhti is one full node which is **40 CPU** cores 52 | 53 | This example used approximately **34GB** memory while processing 4 images at the same time. 
54 | 55 | -------------------------------------------------------------------------------- /force/file_queue.txt: -------------------------------------------------------------------------------- 1 | /appl/data/geo/sentinel/s2_example_data/L1C/S2A_MSIL1C_20200320T094031_N0209_R036_T34VFM_20200320T101331.SAFE QUEUED 2 | /appl/data/geo/sentinel/s2_example_data/L1C/S2A_MSIL1C_20200422T095031_N0209_R079_T35VLG_20200422T133517.SAFE QUEUED 3 | /appl/data/geo/sentinel/s2_example_data/L1C/S2B_MSIL1C_20200407T095029_N0209_R079_T35VLG_20200407T115232.SAFE QUEUED 4 | /appl/data/geo/sentinel/s2_example_data/L1C/S2B_MSIL1C_20200427T095029_N0209_R079_T35VLG_20200427T115137.SAFE QUEUED -------------------------------------------------------------------------------- /force/force_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 4 | #SBATCH --time=01:00:00 # Maximum duration of the job. Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=8 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem=40G # Real memory required per node. 8 | 9 | module load force 10 | srun force-level2 /users/johannes/force/LEVEL2_parameters.prm 11 | -------------------------------------------------------------------------------- /gdal/gdal_batch_job_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ToDo: change project name in the row below 3 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 4 | # SBATCH --reservation=geocomputing_wed # Only available during the course 5 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 6 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 7 | #SBATCH --time 0:05:00 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 10 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 11 | #SBATCH --mem-per-cpu=300 # Minimum memory required per usable allocated CPU. Default units are megabytes. 12 | 13 | # Load geoconda module to have GDAL commandline tools available. 14 | module load parallel geoconda 15 | 16 | # Find the files that have .tif ending, we do not want to process the .tif.aux.xml files in the same folders. 17 | # Run the GDAL script for each of the found files. 18 | 19 | find /appl/data/geo/mml/dem10m/2019/W3/W33 -name '*.tif' | \ 20 | parallel -j $SLURM_CPUS_PER_TASK bash gdal_parallel.sh {} 21 | -------------------------------------------------------------------------------- /gdal/gdal_batch_job_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ToDo: change project name in the row below 3 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 4 | # SBATCH --reservation=geocomputing_wed # Only available during the course 5 | #SBATCH --output=slurm-%j.out # File to write the standard output to. 
%j is replaced by the job ID. 6 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 7 | #SBATCH --time 0:05:00 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 10 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 11 | #SBATCH --mem-per-cpu=300 # Minimum memory required per usable allocated CPU. Default units are megabytes. 12 | 13 | # Load geoconda module to have GDAL commandline tools available. 14 | module load geoconda 15 | 16 | # Run the bash script, which includes the GDAL commands. 17 | srun bash gdal_serial.sh 18 | -------------------------------------------------------------------------------- /gdal/gdal_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get the file name given as input (argument) for this script. 4 | in=$1 5 | 6 | # Define output file name, based on input file name 7 | out=$(basename $in) 8 | 9 | # Change the coordinate system to EPSG:2393, which is the old Finnish YKJ (=KKJ3) 10 | gdalwarp $in $out -of COG -t_srs EPSG:2393 -overwrite 11 | -------------------------------------------------------------------------------- /gdal/gdal_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Find the files that have .tif ending, we do not want to process the .tif.aux.xml files in the same folders. 4 | for i in $(find /appl/data/geo/mml/dem10m/2019/W3/W33 -name '*.tif') 5 | 6 | # Process the files 7 | do 8 | # Define output file name, based on input file name 9 | out=$(basename $i) 10 | # Change the coordinate system to EPSG:2393, which is the old Finnish YKJ (=KKJ3) 11 | # ToDo: change project name and username in the row below 12 | gdalwarp $i /scratch/project_20xxxxx/students/cscusername/geocomputing/gdal/$out -of COG -t_srs EPSG:2393 13 | done 14 | -------------------------------------------------------------------------------- /grass/01_serial_cli/grass_cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | #Set file paths and names 5 | DEMFILE="/appl/data/geo/mml/dem10m/2019/V4/V41/V4132.tif" 6 | GRASSINPUT="V4132" 7 | GRASSOUTPUT="contours" 8 | OUTPUT="/scratch/project_2000599/grass/output/V4132.gpkg" 9 | 10 | # Register external GeoTIFF in current mapset: 11 | r.external input=$DEMFILE output=$GRASSINPUT --verbose --overwrite 12 | 13 | # Set GRASS region 14 | g.region raster=$GRASSINPUT 15 | 16 | # Perform GRASS analysis, here calculate contours from DEM 17 | r.contour in=$GRASSINPUT out=$GRASSOUTPUT minlevel=200 maxlevel=800 step=10 --overwrite 18 | 19 | #Write output to file 20 | v.out.ogr input=$GRASSOUTPUT output=$OUTPUT --overwrite 21 | 22 | # These can be left out, just debug info 23 | echo "\n\n ***DEBUG INFO***" 24 | echo "GRASS version" 25 | g.version 26 | 27 | echo "GRASS env settings: gisdatabase, location, mapset" 28 | g.gisenv 29 | 30 | echo "Available datasets:" 31 | g.list type=all -m 32 | 33 | echo "Input file info" 34 | r.info $GRASSINPUT --verbose 35 | 36 | echo "Output info" 37 | v.info $GRASSOUTPUT --verbose -------------------------------------------------------------------------------- /grass/01_serial_cli/grass_cli_serial.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 4 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 5 | #SBATCH --time=0:05:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 7 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 8 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 9 | #SBATCH --mem-per-cpu=4000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | 11 | module load grassgis 12 | 13 | # Run the GRASS script with temporary location 14 | grass --tmp-location EPSG:3067 --exec bash grass_cli.sh 15 | -------------------------------------------------------------------------------- /grass/02_python_scripting_serial/python_scripting_serial.py: -------------------------------------------------------------------------------- 1 | import grass.script as gscript 2 | import grass.script as gcore 3 | import pprint 4 | 5 | import json 6 | 7 | file='/appl/data/geo/mml/dem10m/2019/W3/W33/W3331.tif' 8 | grassfile='W3331' 9 | grasscontoursfile='W3331_contours' 10 | contoursfile="/scratch/project_2000599/grass/output/V4132.gpkg" 11 | 12 | # Register external GeoTIFF in current mapset: 13 | gscript.parse_command("r.external", input=file,output=grassfile,flags="e",overwrite=True) 14 | 15 | # Set GRASS region 16 | gscript.run_command('g.region', rast=grassfile) 17 | 18 | # Perform GRASS analysis, here calculate contours from DEM 19 | gscript.run_command('r.contour', input=grassfile, output=grasscontoursfile, minlevel=200, maxlevel=800, step=10, overwrite=True) 20 | 21 | #Write output to file 22 | gscript.run_command('v.out.ogr', input=grasscontoursfile, output=contoursfile, overwrite=True) 23 | 24 | # These can be left out, just debug info 25 | # TODO: not working properly! 26 | print( "\n\n ***DEBUG INFO***") 27 | print( "GRASS version") 28 | print(gscript.read_command("g.version")) 29 | 30 | print("\nGRASS env settings: gisdatabase, location, mapset") 31 | print(gscript.read_command("g.gisenv", flags="s")) 32 | 33 | print("\nAvailable datasets:") 34 | print(gscript.read_command("g.list", type="all", flags='m')) 35 | 36 | print("\nInput file info") 37 | print(gscript.read_command("r.info", map=grassfile, flags='g')) 38 | 39 | print("\nOutput info") 40 | print(gscript.read_command("v.info", map=grasscontoursfile, flags='g')) -------------------------------------------------------------------------------- /grass/02_python_scripting_serial/python_scripting_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 4 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 5 | #SBATCH --time=0:05:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --partition=test # Which queue to use. 
Defines maximum time, memory, tasks, nodes and local storage for job 7 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 8 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 9 | #SBATCH --mem-per-cpu=4000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | 11 | module load grassgis 12 | 13 | # Run the GRASS Python script with temporary location 14 | grass --tmp-location EPSG:3067 --exec python3 python_scripting_serial.py 15 | -------------------------------------------------------------------------------- /grass/03_pygrass_serial/pygrass_serial.py: -------------------------------------------------------------------------------- 1 | from grass.pygrass.modules.shortcuts import general as g 2 | from grass.pygrass.modules.shortcuts import raster as r 3 | from grass.pygrass.modules.shortcuts import vector as v 4 | #from grass.pygrass.modules.shortcuts import temporal as t 5 | 6 | from grass.pygrass.modules.grid import GridModule 7 | 8 | file='/appl/data/geo/mml/dem10m/2019/W3/W33/W3331.tif' 9 | grassfile='W3331' 10 | grasscontoursfile='W3331_contours' 11 | contoursfile="/scratch/project_2000599/grass/output/V4132.gpkg" 12 | 13 | # Register external GeoTIFF in current mapset: 14 | r.external(input=file,output=grassfile,flags="e",overwrite=True) 15 | 16 | # Set GRASS region 17 | g.region(raster=grassfile) 18 | 19 | # Perform GRASS analysis, here calculate contours from DEM 20 | r.contour(input=grassfile, output=grasscontoursfile, minlevel=200, maxlevel=800, step=10, overwrite=True) 21 | 22 | #Write output to file 23 | v.out_ogr(input=grasscontoursfile, output=contoursfile, overwrite=True) 24 | 25 | # These can be left out, just debug info 26 | # TODO: not working properly! 27 | print( "\n\n ***DEBUG INFO***") 28 | print( "GRASS version") 29 | print(g.version()) 30 | 31 | print("GRASS env settings: gisdatabase, location, mapset") 32 | print(g.gisenv()) 33 | 34 | print("Available datasets:") 35 | print(g.list(type="all", flags='m')) 36 | 37 | print("Input file info") 38 | print(r.info(map=grassfile, verbose=True)) 39 | 40 | print("Output info") 41 | print(v.info(map=grasscontoursfile, verbose=True)) -------------------------------------------------------------------------------- /grass/03_pygrass_serial/pygrass_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 4 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 5 | #SBATCH --time=0:05:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 7 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 8 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 9 | #SBATCH --mem-per-cpu=4000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 
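# Added note (hedged, following grass/readme.md): with bigger datasets the temporary GRASS
# location can fill the default TMPDIR; one option is to point it to scratch first, e.g.
#   export TMPDIR=/scratch/project_200xxxx/grass/tmp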
10 | 11 | module load grassgis 12 | 13 | # Run the PyGRASS script with temporary location 14 | grass --tmp-location EPSG:3067 --exec python3 pygrass_serial.py -------------------------------------------------------------------------------- /grass/04_pygrass_parallel/pygrass_parallel_with_gridmodule.py: -------------------------------------------------------------------------------- 1 | import grass.script as gscript 2 | from grass.pygrass.modules.shortcuts import general as g 3 | from grass.pygrass.modules.shortcuts import raster as r 4 | from grass.pygrass.modules.shortcuts import vector as v 5 | #from grass.pygrass.modules.shortcuts import temporal as t 6 | 7 | from grass.pygrass.modules.grid import GridModule 8 | 9 | file='/appl/data/geo/mml/dem10m/2019/W3/W33/W3331.tif' 10 | grassfile='W3331' 11 | grasscontoursfile='W3331_contours' 12 | aspectfile="/scratch/project_2000599/grass/output/aspect.tif" 13 | cpus=4 14 | 15 | # Register external GeoTIFF in current mapset: 16 | r.external(input=file,output=grassfile,flags="e",overwrite=True) 17 | 18 | # Set GRASS region 19 | g.region(raster=grassfile) 20 | 21 | #Perform GRASS analysis, here calculate contours from DEM, parallelization with GridModule 22 | region = gscript.region() 23 | width = region['cols'] // 2 + 1 24 | height = region['rows'] // 2 + 1 25 | 26 | grd = GridModule('r.slope.aspect', 27 | width=width, height=height, overlap=2, 28 | processes=cpus, split=False, 29 | elevation=grassfile, 30 | aspect='aspect', overwrite=True) 31 | grd.run() 32 | 33 | # grd = GridModule('r.contour', 34 | # width=width, height=height, overlap=20, 35 | # processes=cpus, input=grassfile, 36 | # output=grasscontoursfile, 37 | # minlevel=200, maxlevel=800, step=10, overwrite=True) 38 | # grd.run() 39 | 40 | #Write output to file 41 | r.out_gdal(input='aspect', output=aspectfile, overwrite=True) 42 | #r.out_ogr(input=grasscontoursfile, output=outfile, overwrite=True) 43 | 44 | # These can be left out, just debug info 45 | g.version() 46 | g.gisenv() 47 | g.list(type="all", flags='m') 48 | r.info(map=grassfile, verbose=True) 49 | v.info(map=grasscontoursfile, verbose=True) 50 | -------------------------------------------------------------------------------- /grass/04_pygrass_parallel/pygrass_parallel_with_gridmodule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 4 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 5 | #SBATCH --time=0:05:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 7 | #SBATCH --nodes=1 # Number of compute nodes. Upper limit depends on partition. 8 | #SBATCH --ntasks=4 # Number of tasks. Upper limit depends on partition. 9 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 
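# Added note (assumption, not in the original scripts): pygrass_parallel_with_gridmodule.py
# hard-codes cpus=4 for GridModule; keep that in sync with the --ntasks=4 reservation above,
# for example by reading the value from the $SLURM_NTASKS environment variable in the script.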
10 | 
11 | module load grassgis
12 | 
13 | # Run the PyGRASS script with temporary location
14 | grass --tmp-location EPSG:3067 --exec python3 pygrass_parallel_with_gridmodule.py
15 | 
-------------------------------------------------------------------------------- /grass/readme.md: --------------------------------------------------------------------------------
1 | # GRASS GIS example batch jobs for Puhti supercomputer
2 | 
3 | * [GRASS shell scripts](https://grasswiki.osgeo.org/wiki/GRASS_Python_Scripting_Library). [Example](01_serial_cli)
4 | * [GRASS Python Scripting Library](https://grasswiki.osgeo.org/wiki/GRASS_Python_Scripting_Library). [Example](02_python_scripting_serial)
5 | * [PyGRASS](https://grasswiki.osgeo.org/wiki/Python/pygrass). Examples: [basic serial](03_pygrass_serial) and [parallel with GridModule](04_pygrass_parallel)
6 | * In these examples a temporary location is used; in many cases it is better to use a permanent GRASS mapset and location. If using a temporary location with bigger datasets, use compute nodes with a [local NVMe disk](https://docs.csc.fi/computing/running/creating-job-scripts-puhti/#local-storage), which have more temporary space available, or set TMPDIR to a folder in scratch (`export TMPDIR=/scratch/project_200xxxx/grass/tmp`
7 | ).
8 | Python Scripting suits simpler cases where chaining existing tools is enough; PyGRASS enables data access from Python.
9 | See the [GRASS page in CSC Docs](https://docs.csc.fi/apps/grass/#references) for additional external references.
10 | 
-------------------------------------------------------------------------------- /machineLearning/README.md: --------------------------------------------------------------------------------
1 | 
2 | # Scripts for the CSC Practical machine learning for spatial data course have been moved to https://github.com/csc-training/GeoML .
3 | 
4 | 
5 | 
-------------------------------------------------------------------------------- /noppe/Readme.md: --------------------------------------------------------------------------------
1 | # Noppe
2 | 
3 | CSC Noppe is a service providing a common, easy-to-use exercise environment for courses. Noppe currently supports Jupyter Lab/Notebooks and RStudio. Current log-in options are: CSC accounts, HAKA, Virtu and MOOC.fi.
4 | It is possible to use existing Docker containers or, if you have special requirements, you can also create the Docker containers yourself.
5 | 
6 | * [Noppe documentation](https://docs.csc.fi/cloud/csc_notebooks/), see the teacher's guide for setting up a Noppe application for your own course.
7 | * [Noppe log-in](https://noppe.csc.fi) 8 | * [Installation files of CSC and UH GIS courses](https://github.com/csc-training/course_computing_environments/tree/main/noppe) 9 | -------------------------------------------------------------------------------- /pdal/01_crop_pipeline.json: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | { 5 | "pipeline":[ 6 | "/appl/data/geo/mml/laserkeilaus/2008_latest/2008/L413/1/L4131H3.laz", 7 | { 8 | "type":"filters.crop", 9 | "bounds":"([379591,379978],[6673858,6674143])" 10 | }, 11 | { 12 | "type":"writers.las", 13 | "filename":"output.laz" 14 | } 15 | ] 16 | } 17 | 18 | -------------------------------------------------------------------------------- /pdal/01_split_laz.sh: -------------------------------------------------------------------------------- 1 | mkdir -p data 2 | 3 | origin_x=379190 4 | origin_y=6673340 5 | piece_size=400 6 | for x in 0 1 7 | do 8 | for y in 0 1 9 | do 10 | bb=($[$origin_x+$piece_size*$x] $[$origin_x+$piece_size*($x+1)] $[$origin_y+$piece_size*$y] $[$origin_y+$piece_size*($y+1)]) 11 | echo pdal pipeline 01_crop_pipeline.json --filters.crop.bounds="([${bb[0]},${bb[1]]}],[${bb[2]},${bb[3]}])" --writers.las.filename=data/part_$x$y.laz 12 | pdal pipeline 01_crop_pipeline.json --filters.crop.bounds="([${bb[0]},${bb[1]]}],[${bb[2]},${bb[3]}])" --writers.las.filename=data/part_$x$y.laz 13 | 14 | done 15 | done 16 | 17 | 18 | -------------------------------------------------------------------------------- /pdal/02_pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "pipeline":[ 3 | "data/part_00.laz", 4 | { 5 | "type":"filters.smrf", 6 | "window":33, 7 | "slope":1.0, 8 | "threshold":0.15, 9 | "cell":1.0 10 | }, 11 | { 12 | "type":"filters.range", 13 | "limits":"Classification[2:2]" 14 | }, 15 | { 16 | "type":"writers.gdal", 17 | "filename":"data/exercise2.tif", 18 | "output_type":"min", 19 | "resolution":1.0 20 | } 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /pdal/03_batch_job_gnu_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 4 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 5 | 6 | #Partition you want to submit your job to. 7 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | 9 | #Time limit for the job in hh:mm:ss, Once this amount of time has passed the job will be terminated regardless of weather it has finished. 10 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 11 | 12 | #Tells the batch job system that this is not a parallel task and only one task should be used. Note that this is one task per job, but array job will actually launch 3 simultaneous jobs. 13 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 14 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 15 | 16 | #Tells the batch job sytem to reserve 1000MB (1GB) of memory for each of the 3 jobs. 17 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. 
Default units are megabytes. 18 | 19 | #As the job is not run on the login node where we submit the job from, the necessary modules have to be loaded in the batch job script. Loading the modules on the login node will not help. 20 | module load parallel geoconda 21 | #Change to the directory where you have the files 22 | 23 | cd /scratch/project_2000599/geocomputing/pdal 24 | 25 | 26 | find data -name '*.laz' | \ 27 | parallel -I{} -j $SLURM_CPUS_PER_TASK pdal pipeline --readers.las.filename=data/{/.}.laz --writers.gdal.filename=data/{/.}.tif 02_pipeline.json {} -------------------------------------------------------------------------------- /pdal/04_batch_job_array.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #Billing project for the job 3 | #SBATCH --account=project_200xxxx # Choose the project to be billed 4 | 5 | #Output file. Everything that would normally be printed to the terminal when you run a program gets printed to this file. The %j refers to the job number so that you don't overwrite the same file for each job 6 | #SBATCH --output=output_%j.txt # File to write the standard output to. 7 | 8 | #As above but for error messages. It's however not always clear which messages go to the error file and which to the output file, so it's best to check both. 9 | #SBATCH --error=error_%j.txt # File to write the standard error to. 10 | 11 | #Partition you want to submit your job to. In this exercise we use the small partition; the test partition can be used for quick tests, but it shouldn't be used for serious work. See [CSC Docs](https://docs.csc.fi) for details about available partitions. 12 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 13 | 14 | #Time limit for the job in hh:mm:ss. Once this amount of time has passed the job will be terminated regardless of whether it has finished. 15 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 16 | 17 | #Tells the batch job system that this is an array job that should be run 4 times. During each run the $SLURM_ARRAY_TASK_ID variable will get a different value, ranging from 1 to 4. This will be used to select different input files. 18 | #SBATCH --array=1-4 # Indices to specify what array index values should be used. Multiple values may be specified using a comma separated list or a range of values separated by -. 19 | 20 | #Tells the batch job system that this is not a parallel task and only one task should be used. Note that this is one task per job, but the array job will actually launch 4 simultaneous jobs. 21 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 22 | 23 | #Tells the batch job system to reserve 1000MB (1GB) of memory for each of the 4 jobs. 24 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 25 | 26 | #As the job is not run on the login node where we submit the job from, the necessary modules have to be loaded in the batch job script. Loading the modules on the login node will not help. 27 | module load geoconda 28 | #Change to the directory where you have the files 29 | 30 | cd /scratch/project_2000599/geocomputing/pdal 31 | #Read the file to be processed from a list of input files. This is done by getting the line corresponding to $SLURM_ARRAY_TASK_ID from the input file list.
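#For example, when SLURM_ARRAY_TASK_ID is 2, the command below becomes input=$(sed -n "2"p 04_filelist.csv),
#which picks the second line of 04_filelist.csv, in this case data/part_01.laz, so every array task gets its own input file.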
32 | input=$(sed -n "$SLURM_ARRAY_TASK_ID"p 04_filelist.csv) 33 | 34 | #Create output name from input by exchanging .laz to .tif. 35 | name=$(echo "$input" | cut -f 1 -d '.') 36 | output=data/$(echo "$name" | cut -f 2 -d '/').tif 37 | 38 | 39 | #Run the pipeline as in previous exercise. Note that it is possible to override input and output files in your pipeline json from the commandline. 40 | pdal pipeline --readers.las.filename=$input --writers.gdal.filename=$output 02_pipeline.json 41 | 42 | 43 | -------------------------------------------------------------------------------- /pdal/04_filelist.csv: -------------------------------------------------------------------------------- 1 | data/part_00.laz 2 | data/part_01.laz 3 | data/part_10.laz 4 | data/part_11.laz 5 | -------------------------------------------------------------------------------- /pdal/07_batch_job_python.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 5 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 6 | #SBATCH --mem-per-cpu=4G # Minimum memory required per usable allocated CPU. Default units are megabytes. 7 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | 9 | module load geoconda 10 | srun python 07_pdal_ground.py -------------------------------------------------------------------------------- /pdal/07_pdal_ground.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from mpl_toolkits.mplot3d import Axes3D 4 | from multiprocessing import Pool 5 | from pathlib import Path 6 | import json 7 | import matplotlib.pyplot as plt 8 | import pandas as pd 9 | import pdal 10 | import time 11 | 12 | #Location of .laz files, relative to the script 13 | input_dir="data" 14 | 15 | ## How many parallel processes do we want to use 16 | parallel_processes = 4 17 | 18 | # Filter laz with SMRF and create Pandas dataframe with points in it. 19 | def pdal2df(input_file): 20 | 21 | pipe = [ 22 | input_file, 23 | { 24 | "type":"filters.smrf", 25 | "window":33, 26 | "slope":1.0, 27 | "threshold":0.15, 28 | "cell":1.0 29 | } 30 | ] 31 | 32 | pipeline = pdal.Pipeline(json.dumps(pipe)) 33 | pipeline.validate() # check if our JSON and options were good 34 | pipeline.loglevel = 8 #really noisy 35 | count = pipeline.execute() 36 | arrays = pipeline.arrays 37 | arr = pipeline.arrays[0] 38 | description = arr.dtype.descr 39 | cols = [col for col, __ in description] 40 | df = pd.DataFrame({col: arr[col] for col in cols}) 41 | 42 | return df 43 | 44 | # Plot as 3D plot, green if ground red if not. 
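# (Classification value 2 is the standard ASPRS/LAS class code for ground points, which is what
# filters.smrf assigns in pdal2df above; all other points are drawn in red.)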
45 | def plot_df(df, input_file): 46 | fig = plt.figure() 47 | ax = fig.add_subplot(111, projection='3d') 48 | df = df.sample(frac=0.05) 49 | colors=['green' if c==2 else 'red' for c in df.Classification.tolist()] 50 | ax.scatter(df.X.tolist(),df.Y.tolist(),df.Z.tolist(), c=colors) 51 | plt.savefig(input_file.replace('laz','png')) 52 | 53 | # Procesing steps for one file 54 | def process_laz(input_file): 55 | input_file = str(input_file) 56 | print(input_file) 57 | df = pdal2df(input_file) 58 | print(df) 59 | plot_df(df, input_file) 60 | 61 | # Start the script find laz files in data folder and parallelize its processing 62 | def main(): 63 | # Find laz files on local disk 64 | file_list = Path(input_dir).rglob('*.laz') 65 | 66 | ## Create a pool of workers and run the function process_laz for each filepath in the list 67 | pool = Pool(parallel_processes) 68 | pool.map(process_laz, file_list) 69 | 70 | if __name__ == '__main__': 71 | ## This part is the first to execute when script is ran. It times the execution time and rans the main function 72 | start = time.time() 73 | main() 74 | end = time.time() 75 | print("Script completed in " + str(end - start) + " seconds") -------------------------------------------------------------------------------- /pouta/README.md: -------------------------------------------------------------------------------- 1 | # Using CSC's Pouta platform for geocomputing applications 2 | A collection of instructions to setup virtual machines in [CSC's cPouta environment](https://docs.csc.fi/cloud/pouta/) for different tools: 3 | - [GeoServer or OpenDroneMap as Docker applications](./docker_geoserver_or_opendronemap) - installing other Docker applications would be very similar. 4 | - [MetaShape](./metashape_with_VNC) - installing other Desktop tools could be rather similar. 5 | - [ArcPy](./arcpy) 6 | - Earlier also **PostGIS** was recommended to be installed to cPouta, now [Pukki](https://docs.csc.fi/cloud/dbaas/) is more suitable for databases. 7 | -------------------------------------------------------------------------------- /pouta/arcpy/ArcGIS_Server_manual_installation.sh: -------------------------------------------------------------------------------- 1 | 2 | ##Installing ArcGIS for server in a cPouta instance 3 | 4 | # This is a guideline for setting up a single CentOS 7.o instance running ArcGIS for Server Enterprise 5 | 6 | #You will need an ArcGIS Linux installation package and licence that you can get from ESRI (or ask CSC personnel for a university license). 7 | 8 | ##Set up your cPouta instance 9 | 10 | # In pouta.csc.fi open Access & Security and create a security group that has a rule allowing SSH. 11 | # If you want to use your ArcGIS server with a ArcMap Desktop client you need to add rules to allow 12 | # ingress and egress for the following TCP ports: 13 | # 4000-4003, 6080(HTTP), 6443(HTTPS) 14 | # Some references about arcgis server ports: 15 | # - http://server.arcgis.com/en/server/latest/install/linux/arcgis-server-system-requirements.htm 16 | # - http://server.arcgis.com/en/server/latest/install/linux/ports-used-by-arcgis-server.htm 17 | 18 | # In pouta.csc.fi Access & Security, create a key pair and download the private key on your computer. 19 | # 20 | # Launch the instance in pouta.csc.fi with the flavor you need and set the boot source to CentOS-7. 21 | # Set the instance's keypair and security groups properly. 22 | # 23 | # In pouta.csc.fi, associate a Floating IP for the instance and establish a SSH connection with a 24 | # SSH client. 
The username is cloud-user. Use the private key of the keypair you chose for 25 | # authentication. 26 | # 27 | # Now you should be in your instance. 28 | 29 | ##### 30 | ## Prepare the installation 31 | ##### 32 | #The following command installs all the dependencies 33 | sudo yum install fontconfig mesa-libGL mesa-libGLU libXtst libXext libX11 libXi libXdmcp libXrender libXau xorg-x11-server-Xvfb libXfont -y 34 | 35 | # Get the ArcGIS installation package and unpack it. 36 | # For example if you have the file in Taito you can a command like this from the cPouta instance terminal: 37 | scp username@taito-shell.csc.fi:/homeappl/home/username/ArcGIS_for_Server_Linux_xxxx_xxxxxx.tar.gz ~/ 38 | 39 | # You need to authorize your installation with a .prvc or .ecp file. 40 | # "provision" a file from my.esri.com and save it somewhere on the virtual instance. 41 | # The provision file is quite short so you can simply copy & paste it with a text editor or 42 | # send it to the instance using echo PROVISIONFILECONTENTS > ~/_Server.prvc or whatever you 43 | # feel is the most straightforward way. 44 | 45 | # Unpack the ArcGIS installation package: 46 | 47 | tar xvzf ArcGIS_for_Server_Linux_xxxx_xxxxxx.tar.gz 48 | 49 | # ArcGIS for Server requires you to increase the maximum number of open files. 50 | # Ref: http://server.arcgis.com/en/server/10.4/install/linux/arcgis-for-server-system-requirements.htm 51 | # You can do that with: 52 | echo cloud-user - nofile 65535 | sudo tee -a /etc/security/limits.conf 53 | echo cloud-user - nproc 25059 | sudo tee -a /etc/security/limits.conf 54 | 55 | # Log in & log out for the changes to the limits to take effect. 56 | 57 | ##### 58 | ## Install ArcGIS for server 59 | ##### 60 | 61 | # Now we are ready to actually install 62 | 63 | cd ArcGISServer / 64 | ./Setup -m silent -l Yes -a /home/cloud-user/provision_file_ArcGIS_Server.prvc 65 | 66 | # The installation will take some minutes. 67 | 68 | # If you need only Python and arcpy you don't need to start the server. 69 | # Note that you have to use ArcGIS' own Python installation instead of the 70 | # default system installation. Python on ArcGIS Server for Linux runs 71 | # a Windows version of Python under Wine. 72 | 73 | # You start the ArcGIS Python console with: 74 | /home/cloud-user/arcgis/server/tools/python 75 | 76 | ##### 77 | ## Test installation with a simple ArcPy script 78 | ##### 79 | # The test_data folder includes a test elevation file dem.tif and a simple 80 | # script that makes loads some arcpy libraries and uses the FlowDirection 81 | # function (see http://pro.arcgis.com/en/pro-app/tool-reference/spatial-analyst/flow-direction.htm). 82 | # The result is store to the ./test_data/output/ directory 83 | # 84 | # Move the test_data folder to the ArcGIS Server instance 85 | # and run my_arcpy_script.py from there with: 86 | /home/cloud-user/arcgis/server/tools/python my_arcpy_script.py 87 | 88 | # You are done! 89 | -------------------------------------------------------------------------------- /pouta/arcpy/ansible_preparations.md: -------------------------------------------------------------------------------- 1 | # Ansible preparations 2 | Setting up a working Ansible environment may not be trivial, especially if you are not an experienced Linux user. 3 | 4 | Below you will find some information and hints on how to preapre a working environment for Ansible. 
5 | 6 | ## cPouta account 7 | These are the minimum requirements before you can start using example Ansible playbooks: 8 | 9 | - A pouta project with key-pairs and security groups to make connecting from your local machine possible. Instructions: https://docs.csc.fi/cloud/pouta/launch-vm-from-web-gui/ 10 | 11 | - cPouta project's API access file, see [Configure your terminal environment for OpenStack](https://docs.csc.fi/cloud/pouta/install-client/#configure-your-terminal-environment-for-openstack) 12 | 13 | ## Computer environment 14 | 15 | It is recommended to use a computer with a Linux operating system. Most of the instructions you will find here assume that you are working with a Linux computer. Note, that you can create a Linux virtual machine in cPouta and install the necessary tools and settings into it as necessary OR use Windows Linux Subsystem. 16 | 17 | ### Ansible tools 18 | 19 | You need to have an environment with the necessary tools to run an ansible script: python, [openstack-client](https://docs.csc.fi/cloud/pouta/install-client), ansible and shade. 20 | 21 | 22 | ### Setting up automatic access to keypairs and servers 23 | 24 | In order for the ansible scripts to run smoothly, you will need to make sure that 25 | the processes don't need interaction from the user and that the necessary keypairs 26 | are loaded. 27 | 28 | Some hints that may help: 29 | - make sure that the key pair Ansible is using to contact the remote server is 30 | available. For example with: 31 | ````bash 32 | # Start a ssh agent to automatically manage your keypairs 33 | eval $(ssh-agent -s) 34 | # Add your keypair to the ssh agent 35 | ssh-add ~/.ssh/your_private_key.pem 36 | # this private key is the one corresponding to the key pair name specified in 37 | # the Ansible script. 38 | ```` 39 | - by default, new remote connections have to be confirmed manually, which would interrupt the workflow of an Ansible script. To avoid new servers' fingerprint interactive checks, set the following environment variable: 40 | ````bash 41 | export ANSIBLE_HOST_KEY_CHECKING=False 42 | ```` 43 | -------------------------------------------------------------------------------- /pouta/arcpy/ansible_run_arcpy.yml: -------------------------------------------------------------------------------- 1 | # Ansible demo for reusing an existing ArcGIS Server volume 2 | # (see ansible_install_arcpy.yml playbook) to restart a remote virtual machine 3 | # on CSC's cPouta and running simple ArcPy script. 4 | # 5 | # You'll need to have your Ansible environment properly setup and modify 6 | # the Ansible variables to your own: 7 | # - NAME_OF_YOUR_KEY: the name of the key pair as seen in your cPouta project 8 | # - NAME_OF_YOUR_SECURITY_GROUP: the name of a security group as seen in your cPouta project 9 | # 10 | # This Ansible script assumes that you have the "test_data" folder in the same 11 | # directory where this Ansible script is. This is how those are defined in the code: 12 | # test_data_dir: test_data 13 | # demo_script: my_arcpy_script.py 14 | # 15 | # The script copies the "test_data" folder to the remote virtual machine, 16 | # then executes the example "my_arcpy_script.py" Python script and copies the 17 | # results to the local machine to the subfolder "test_data/results". 18 | #Then the remote "test_data" folder is removed and the virtual machine is deleted. 19 | --- 20 | - name: Create virtual machine from existing ArcGIS Server on cPouta 21 | hosts: localhost # The OpenStack modules run on your local machine. 
22 | connection: local 23 | vars: 24 | demo_key: NAME_OF_YOUR_KEY 25 | demo_sg: NAME_OF_YOUR_SECURITY_GROUP 26 | demo_security_groups: default, {{ demo_sg }} 27 | arcgis_server_vol: arcpy-volume 28 | demo_instance: test-arcpy 29 | cpouta_flavor: standard.tiny 30 | 31 | tasks: 32 | - name: Create a virtual machine from existing ArcGIS Server volume 33 | register: result 34 | os_server: 35 | name: "{{ demo_instance }}" 36 | flavor: "{{ cpouta_flavor }}" 37 | key_name: "{{ demo_key }}" 38 | security_groups: "{{ demo_security_groups }}" 39 | boot_volume: "{{ arcgis_server_vol }}" 40 | 41 | - name: Add new host to inventory 42 | add_host: name={{ result.server.public_v4 }} groups=arcpy_nodes 43 | 44 | - name: clear ssh known_hosts 45 | known_hosts: name={{ result.server.public_v4 }} state=absent 46 | when: result | changed 47 | 48 | - name: Wait for instance to be ready 49 | wait_for: host={{ result.server.public_v4 }} port=22 search_regex=OpenSSH delay=3 50 | 51 | - name: Run example script in cPouta instance 52 | hosts: arcpy_nodes 53 | remote_user: cloud-user 54 | vars: 55 | test_data_dir: test_data 56 | demo_script: my_arcpy_script.py 57 | 58 | tasks: 59 | - synchronize: 60 | mode: push 61 | src: ./{{ test_data_dir }}/ 62 | dest: ~/{{ test_data_dir }}/ 63 | 64 | - name: run ArcPy script 65 | shell: ~/arcgis/server/tools/python ~/{{ test_data_dir }}/{{ demo_script }} 66 | 67 | - synchronize: 68 | mode: pull 69 | src: ~/{{ test_data_dir }}/ 70 | dest: ./{{ test_data_dir }}/results 71 | 72 | - name: Delete remote directory 73 | file: 74 | state: absent 75 | path: ~/{{ test_data_dir }}/ 76 | 77 | - name: Destroy the computing ArcGIS Server instance (the ArcGIS Server volume remains) 78 | hosts: localhost 79 | vars: 80 | demo_instance: test-arcpy 81 | 82 | tasks: 83 | - name: Destroy a VM 84 | os_server: 85 | name: "{{ demo_instance }}" 86 | state: absent 87 | ... 
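# Usage sketch: with the cPouta API access (openrc) file sourced and the key pair loaded into ssh-agent
# (see ansible_preparations.md), this playbook can be run with: ansible-playbook ansible_run_arcpy.yml
# (the variable values above, e.g. NAME_OF_YOUR_KEY, still need to be filled in first).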
88 | -------------------------------------------------------------------------------- /pouta/arcpy/test_data/my_arcpy_script.py: -------------------------------------------------------------------------------- 1 | import arcpy 2 | from arcpy.sa import * 3 | import os 4 | 5 | arcpy.env.overwriteOutput = True 6 | 7 | directory = "./output/" 8 | 9 | if not os.path.exists(directory): 10 | os.makedirs(directory) 11 | 12 | outFlowDirection = FlowDirection("./dem.tif", "NORMAL") 13 | outFlowDirection.save(directory+"flowdir.tif") 14 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/ansible.cfg: -------------------------------------------------------------------------------- 1 | [ssh_connection] 2 | ssh_args = -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=10 -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/group_vars/all.yml: -------------------------------------------------------------------------------- 1 | --- 2 | key_name: kylli_pouta_2024 3 | os_image: "Ubuntu-22.04" 4 | instance_flavor: "standard.small" # This will affect your billing, select one suitable for you https://docs.csc.fi/cloud/pouta/vm-flavors-and-billing/#cpouta-flavors 5 | instance_name: "ubuntu-docker-kylli" 6 | 7 | internal_ips: # Please change this to your own, you can use https://apps.csc.fi/myip to check your IP 8 | - 0.0.0.0/0 9 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/install-geoserver.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Deploy VM 4 | hosts: localhost 5 | collections: 6 | - openstack.cloud 7 | vars: 8 | installed_packages: "geoserver" 9 | 10 | roles: 11 | - openstack 12 | 13 | - name: Install Docker and GeoServer 14 | hosts: created_instances 15 | collections: 16 | - openstack.cloud 17 | vars: 18 | geoserver_version: "2.25.2" # If you want to install a different version of GeoServer, change this 19 | geoserver_datadir: "/geoserver_data" 20 | 21 | roles: 22 | - docker 23 | - geoserver 24 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/install-odm.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Deploy VM 4 | hosts: localhost 5 | collections: 6 | - openstack.cloud 7 | vars: 8 | installed_packages: "opendronemap" 9 | 10 | roles: 11 | - openstack 12 | 13 | - name: Install Docker and OpenDroneMap 14 | hosts: created_instances 15 | gather_facts: false 16 | collections: 17 | - openstack.cloud 18 | vars: 19 | images_dir: "/data/images" 20 | docker_name: "odm" 21 | 22 | roles: 23 | - docker 24 | - opendronemap -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/requirements.yml: -------------------------------------------------------------------------------- 1 | --- 2 | collections: 3 | - name: openstack.cloud 4 | - name: community.docker 5 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/roles/docker/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # This tutorial is for Ubuntu 4 | - name: Create Docker group 5 | become: true 6 | group: 7 | name: docker 8 | 
state: present 9 | 10 | - name: Add user to Docker group 11 | become: true 12 | user: 13 | name: ubuntu 14 | groups: docker 15 | append: yes 16 | 17 | - name: Update all packages 18 | become: true 19 | apt: 20 | name: "*" 21 | update_cache: yes 22 | state: latest 23 | 24 | - name: Add signing key 25 | become: true 26 | apt_key: 27 | url: "https://download.docker.com/linux/ubuntu/gpg" 28 | state: present 29 | 30 | - name: Add Docker repository into sources list 31 | become: true 32 | apt_repository: 33 | repo: deb https://download.docker.com/linux/ubuntu/ jammy stable 34 | state: present 35 | 36 | - name: Install Docker 37 | become: true 38 | apt: 39 | name: 40 | - docker-ce 41 | - docker-ce-cli 42 | - containerd.io 43 | - docker-compose-plugin 44 | state: latest 45 | update_cache: True 46 | 47 | - name: Start Docker 48 | service: 49 | name: docker 50 | enabled: True 51 | state: started -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/roles/geoserver/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Get GeoServer container info 4 | community.docker.docker_container_info: 5 | name: geoserver 6 | register: geoserver_status 7 | 8 | - name: Pull GeoServer image 9 | community.docker.docker_image: 10 | name: docker.osgeo.org/geoserver:{{ geoserver_version }} 11 | source: pull 12 | pull: 13 | platform: amd64 14 | when: not geoserver_status.exists 15 | 16 | - name: Create Geoserver data directory 17 | file: 18 | path: "{{geoserver_datadir}}" 19 | state: directory 20 | mode: '0770' 21 | owner: ubuntu 22 | become: yes 23 | 24 | - name: Start GeoServer 25 | community.docker.docker_container: 26 | name: geoserver 27 | image: docker.osgeo.org/geoserver:{{ geoserver_version }} 28 | state: started 29 | restart: true 30 | ports: 31 | - "8080:8080" 32 | mounts: 33 | - type: "bind" 34 | target: /opt/geoserver_data 35 | source: "{{geoserver_datadir}}" 36 | env: # These are only needed when using extensions, here ysld is used as an example 37 | INSTALL_EXTENSIONS: "true" 38 | STABLE_EXTENSIONS: "ysld" 39 | # If you want to use Community modules, add COMMUNITY_EXTENSIONS followed by the modules you want to use 40 | # e.g. 
COMMUNITY_EXTENSIONS: "ogcapi-features,ogcapi-images" 41 | 42 | - name: GeoServer info 43 | debug: 44 | msg: 45 | - "You now have GeoServer available at: http://{{ hostvars['localhost']['server_facts']['servers'][0]['access_ipv4'] }}:8080/geoserver/" 46 | - "To login in to the virtual machine, connect with: ssh ubuntu@{{ hostvars['localhost']['server_facts']['servers'][0]['access_ipv4'] }}" 47 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/roles/opendronemap/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Get OpenDroneMap info 4 | community.docker.docker_container_info: 5 | name: "{{ docker_name }}" 6 | register: opendronemap_status 7 | 8 | - name: Create Geoserver data directory 9 | file: 10 | path: "{{images_dir}}" 11 | state: directory 12 | mode: '0770' 13 | owner: ubuntu 14 | become: yes 15 | 16 | - name: Pull OpenDroneMap Image 17 | community.docker.docker_image: 18 | name: opendronemap/odm 19 | source: pull 20 | pull: 21 | platform: amd64 22 | when: not opendronemap_status.exists 23 | 24 | - name: OpenDroneMap info 25 | debug: 26 | msg: 27 | - "To login in to the virtual machine, connect with: ssh ubuntu@{{ hostvars['localhost']['server_facts']['servers'][0]['access_ipv4'] }}" 28 | -------------------------------------------------------------------------------- /pouta/docker_geoserver_or_opendronemap/roles/openstack/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - set_fact: 4 | security_group: "{{ instance_name }}-security-group" 5 | 6 | - name: Create security group 7 | openstack.cloud.security_group: 8 | state: present 9 | name: "{{ security_group }}" 10 | description: "Security group for {{ instance_name }}" 11 | 12 | - name: Add port 22 opening to ips {{ internal_ips }} to rule {{ instance_name }} 13 | openstack.cloud.security_group_rule: 14 | state: present 15 | security_group: "{{ security_group }}" 16 | protocol: tcp 17 | port_range_min: 22 18 | port_range_max: 22 19 | remote_ip_prefix: "{{ item }}" 20 | with_items: 21 | - "{{ internal_ips }}" 22 | 23 | - name: Add port 5432 opening to ips {{ internal_ips }} to rule {{ instance_name }} 24 | openstack.cloud.security_group_rule: 25 | state: present 26 | security_group: "{{ instance_name }}-security-group" 27 | protocol: tcp 28 | port_range_min: 5432 29 | port_range_max: 5432 30 | remote_ip_prefix: "{{ item }}" 31 | with_items: 32 | - "{{ internal_ips }}" 33 | when: installed_packages == "postgis" 34 | 35 | - name: Add port 8080 opening to ips {{ internal_ips }} to rule {{ instance_name }} 36 | openstack.cloud.security_group_rule: 37 | state: present 38 | security_group: "{{ instance_name }}-security-group" 39 | protocol: tcp 40 | port_range_min: 8080 41 | port_range_max: 8080 42 | remote_ip_prefix: "0.0.0.0/0" 43 | when: installed_packages == "geoserver" 44 | 45 | - name: Add port 8082 opening to ips {{ internal_ips }} to rule {{ instance_name }} 46 | openstack.cloud.security_group_rule: 47 | state: present 48 | security_group: "{{ instance_name }}-security-group" 49 | protocol: tcp 50 | port_range_min: 8082 51 | port_range_max: 8082 52 | remote_ip_prefix: "0.0.0.0/0" 53 | when: installed_packages == "geoserver" 54 | 55 | - name: Create instance 56 | openstack.cloud.server: 57 | name: "{{ instance_name }}" 58 | state: present 59 | key_name: "{{ key_name }}" 60 | image: "{{ os_image }}" 61 | flavor: "{{ instance_flavor 
}}" 62 | security_groups: "default,{{security_group}}" 63 | metadata: 64 | group: "created_instances" 65 | register: servers 66 | 67 | - name: Acquire Floating IP 68 | openstack.cloud.floating_ip: 69 | server: "{{ item }}" 70 | network: "public" 71 | reuse: true 72 | wait: true 73 | timeout: 60 74 | delay: 1 75 | retries: 3 76 | async: 60 77 | poll: 0 78 | with_items: 79 | - "{{ servers.server }}" 80 | 81 | - name: Register openstack servers facts 82 | openstack.cloud.server_info: 83 | name: "{{ instance_name }}" 84 | register: server_facts 85 | 86 | - name: Add hosts to inventory 87 | add_host: 88 | hostname: "{{ item.name }}" 89 | group: "{{ item.metadata.group }}" 90 | ansible_host: "{{ item.access_ipv4 }}" 91 | ansible_user: ubuntu 92 | with_items: 93 | - "{{ server_facts.servers }}" 94 | 95 | - name: Wait for SSH connection 96 | wait_for: 97 | host: "{{ item.access_ipv4 }}" 98 | state: started 99 | port: 22 100 | delay: 0 101 | with_items: 102 | - "{{ server_facts.servers }}" 103 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # CSC geocomputing Python examples 2 | 3 | * [Puhti](./puhti/README.md) - serial/array/parallel processing with Python. How to parallelize your Python code with different methods for running in Puhti supercomputer. 4 | * [Working with Allas data from Python](./allas). Examples with S3 and Swift APIs 5 | * [Reading NLS topographic database geopackage with Python](./geopackage/README.md) 6 | * [GRASS multiprocessing from Python](./grass_multiprocessing_with_python/README.md) 7 | * [Routing](./routing/readme.md) Examples using NetworkX and igraph, serial and parallel. 8 | * [Sentinel data download from Finhub using sentinelsat](./sentinel/README.md) 9 | * [STAC, xarray and dask for downloading and processing data](./STAC/stac_xarray_dask_example.ipynb) 10 | * [Zonal statistics in parallel](./zonal_stats/README.md) using rasterstats, serial and parallel. 11 | * [Python Dask-geopandas](./dask_geopandas/README.md) using Dask-geopandas in spatial analysis. 12 | 13 | 14 | > **_NOTE:_** If you are using [Jupyter lab](https://jupyter.org/) on your own computer, the [Jupyter-github](https://github.com/jupyterlab/jupyterlab-github) extension provides you with the possibility to browse public github repositories within Jupyter. Install the extension, click on the little github/cat icon in the left bar and fill `csc-training/geocomputing` into the search field and press enter. This lets you open and run all python files and notebooks within this repository on your own computer. 15 | -------------------------------------------------------------------------------- /python/STAC/csc_stac_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # This example shows how to use STAC, dask and xarray via Python script. 5 | # If new to STAC, see the STAC_CSC_example.ipynb for longer explanation how STAC works. 6 | # In this example, we will search and download data through a STAC Catalog and process it using Dask and Xarray. 7 | # We will use Sentinel-1 data stored at FMI to compute a mean value of vv_mean for one month. The result will be saved to a new GeoTiff file. 
8 | 9 | import requests 10 | import stackstac 11 | from dask.distributed import Client, Lock 12 | import pystac_client 13 | import rioxarray 14 | import sys 15 | import os 16 | 17 | # Settings 18 | STAC_URL = "https://paituli.csc.fi/geoserver/ogc/stac/v1" 19 | collection = 'sentinel_1_11_days_mosaics_at_fmi' 20 | time_filter="2021-03-01/2021-03-31" 21 | asset = 'mean_vv' 22 | output_file = os.path.join(os.getcwd(), "sentinel1_mean_vv.tif") 23 | 24 | # Use as many workers as you have available cores 25 | no_of_workers = len(os.sched_getaffinity(0)) 26 | 27 | def find_items_from_stac(): 28 | catalog = pystac_client.Client.open(STAC_URL) 29 | search_bbox = catalog.search( 30 | collections=[collection], 31 | datetime=time_filter 32 | ) 33 | return search_bbox.item_collection() 34 | 35 | def main(): 36 | 37 | # Create Dask client 38 | # Because STAC+xarray analysis is usually slowed down by data download speed, then it is good to use 1 core per worker. 39 | # If you have computationally heavy analysis, this could be changed to several cores per worker. 40 | client = Client(n_workers=no_of_workers) 41 | 42 | item_collection = find_items_from_stac() 43 | 44 | # Use the `stackstac` library to convert item collection to Xarray DataArray. 45 | cube = stackstac.stack( 46 | items=item_collection, 47 | assets=[asset], 48 | #chunksize=(-1,1,2046,2046), 49 | epsg=3067 50 | ).squeeze() 51 | 52 | # Create new data cube for the mean value. 53 | mean = cube.mean("time", keep_attrs=True) 54 | 55 | # Compute and save the result 56 | mean_ndvi_tiff = mean.rio.to_raster( 57 | output_file, 58 | lock=Lock(name="rio", client=client), 59 | tiled=True, 60 | ) 61 | 62 | # Close Dask cluster 63 | client.close() 64 | 65 | # With Dask, it is important to use the main function 66 | if __name__ == "__main__": 67 | main() 68 | print("Analysis ready") 69 | -------------------------------------------------------------------------------- /python/STAC/csc_stac_example_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_2000599 # Choose the project to be billed 3 | #SBATCH --time=00:20:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 5 | #SBATCH --cpus-per-task=10 # How many processors work on one task. Upper limit depends on number of CPUs per node. 6 | #SBATCH --mem-per-cpu=10G # Minimum memory required per allocated CPU. Default units are megabytes. 7 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | 9 | # Load the geoconda module which has Python with Dask, Xarray and STAC libraries 10 | module load geoconda 11 | 12 | # Run the Python code. 
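# (The whole batch job is submitted from the command line with: sbatch csc_stac_example_batch_job.sh)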
13 | python csc_stac_example.py 14 | -------------------------------------------------------------------------------- /python/STAC/environment.yml: -------------------------------------------------------------------------------- 1 | name: stac 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - gdal 7 | - geopandas 8 | - dask 9 | - jupyterlab 10 | - pyproj 11 | - pystac-client 12 | - pystac 13 | - requests 14 | - rioxarray 15 | - stackstac 16 | - libgdal-jp2openjpeg 17 | - python-graphviz 18 | - jupyter-resource-usage 19 | - dask-labextension 20 | -------------------------------------------------------------------------------- /python/STAC/img/DEM_data_source_cpu_walltime.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/python/STAC/img/DEM_data_source_cpu_walltime.gif -------------------------------------------------------------------------------- /python/STAC/img/DEM_tile_size_cpu_walltime.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/python/STAC/img/DEM_tile_size_cpu_walltime.gif -------------------------------------------------------------------------------- /python/STAC/img/S1_data_source_cpu_walltime.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/python/STAC/img/S1_data_source_cpu_walltime.gif -------------------------------------------------------------------------------- /python/STAC/img/S1_tile_size_cpu_walltime.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/geocomputing/cd56b0fa44c743fc2f6a65d50e053f787b033fc8/python/STAC/img/S1_tile_size_cpu_walltime.gif -------------------------------------------------------------------------------- /python/allas/working_with_allas_from_Python_S3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Dec 4 13:57:52 2019 5 | @author: ekkylli 6 | Updated 7.7.2025 7 | """ 8 | 9 | # Example script for using Allas directly from an Python script: 10 | # - Reading raster and vector files 11 | # - Writing raster and vector files 12 | # - Looping over all files of certain type in a bucket 13 | 14 | # Please notice that this example works ONLY with GDAL-based libraries for spatial data: rasterio, geopandas etc. 15 | 16 | # The required packages depend on the task 17 | # For working with rasters 18 | import rasterio 19 | # For working with vectors 20 | import geopandas as gpd 21 | # For listing files and writing to Allas 22 | import boto3 23 | import os 24 | 25 | 26 | # Before starting to use S3, set up your credentials and endpoint. 27 | # This example here applies for using Allas from CSC Puhti or Mahti supercomputers. 28 | # To use some other S3 stroage or from some other computer, 29 | # See https://docs.csc.fi/support/tutorials/gis/gdal_cloud/#s3-connection-details 30 | # 31 | # 1) Set up your credentials to Allas: 32 | # module load allas 33 | # allas-conf --mode s3cmd 34 | # This is needed only once, as long as you are using the same CSC project. 
35 | # This also sets S3 endopoint to .aws/config file in a way understandable for boto3 library, but not for GDAL. 36 | # 37 | # 2) Set S3-endpoint for GDAL-library: 38 | # module load allas 39 | # OR 40 | os.environ["AWS_S3_ENDPOINT"] = "a3s.fi" 41 | # This sets AWS_S3_ENDPOINT environment variable to "a3s.fi". 42 | # Environment variables are cleaned after session end, so it must be set again in each new session. 43 | 44 | # If you want to WRITE files with rasterio/geopandas directly to Allas, set also this. 45 | os.environ["CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE"] = "YES" 46 | 47 | # Reading raster file 48 | r = rasterio.open('/vsis3/name_of_your_Allas_bucket/name_of_your_input_raster_file.tif') 49 | input_data = r.read() 50 | 51 | # Writing raster file 52 | with rasterio.open('/vsis3/name_of_your_Allas_bucket/name_of_your_output_raster_file.tif', 'w', **r.profile) as dst: 53 | dst.write(input_data) 54 | 55 | # Reading vector file 56 | v = gpd.read_file('/vsis3/name_of_your_Allas_bucket/name_of_your_input_vector_file.gpkg') 57 | 58 | # Writing vector file 59 | v.to_file('/vsis3/name_of_your_Allas_bucket/name_of_your_output_vector_file.gpkg', layer='layername', driver="GPKG") 60 | 61 | # Looping through all files in a bucket, find ones that are tifs. 62 | # Then print the extent of each file as example. 63 | 64 | # Create connection to S3 storage 65 | os.environ["AWS_REQUEST_CHECKSUM_CALCULATION"] = "when_required" 66 | os.environ["AWS_RESPONSE_CHECKSUM_VALIDATION"] = "when_required" 67 | s3_resource = boto3.resource('s3') 68 | 69 | # By default boto3 is connecting to Amazon S3, to use custom endpoint, define it in .aws/config file as done by allas-conf --mode s3cmd 70 | # OR define it in the Python code 71 | # s3_resource = boto3.resource("s3", endpoint_url='https://a3s.fi') 72 | 73 | my_bucket = s3_resource.Bucket('name_of_your_Allas_bucket') 74 | 75 | for my_bucket_object in my_bucket.objects.all(): 76 | if (my_bucket_object.key.endswith('.tif')): 77 | filePath = '/vsis3/gis-int2/' + my_bucket_object.key 78 | print(filePath) 79 | r = rasterio.open(filePath) 80 | print(r.bounds) 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /python/allas/working_with_allas_from_Python_Swift.py: -------------------------------------------------------------------------------- 1 | import swiftclient 2 | import rasterio 3 | import geopandas as gpd 4 | from rasterio.io import MemoryFile 5 | import tempfile 6 | import os 7 | 8 | """ 9 | Example script for using Allas directly from a Python script with swift library 10 | Created on 27.01.2020 by Johannes Nyman 11 | """ 12 | 13 | ### 1. 
Establishing the Swift connection to Allas 14 | 15 | # You need to run the following commands in Puhti to get the authentication to Allas active 16 | 17 | """ 18 | module load allas 19 | allas-conf 20 | """ 21 | 22 | # These exist after running allas-conf 23 | _authurl = os.environ['OS_STORAGE_URL'] 24 | _auth_token = os.environ['OS_AUTH_TOKEN'] 25 | _project_name = os.environ['OS_PROJECT_NAME'] 26 | _user = os.environ['OS_USERNAME'] 27 | 28 | 29 | # Various settings for connecting to Puhti 30 | _auth_version = '3' 31 | _os_options = { 32 | 'user_domain_name': 'Default', 33 | 'project_domain_name': 'Default', 34 | 'project_name': _project_name 35 | } 36 | 37 | # Creating the connection client 38 | conn = swiftclient.Connection( 39 | user=_user, 40 | preauthurl=_authurl, 41 | preauthtoken=_auth_token, 42 | os_options=_os_options, 43 | auth_version=_auth_version 44 | ) 45 | 46 | ### 1. Download a file from Allas to local filesystem 47 | obj = '' 48 | container = '' 49 | file_output = '' 50 | headers, raster = conn.get_object(container, obj) 51 | with open(file_output, 'bw') as f: 52 | f.write(raster) 53 | 54 | ### 2. Writing a raster file to Allas using the Swift library 55 | fp = "" 56 | bucket_name = '' 57 | raster = rasterio.open(fp) 58 | input_data = raster.read() 59 | 60 | # The file is written to memory first and then uploaded to Allas 61 | with MemoryFile() as mem_file: 62 | with mem_file.open(**raster.profile) as dataset: 63 | dataset.write(input_data) 64 | conn.put_object(bucket_name, os.path.basename(fp), contents=mem_file) 65 | 66 | 67 | ### 3. Writing a vector file to Allas using the Swift library 68 | fp = "" 69 | bucket_name = '' 70 | vector = gpd.read_file(fp) 71 | 72 | # The file is written to memory first and then uploaded to Allas 73 | tmp = tempfile.NamedTemporaryFile() 74 | vector.to_file(tmp, layer='test', driver="GPKG") 75 | tmp.seek(0) # Moving pointer to the beginning of temp file. 76 | conn.put_object(bucket_name, os.path.basename(fp) ,contents=tmp) 77 | 78 | 79 | ### 5. Looping through buckets and files inside your project 80 | resp_headers, containers = conn.get_account() 81 | for container in containers: 82 | print(container['name']) 83 | for data in conn.get_container(container['name'])[1]: 84 | print("\t" + container['name'] + "/" + data['name']) 85 | 86 | -------------------------------------------------------------------------------- /python/dask_geopandas/README.md: -------------------------------------------------------------------------------- 1 | # Dask geopandas example 2 | > Dask-GeoPandas is a project merging the geospatial capabilities of GeoPandas and the scalability of Dask. GeoPandas is an open source project designed to make working with geospatial data in Python easier. GeoPandas extends the datatypes used by pandas to allow spatial operations on geometric types. Dask provides advanced parallelism and distributed out-of-core computation with a dask.dataframe module designed to scale pandas. 3 | 4 | In general, one can work with Dask-GeoDataFrames as they are regular GeoDataFrames. A good approach would be to start solving a problem using plain GeoPandas, because for small data problems, Dask-GeoPandas generates a significant overhead. Only after one would run into memory or performance issues with GeoPandas, they should switch to Dask-GeoPandas with one partition having less than 1GB of data in it. 5 | 6 | Unfortunately, Dask-GeoPandas provides only a limited number of operations. 
Before using dask-geopandas, check if the method that you need is available in [Dask-GeoPandas](https://dask-geopandas.readthedocs.io/en/stable/api.html). 7 | 8 | In this example, we will use Finnish addresses (osoitteet), and based on post code data, we will assign each address its post code. To do that, we will load two shapefiles into GeoDataFrames and perform a spatial join. In the end, we compare the execution times of both dask-geopandas and plain geopandas. 9 | 10 | To launch this notebook in Puhti, you need JupyterLab with at least 5GB of memory and 4 cores. 11 | 12 | ### Documentation 13 | - [Dask-geopandas documentation](https://dask-geopandas.readthedocs.io/en/stable/) 14 | - [CSC Dask tutorial](https://docs.csc.fi/support/tutorials/dask-python/) 15 | - [Jupyter in Puhti supercomputer](https://docs.csc.fi/computing/webinterface/jupyter/) 16 | - [Dask batch jobs with Puhti](https://github.com/csc-training/geocomputing/tree/master/python/puhti/05_parallel_dask) 17 | -------------------------------------------------------------------------------- /python/geopackage/README.md: -------------------------------------------------------------------------------- 1 | ## Reading NLS topographic database geopackage with Python 2 | The NLS topographic database has been saved into several geopackage files in Puhti at /appl/data/geo/mml/maastotietokanta/20XX/gpkg. The larger layers are in their own gpkg files and the smaller layers have been bundled into a single file. Reading the large layers in full takes some time, but if the whole layer is not needed, Geopandas can be used to read only the desired parts of the files. The examples can be found in the [read_gpkg.py](read_gpkg.py) script. 3 | 4 | In Puhti the [geoconda](https://docs.csc.fi/apps/geoconda/) module can be used. 5 | 6 | Similar examples for reading the geopackage with R using the sf package can be found [here](https://github.com/csc-training/geocomputing/tree/master/R/geopackage). 7 | -------------------------------------------------------------------------------- /python/geopackage/list_layers_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Jun 15 08:35:45 2018 5 | 6 | @author: ekkylli 7 | 8 | This file lists the layers of a GeoPackage, the number of features in each layer and their type.
9 | """ 10 | 11 | from osgeo import ogr 12 | 13 | data = ogr.Open('/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-vakavesi_20-02-06.gpkg') 14 | 15 | print('Data Name:', data.GetName()) 16 | 17 | # get a layer with GetLayer('layername'/layerindex) 18 | for layer in data: 19 | print('Layer Name:', layer.GetName()) 20 | print('Layer Feature Count:', len(layer)) 21 | 22 | layer_defn = layer.GetLayerDefn() 23 | for i in range(layer_defn.GetGeomFieldCount()): 24 | # some times the name doesn't appear 25 | # but the type codes are well defined 26 | print(layer_defn.GetGeomFieldDefn(i).GetName(), layer_defn.GetGeomFieldDefn(i).GetType()) 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /python/geopackage/make_each_layer_a_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 19 15:38:52 2018 5 | 6 | @author: ekkylli 7 | 8 | Code for saving all layers of GeoPackage as separate files 9 | """ 10 | import os 11 | from osgeo import gdal, ogr 12 | 13 | #OutputFolder 14 | outFolder='layers' 15 | 16 | #Check that the folder exists 17 | if not os.path.exists(outFolder): 18 | os.makedirs(outFolder) 19 | 20 | #Make error messages visible 21 | gdal.UseExceptions() #Fail when can't open! 22 | def gdal_error_handler(err_class, err_num, err_msg): 23 | errtype = { 24 | gdal.CE_None:'None', 25 | gdal.CE_Debug:'Debug', 26 | gdal.CE_Warning:'Warning', 27 | gdal.CE_Failure:'Failure', 28 | gdal.CE_Fatal:'Fatal' 29 | } 30 | err_msg = err_msg.replace('\n',' ') 31 | err_class = errtype.get(err_class, 'None') 32 | print ('Error Number: %s' % (err_num)) 33 | print ('Error Type: %s' % (err_class)) 34 | print ('Error Message: %s' % (err_msg)) 35 | 36 | #Enable error handler USE THIS FIRST TO SEE THE ERRORS, remove late for faster throughput 37 | #It seems that some field width warnings are given when actual data is ok. 38 | #gdal.PushErrorHandler(gdal_error_handler) 39 | 40 | #Disable error handler 41 | #gdal.PopErrorHandler() 42 | 43 | # Note, the original GeoPackage is opened with both ogr and gdal. 44 | # TODO, it might not be necessary actually 45 | ogrDS = ogr.Open('/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-vakavesi_20-02-06.gpkg') 46 | gdalDS = gdal.OpenEx('/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-vakavesi_20-02-06.gpkg', gdal.OF_VECTOR) 47 | 48 | # get a layer with GetLayer('layername'/layerindex) 49 | for layer in ogrDS: 50 | 51 | # Generate the name for new file 52 | layerName = layer.GetName() 53 | print("Saving layer " + layerName) 54 | outFile=os.path.join(outFolder,layerName +'.gpkg') 55 | 56 | # Remove output shapefile if it already exists 57 | outDriver = ogr.GetDriverByName('GPKG') 58 | if os.path.exists(outFile): 59 | outDriver.DeleteDataSource(outFile) 60 | 61 | #Save file with gdal, only one layer per file 62 | ds1 = gdal.VectorTranslate(outFile, gdalDS, layers = [layerName] , format = 'GPKG') 63 | #Important, this is the way to save the file! 64 | del ds1 65 | -------------------------------------------------------------------------------- /python/geopackage/read_gpkg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | Examples for reading data from NLS geopackage with geopandas, fiona and sqlite3. 4 | The geopackges are rather big, so reading the whole file might not be optimal. 
5 | We can however read parts of it quickly without having to inspect each row as shown in examples below: 6 | """ 7 | import geopandas as gpd 8 | import fiona 9 | fn_muut= "/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-muut_20-02-06.gpkg" 10 | fn_suo = "/appl/data/geo/mml/maastotietokanta/2020/gpkg/MTK-suo_20-02-06.gpkg" 11 | 12 | """ 13 | Reading a layer into a dataframe. Some layers are large, but for smaller layers this can be quick enough. 14 | """ 15 | def read_whole_layer(): 16 | df=gpd.read_file(fn_muut, layer="hylky") 17 | print("Hylky:\n",df.head()) 18 | 19 | """ 20 | Reading an area specified by a boundingbox from a single layer into a dataframe. 21 | Geopandas takes advantage of geopackage's spatial indexing and this is a fast operation even on large layers. 22 | For line and polygon geometries all features that at least intersect bounding box are selected. 23 | """ 24 | def read_area(): 25 | 26 | bb=(374692, 6671989, 379750, 6676677) 27 | df=gpd.read_file(fn_suo, layer="suo", bbox=bb) 28 | print("\n\nSuo:\n",df.head()) 29 | 30 | 31 | """ 32 | Reading rows in range 10-20. Again only the rows that we want will be read. 33 | """ 34 | def read_rows_in_range(): 35 | c = fiona.open(fn_suo,layer="suo") 36 | start=10 37 | end=20 38 | df=gpd.GeoDataFrame.from_features(c[start:end]) 39 | print(df) 40 | 41 | """ 42 | Reading specific rows. As above but for specific row numbers rather than a range of rows. 43 | """ 44 | def read_specific_rows(): 45 | with fiona.open(fn_suo,layer="suo") as c: 46 | rows = (1,5,100) 47 | df=gpd.GeoDataFrame.from_features([c[i] for i in rows]) 48 | print(df) 49 | 50 | """ 51 | Reading rows where an attribute has a certain value (or based on any SQL query). 52 | Fiona and thus geopandas don't support reading only specifc rows based on an attribute. However if you really need to be able to do this fast you can do it by first creating an index for the column you want to use and then using sqlite to get numbers of rows you need. After this you can create dataframe as above. This method can of course be used to run any SQL query to first select IDs of the rows that we want before reading the data into memory. Weather these queries need to actually inspect each row or if faster execution is possible depends on the query and the indexes available. The main benefit here is that you can take advantage of additional indexes and you don't need to first read all the rows into geopandas dataframe. 53 | Geopandas also specifies read_postgis() method that you can use to accomplish the same end result, but using this with geopackage creates a need for some geometry type conversions that can be problematic. 
54 | """ 55 | 56 | import sqlite3 57 | def create_index(): 58 | table="suo" 59 | col="mtk_id" 60 | conn = sqlite3.connect(fn) 61 | c = conn.cursor() 62 | sql="CREATE INDEX index_{}_{} ON {} ({})".format(table, col, table, col) 63 | c.execute(sql) 64 | conn.commit() 65 | conn.close() 66 | 67 | def read_by_attribute(): 68 | conn = sqlite3.connect(fn) 69 | c = conn.cursor() 70 | layer="suo" 71 | attribute_col="mtk_id" 72 | attribute_val=219920480 73 | id_col = 'fid' 74 | sql="select {} from {} where {}={}".format(id_col, layer, attribute_col, attribute_val) 75 | c.execute(sql) 76 | rows = c.fetchall() 77 | rows = [r[0] for r in rows] 78 | print(rows) 79 | with fiona.open(fn,layer="suo") as c: 80 | df=gpd.GeoDataFrame.from_features([c[i] for i in rows]) 81 | print(df) 82 | 83 | if __name__=='__main__': 84 | read_whole_layer() 85 | read_specific_rows() 86 | read_rows_in_range() 87 | read_area() 88 | 89 | -------------------------------------------------------------------------------- /python/puhti/00_interactive/interactive_single_core_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example Python script how to calculate NDVI for one Sentinel satellite images 3 | using just 1 process. 4 | 5 | Author: Johannes Nyman, Kylli Ek, Samantha Wittke, Elias Annila CSC 6 | 7 | """ 8 | import os 9 | import sys 10 | import time 11 | import rasterio 12 | 13 | ### The filepath to one Sentinel image 14 | sentinel_image_path = "/appl/data/geo/sentinel/s2_example_data/L2A/S2B_MSIL2A_20190530T094039_N0212_R036_T36VUR_20190530T113343.SAFE" 15 | 16 | 17 | def readImage(image_folder_fp): 18 | print(f"Reading Sentinel image from: {image_folder_fp}") 19 | ### Rather than figuring out what the filepath inside SAFE folder is, this is just finding the red and nir files with correct endings 20 | for subdir, dirs, files in os.walk(image_folder_fp): 21 | for file in files: 22 | if file.endswith("_B04_10m.jp2"): 23 | red_fp = os.path.join(subdir, file) 24 | if file.endswith("_B08_10m.jp2"): 25 | nir_fp = os.path.join(subdir, file) 26 | ### Read the red and nir (near-infrared) band files with Rasterio 27 | red = rasterio.open(red_fp) 28 | nir = rasterio.open(nir_fp) 29 | ### Return the rasterio objects as a list 30 | return red, nir 31 | 32 | 33 | def calculateNDVI(red, nir): 34 | print("Computing NDVI") 35 | ### This function calculates NDVI from the red and nir bands 36 | ## Read the rasterio objects pixel information to numpy arrays 37 | red = red.read(1) 38 | nir = nir.read(1) 39 | ### Scale the image values back to real reflectance values (sentinel pixel values have been multiplied by 10000) 40 | red = red / 10000 41 | nir = nir / 10000 42 | ### the NDVI formula 43 | ndvi = (nir - red) / (nir + red) 44 | return ndvi 45 | 46 | 47 | def saveImage(ndvi, sentinel_image_path, input_image): 48 | ## Create an output folder to this location, if it does not exist 49 | outputdir = "output" 50 | if not os.path.exists(outputdir): 51 | os.makedirs(outputdir) 52 | ## Create output filepath for the image. We use the input name with _NDVI end 53 | output_file = os.path.join( 54 | outputdir, os.path.basename(sentinel_image_path).replace(".SAFE", "_NDVI.tif") 55 | ) 56 | print(f"Saving image: {output_file}") 57 | ## Copy the metadata (extent, coordinate system etc.) 
from one of the input bands (red) 58 | metadata = input_image.profile 59 | ## Change the data type from integer to float and file type from jp2 to GeoTiff 60 | metadata.update(dtype=rasterio.float64, driver="GTiff") 61 | ## Write the ndvi numpy array to a GeoTiff with the updated metadata 62 | with rasterio.open(output_file, "w", **metadata) as dst: 63 | dst.write(ndvi, 1) 64 | 65 | 66 | def processImage(sentinel_image_path): 67 | ### This function processes one image (read, compute, save) 68 | ## Read the image and get rasterio objects from the red nir bands 69 | red, nir = readImage(sentinel_image_path) 70 | ## Calculate NDVI and get the resulting numpy array 71 | ndvi = calculateNDVI(red, nir) 72 | ## Write the NDVI numpy array to file to the same extent as the red input band 73 | saveImage(ndvi, sentinel_image_path, red) 74 | 75 | 76 | def main(): 77 | ## run the process on input dir if it is a directory 78 | if os.path.isdir(sentinel_image_path): 79 | print(f"\nProcess of {sentinel_image_path} started") 80 | processImage(sentinel_image_path) 81 | print(f"Processing of {sentinel_image_path} done\n") 82 | 83 | 84 | if __name__ == "__main__": 85 | ## This part is the first to execute when script is ran. It times the execution time and runs the main function 86 | start = time.time() 87 | main() 88 | end = time.time() 89 | print(f"Script completed in {str(end - start)} seconds") 90 | -------------------------------------------------------------------------------- /python/puhti/01_serial/single_core_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example Python script how to calculate NDVI for one Sentinel satellite image 3 | using just 1 process. 4 | 5 | Author: Johannes Nyman, Kylli Ek, Samantha Wittke, Elias Annila CSC 6 | 7 | """ 8 | import os 9 | import sys 10 | import time 11 | import rasterio 12 | 13 | 14 | ### The filepath for the input Sentinel image that is given as input parameter 15 | sentinel_image_path = sys.argv[1] 16 | 17 | 18 | def readImage(image_folder_fp): 19 | print(f"Reading Sentinel image from: {image_folder_fp}") 20 | ### Rather than figuring out what the filepath inside SAFE folder is, this is just finding the red and nir files with correct endings 21 | for subdir, dirs, files in os.walk(image_folder_fp): 22 | for file in files: 23 | if file.endswith("_B04_10m.jp2"): 24 | red_fp = os.path.join(subdir, file) 25 | if file.endswith("_B08_10m.jp2"): 26 | nir_fp = os.path.join(subdir, file) 27 | ### Read the red and nir (near-infrared) band files with Rasterio 28 | red = rasterio.open(red_fp) 29 | nir = rasterio.open(nir_fp) 30 | ### Return the rasterio objects as a list 31 | return red, nir 32 | 33 | 34 | def calculateNDVI(red, nir): 35 | print("Computing NDVI") 36 | ### This function calculates NDVI from the red and nir bands 37 | ## Read the rasterio objects pixel information to numpy arrays 38 | red = red.read(1) 39 | nir = nir.read(1) 40 | ### Scale the image values back to real reflectance values (sentinel pixel values have been multiplied by 10000) 41 | red = red / 10000 42 | nir = nir / 10000 43 | ### the NDVI formula 44 | ndvi = (nir - red) / (nir + red) 45 | return ndvi 46 | 47 | 48 | def saveImage(ndvi, sentinel_image_path, input_image): 49 | ## Create an output folder to this location, if it does not exist 50 | outputdir = "output" 51 | if not os.path.exists(outputdir): 52 | os.makedirs(outputdir) 53 | ## Create output filepath for the image. 
We use the input name with _NDVI end 54 | output_file = os.path.join( 55 | outputdir, os.path.basename(sentinel_image_path).replace(".SAFE", "_NDVI.tif") 56 | ) 57 | print(f"Saving image: {output_file}") 58 | ## Copy the metadata (extent, coordinate system etc.) from one of the input bands (red) 59 | metadata = input_image.profile 60 | ## Change the data type from integer to float and file type from jp2 to GeoTiff 61 | metadata.update(dtype=rasterio.float64, driver="GTiff") 62 | ## Write the ndvi numpy array to a GeoTiff with the updated metadata 63 | with rasterio.open(output_file, "w", **metadata) as dst: 64 | dst.write(ndvi, 1) 65 | 66 | 67 | def processImage(sentinel_image_path): 68 | ### This function processes one image (read, compute, save) 69 | ## Read the image and get rasterio objects from the red nir bands 70 | red, nir = readImage(sentinel_image_path) 71 | ## Calculate NDVI and get the resulting numpy array 72 | ndvi = calculateNDVI(red, nir) 73 | ## Write the NDVI numpy array to file to the same extent as the red input band 74 | saveImage(ndvi, sentinel_image_path, red) 75 | 76 | 77 | def main(): 78 | print(f"\nProcess of {sentinel_image_path} started") 79 | processImage(sentinel_image_path) 80 | print(f"Processing of {sentinel_image_path} done\n") 81 | 82 | 83 | if __name__ == "__main__": 84 | ## This part is the first to execute when script is ran. It times the execution time and rans the main function 85 | start = time.time() 86 | main() 87 | end = time.time() 88 | print(f"Script completed in {str(end - start)} seconds") 89 | -------------------------------------------------------------------------------- /python/puhti/01_serial/single_core_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 5 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --cpus-per-task=1 # How many processors work on one task. Upper limit depends on number of CPUs per node. 9 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | 12 | module load geoconda 13 | 14 | ones2file=/appl/data/geo/sentinel/s2_example_data/L2A/S2B_MSIL2A_20190530T094039_N0212_R036_T36VUR_20190530T113343.SAFE 15 | 16 | srun python single_core_example.py $ones2file 17 | -------------------------------------------------------------------------------- /python/puhti/01_serial/single_core_example_folder.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example Python script how to calculate NDVI for three Sentinel satellite images 3 | using just 1 process. 
4 | For going through all the files, a for-loop is used in the main()- function 5 | 6 | Author: Johannes Nyman, Kylli Ek, Samantha Wittke, Elias Annila CSC 7 | 8 | """ 9 | import os 10 | import sys 11 | import time 12 | import rasterio 13 | 14 | 15 | ### The filepath for the input Sentinel image folder is an input argument to the script 16 | image_folder = sys.argv[1] 17 | 18 | 19 | def readImage(image_folder_fp): 20 | print(f"Reading Sentinel image from: {image_folder_fp}") 21 | ### Rather than figuring out what the filepath inside SAFE folder is, this is just finding the red and nir files with correct endings 22 | for subdir, dirs, files in os.walk(image_folder_fp): 23 | for file in files: 24 | if file.endswith("_B04_10m.jp2"): 25 | red_fp = os.path.join(subdir, file) 26 | if file.endswith("_B08_10m.jp2"): 27 | nir_fp = os.path.join(subdir, file) 28 | ### Read the red and nir (near-infrared) band files with Rasterio 29 | red = rasterio.open(red_fp) 30 | nir = rasterio.open(nir_fp) 31 | ### Return the rasterio objects as a list 32 | return red, nir 33 | 34 | 35 | def calculateNDVI(red, nir): 36 | print("Computing NDVI") 37 | ### This function calculates NDVI from the red and nir bands 38 | ## Read the rasterio objects pixel information to numpy arrays 39 | red = red.read(1) 40 | nir = nir.read(1) 41 | ### Scale the image values back to real reflectance values (sentinel pixel values have been multiplied by 10000) 42 | red = red / 10000 43 | nir = nir / 10000 44 | ### the NDVI formula 45 | ndvi = (nir - red) / (nir + red) 46 | return ndvi 47 | 48 | 49 | def saveImage(ndvi, sentinel_image_path, input_image): 50 | ## Create an output folder to this location, if it does not exist 51 | outputdir = "output" 52 | if not os.path.exists(outputdir): 53 | os.makedirs(outputdir) 54 | ## Create output filepath for the image. We use the input name with _NDVI end 55 | output_file = os.path.join( 56 | outputdir, os.path.basename(sentinel_image_path).replace(".SAFE", "_NDVI.tif") 57 | ) 58 | print(f"Saving image: {output_file}") 59 | ## Copy the metadata (extent, coordinate system etc.) from one of the input bands (red) 60 | metadata = input_image.profile 61 | ## Change the data type from integer to float and file type from jp2 to GeoTiff 62 | metadata.update(dtype=rasterio.float64, driver="GTiff") 63 | ## Write the ndvi numpy array to a GeoTiff with the updated metadata 64 | with rasterio.open(output_file, "w", **metadata) as dst: 65 | dst.write(ndvi, 1) 66 | 67 | 68 | def processImage(sentinel_image_path): 69 | ### This function processes one image (read, compute, save) 70 | ## Read the image and get rasterio objects from the red nir bands 71 | red, nir = readImage(sentinel_image_path) 72 | ## Calculate NDVI and get the resulting numpy array 73 | ndvi = calculateNDVI(red, nir) 74 | ## Write the NDVI numpy array to file to the same extent as the red input band 75 | saveImage(ndvi, sentinel_image_path, red) 76 | 77 | 78 | def main(): 79 | ## Loop the directory where all sentinel image folders are and run processImage function to them one by one 80 | for directory in os.listdir(image_folder): 81 | sentinel_image_path = os.path.join(image_folder, directory) 82 | if os.path.isdir(sentinel_image_path): 83 | print(f"\nProcess of {sentinel_image_path} started") 84 | processImage(sentinel_image_path) 85 | print(f"Processing of {sentinel_image_path} done\n") 86 | 87 | 88 | if __name__ == "__main__": 89 | ## This part is the first to execute when script is ran. 
It times the execution time and rans the main function 90 | start = time.time() 91 | main() 92 | end = time.time() 93 | print(f"Script completed in {str(end - start)} seconds") 94 | -------------------------------------------------------------------------------- /python/puhti/01_serial/single_core_example_folder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 5 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --cpus-per-task=1 # How many processors work on one task. Upper limit depends on number of CPUs per node. 9 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | 12 | module load geoconda 13 | 14 | data_folder=/appl/data/geo/sentinel/s2_example_data/L2A/ 15 | 16 | srun python single_core_example_folder.py $data_folder 17 | -------------------------------------------------------------------------------- /python/puhti/01_serial/single_core_example_list.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --output=slurm-%j.out # File to write the standard output to. %j is replaced by the job ID. 5 | #SBATCH --error=slurm-%j.err # File to write the standard error to. %j is replaced by the job ID. Defaults to slurm-%j.out if not provided. 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --cpus-per-task=1 # How many processors work on one task. Upper limit depends on number of CPUs per node. 9 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | 12 | module load geoconda 13 | 14 | #collect all filepaths in a text file 15 | readlink -f /appl/data/geo/sentinel/s2_example_data/L2A/S2* > image_path_list.txt 16 | 17 | #loop through files in txtfile 18 | while read ones2file; do 19 | srun python single_core_example.py $ones2file 20 | done image_path_list.txt 19 | 20 | parallel -a image_path_list.txt python gnu_parallel_example.py 21 | -------------------------------------------------------------------------------- /python/puhti/03_array/array_job_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example Python script how to calculate NDVI for three Sentinel satellite images 3 | with an array job. 4 | This script handles only ONE file, which is given as parameter to the script. 
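The matching batch script (array_job_example.sh) builds image_path_list.txt with readlink, picks the line matching $SLURM_ARRAY_TASK_ID with sed, and passes that path to this script as sys.argv[1].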
5 | 6 | Author: Johannes Nyman, Kylli Ek, Samantha Wittke, Elias Annila CSC 7 | 8 | """ 9 | 10 | import os 11 | import sys 12 | import time 13 | import rasterio 14 | 15 | ### The filepath for the input Sentinel image that is given as input parameter 16 | sentinel_image_path = sys.argv[1] 17 | 18 | 19 | def readImage(image_folder_fp): 20 | print("Reading Sentinel image from: %s" % (image_folder_fp)) 21 | 22 | ### Rather than figuring out what the filepath inside SAFE folder is, this is just finding the red and nir files with correct endings 23 | for subdir, dirs, files in os.walk(image_folder_fp): 24 | for file in files: 25 | if file.endswith("_B04_10m.jp2"): 26 | red_fp = os.path.join(subdir, file) 27 | if file.endswith("_B08_10m.jp2"): 28 | nir_fp = os.path.join(subdir, file) 29 | 30 | ### Read the red and nir (near-infrared) band files with Rasterio 31 | red = rasterio.open(red_fp) 32 | nir = rasterio.open(nir_fp) 33 | 34 | ### Return the rasterio objects as a list 35 | return red, nir 36 | 37 | 38 | def calculateNDVI(red, nir): 39 | print("Computing NDVI") 40 | ### This function calculates NDVI from the red and nir bands 41 | 42 | ## Read the rasterio objects pixel information to numpy arrays 43 | red = red.read(1) 44 | nir = nir.read(1) 45 | 46 | ### Scale the image values back to real reflectance values (sentinel pixel values have been multiplied by 10000) 47 | red = red / 10000 48 | nir = nir / 10000 49 | 50 | ### the NDVI formula 51 | ndvi = (nir - red) / (nir + red) 52 | return ndvi 53 | 54 | 55 | def saveImage(ndvi, sentinel_image_path, input_image): 56 | ## Create an output folder to this location, if it does not exist 57 | outputdir = "output" 58 | if not os.path.exists(outputdir): 59 | os.makedirs(outputdir) 60 | ## Create output filepath for the image. We use the input name with _NDVI end 61 | output_file = os.path.join( 62 | outputdir, os.path.basename(sentinel_image_path).replace(".SAFE", "_NDVI.tif") 63 | ) 64 | print(f"Saving image: {output_file}") 65 | ## Copy the metadata (extent, coordinate system etc.) from one of the input bands (red) 66 | metadata = input_image.profile 67 | ## Change the data type from integer to float and file type from jp2 to GeoTiff 68 | metadata.update(dtype=rasterio.float64, driver="GTiff") 69 | ## Write the ndvi numpy array to a GeoTiff with the updated metadata 70 | with rasterio.open(output_file, "w", **metadata) as dst: 71 | dst.write(ndvi, 1) 72 | 73 | 74 | def processImage(sentinel_image_path): 75 | ### This function processes one image (read, compute, save) 76 | 77 | ## Read the image and get rasterio objects from the red nir bands 78 | red, nir = readImage(sentinel_image_path) 79 | 80 | ## Calculate NDVI and get the resulting numpy array 81 | ndvi = calculateNDVI(red, nir) 82 | 83 | ## Write the NDVI numpy array to file to the same extent as the red input band 84 | saveImage(ndvi, sentinel_image_path, red) 85 | 86 | 87 | def main(): 88 | print("\nProcess started") 89 | processImage(sentinel_image_path) 90 | print("Processing done\n") 91 | 92 | 93 | if __name__ == "__main__": 94 | ## This part is the first to execute when script is ran. 
It times the execution time and runs the main function 95 | start = time.time() 96 | main() 97 | end = time.time() 98 | print("Script completed in " + str(end - start) + " seconds") 99 | -------------------------------------------------------------------------------- /python/puhti/03_array/array_job_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --output=slurm-%A_%a.out # File to write the standard output to. %A is replaced by the job ID and %a with the array index. 3 | #SBATCH --error=slurm-%A_%a.err # File to write the standard error to. %A is replaced by the job ID and %a with the array index. Defaults to slurm-%A_%a.out if not provided. 4 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 5 | # SBATCH --reservation=geocomputing_thu # Only available during the course 6 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --cpus-per-task=1 # How many processors work on one task. Upper limit depends on number of CPUs per node. 9 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 10 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | #SBATCH --array=1-3 # Indices to specify what array index values should be used. Multiple values may be specified using a comma separated list or a range of values separated by -. 12 | 13 | module load geoconda 14 | 15 | # For looping through all the files: 16 | 17 | # Make a list of input files 18 | readlink -f /appl/data/geo/sentinel/s2_example_data/L2A/S2* > image_path_list.txt 19 | 20 | # Select the inputfile from row n to the array job n. 21 | image_path=$(sed -n ${SLURM_ARRAY_TASK_ID}p image_path_list.txt) 22 | 23 | # Feed the filename to the Python script 24 | srun python array_job_example.py $image_path 25 | -------------------------------------------------------------------------------- /python/puhti/04_parallel_multiprocessing/multiprocessing_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=3 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | 10 | module load geoconda 11 | 12 | datadir=/appl/data/geo/sentinel/s2_example_data/L2A 13 | 14 | srun python multiprocessing_example.py $datadir 15 | -------------------------------------------------------------------------------- /python/puhti/05_parallel_joblib/joblib_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --time=00:05:00 # Maximum duration of the job. 
Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=3 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem-per-cpu=2G # Minimum memory required per usable allocated CPU. Default units are megabytes. 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | 10 | module load geoconda 11 | 12 | datadir=/appl/data/geo/sentinel/s2_example_data/L2A 13 | 14 | srun python joblib_example.py $datadir 15 | -------------------------------------------------------------------------------- /python/puhti/06_parallel_dask/multi_node/dask_multinode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=1 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem-per-cpu=6G # Memory required per allocated CPU. Default units are megabytes. 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | 10 | # The resources reserved here are only for the master job, so 1 core and moderate memory should be enough. 11 | # The resources for workers are reservd in the Python file. 12 | 13 | ### Load the geoconda module which has Python and Dask installed 14 | module load geoconda 15 | 16 | datadir=/appl/data/geo/sentinel/s2_example_data/L2A 17 | 18 | ### Run the Dask example. The directory given to the script has 3 Sentinel images 19 | ### We also give our project name so the master job is able to launch worker jobs 20 | 21 | srun python dask_multinode.py $datadir $SLURM_JOB_ACCOUNT 22 | -------------------------------------------------------------------------------- /python/puhti/06_parallel_dask/single_node/dask_singlenode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_20xxxxx # Choose the project to be billed 3 | # SBATCH --reservation=geocomputing_thu # Only available during the course 4 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=3 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem-per-cpu=6G # Memory required per usable allocated CPU. Default units are megabytes. 8 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 9 | 10 | ### Load the geoconda module which has Python and Dask installed 11 | module load geoconda 12 | 13 | datadir=/appl/data/geo/sentinel/s2_example_data/L2A 14 | 15 | ### Run the Dask example. 
The directory given to the script hosts 3 Sentinel images 16 | srun python dask_singlenode.py $datadir 17 | -------------------------------------------------------------------------------- /python/routing/batch_igraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -A 3 | #SBATCH --output=out_%J.txt # File to write the standard output to. 4 | #SBATCH --error=err_%J.txt # File to write the standard error to. 5 | #SBATCH --time=00:15:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #Number of reserved cores, this number can be later accessed with $SLURM_CPUS_PER_TASK 7 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 8 | #We're operating with shared memory so reserve total amount of memory, not per cpu 9 | #SBATCH --mem=12000 # Real memory required per node. 10 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | 12 | # load needed modules 13 | module load geoconda 14 | 15 | #Pass number of cores reserved to python script as argument, so that correct number of processes can be started 16 | python igraph_parallel.py $SLURM_CPUS_PER_TASK 17 | -------------------------------------------------------------------------------- /python/routing/batch_nx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -A 3 | #SBATCH --output=out_%J.txt # File to write the standard output to. 4 | #SBATCH --error=err_%J.txt # File to write the standard error to. 5 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 6 | #Number of reserved cores, this number can be later accessed with $SLURM_CPUS_PER_TASK 7 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 8 | #We're operating with shared memory so reserve total amount of memory, not per cpu 9 | #SBATCH --mem=6000 # Real memory required per node. 10 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 11 | 12 | # load needed modules 13 | module load geoconda 14 | #Pass number of cores reserved to python script as argument, so that correct number of processes can be started 15 | python nx_parallel.py $SLURM_CPUS_PER_TASK 16 | -------------------------------------------------------------------------------- /python/routing/igraph_parallel.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | import multiprocessing as mp 3 | import time 4 | import sys 5 | 6 | #Graphml file containing street network from Hanko, Finland. 7 | graph_file="data/hanko.graphml" 8 | g = igraph.read(graph_file) 9 | 10 | #Create edge weights based on length 11 | for e in g.es: 12 | e['weight']=float(e['length']) 13 | 14 | #Create "size" amount of (start, end) pairs in list like such: [(s1,e1),(s2,e2)...] where start and end are vertex indices 15 | size=500 16 | args= [(int(len(g.vs)*(1/size)*i),int(len(g.vs)*(1/size)*(i+1)-1)) for i in range(0,size)] 17 | 18 | 19 | #Function to calculate shortest path and in this case just return sum of edge weights. (You can also get path length directly from igraph with igraph.shortest_paths(), but usually you'd probably also want the actual path. 
20 | def sp(start,end): 21 |     path=g.get_shortest_paths(start, to=end, weights='weight',output="epath") 22 |     path_len=[g.es[e]['weight'] for e in path[0]] 23 |     return sum(path_len) 24 | 25 | #Get number of cores from batch job script as argument, so number of processes used here matches the reserved number of cores 26 | print(sys.argv[1], " cores") 27 | 28 | #Create multiprocessing pool and map shortest path calculations for each start, end pair to the pool. 29 | with mp.Pool(processes=int(sys.argv[1])) as pool: 30 |     t0 = time.time() 31 |     results = pool.starmap(sp, args) 32 |     print("Time spent on path calculations", time.time()-t0, " seconds") 33 |     print(sum(results)) 34 | -------------------------------------------------------------------------------- /python/routing/nx_parallel.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import multiprocessing as mp 3 | import time 4 | import numpy as np 5 | import sys 6 | import osmnx 7 | #Graphml file containing the street network of Hanko, Finland. 8 | graph_file="data/hanko.graphml" 9 | g=nx.read_graphml(graph_file, node_type=int) 10 | 11 | print("graph read") 12 | 13 | #Add edge weights based on length 14 | for e in g.edges(data=True): 15 |     e[2]['w']=float(e[2]['length']) 16 | 17 | #Function to calculate the shortest path between two random points on the map. A seed is used to keep the paths the same for each run. If specific nodes were wanted as start and end points, these could be accessed with their OSM id numbers. 18 | def sp(seed): 19 |     np.random.seed(seed) 20 |     route = nx.shortest_path(g, source=np.random.choice(g.nodes),target=np.random.choice(g.nodes), weight='w') 21 |     return route 22 | 23 | #Get the number of available cores as an argument from the batch job script, so that the number of processes used in the pool matches the number of cores reserved in the batch job script. 24 | print(sys.argv[1], " cores") 25 | with mp.Pool(processes=int(sys.argv[1])) as pool: 26 |     t0 = time.time() 27 |     #Map the shortest path function to seeds ranging from 0 to 99 to create 100 paths. 28 |     results = pool.map(sp,range(100)) 29 |     print("Time spent on path calculations", time.time()-t0, " seconds") 30 | 31 | -------------------------------------------------------------------------------- /python/routing/osmnx-graphml.py: -------------------------------------------------------------------------------- 1 | import osmnx as ox 2 | import time 3 | import os 4 | 5 | 6 | #Get graphml from the Overpass API 7 | def place_to_graphml(place, graphml_file): 8 |     G = ox.graph_from_place(place,network_type="drive") 9 |     ox.save_load.save_graphml(G, graphml_file) 10 | 11 | #Get graphml from a local .osm file 12 | def osm_to_graphml(osm_file, graphml_file): 13 |     G = ox.graph_from_file(osm_file,network_type="drive") 14 |     ox.save_load.save_graphml(G, graphml_file) 15 | 16 | place_to_graphml("Helsinki, Finland", "helsinki.graphml") 17 | osm_to_graphml("finland-latest.osm", "finland.graphml") 18 | -------------------------------------------------------------------------------- /python/routing/readme.md: -------------------------------------------------------------------------------- 1 | ## Routing using Python igraph or networkx package with multiprocessing 2 | 3 | Here you can find example code for doing network routing in Puhti with the Python igraph or networkx package. For reading the data from OpenStreetMap, the osmnx package is used. Osmnx can create the network data needed by networkx directly.
For igraph the data has to be saved in GraphML format first, and then igraph can read from GraphML. In general igraph is faster and requires less memory, while networkx might be easier to use. 4 | 5 | Files: 6 | * osmnx-graphml.py - how to create a graph in GraphML format from OpenStreetMap data. 7 | * igraph_parallel.py and nx_parallel.py - how to run shortest path analysis with igraph and networkx in parallel with the multiprocessing module. 8 | * batch_igraph.sh and batch_nx.sh - batch job files for submitting the shortest path scripts to the batch job system in Taito. 9 | 10 | Notes: 11 | * Creating a GraphML file from OpenStreetMap data with osmnx requires a lot of memory, as osmnx uses networkx graphs. Downloading from the Overpass API is suitable only for smaller areas. 12 | * Memory consumption increases with parallelisation, but not by much. 13 | * Parallelisation is done within one node; in Puhti up to 40 cores and 382GB memory (or 1.5TB in the hugemem queue) can be used. 14 | 15 | ### Test results (from Taito Supercomputer 2019) 16 | Time and memory consumption for shortest paths analysis on the whole *Finland* street network from OSM using igraph. 17 | 18 | | Cores |Wall clock (min:s)|Time on pathfinding (min:s)|Mem (GB)| 19 | | ------ |------------------|-----------------------|--------| 20 | | 1|14:01|11:48|7.35| 21 | | 4|5:39|3:27|8.27| 22 | | 10|3:43|1:31|10.05| 23 | 24 | Time and memory consumption for shortest paths analysis on the *Helsinki* street network using networkx. 25 | 26 | | Cores |Wall clock (min:s)|Time on pathfinding (min:s)|Mem (GB)| 27 | | ------ |------------------|-----------------------|--------| 28 | | 1|5:01|3:51|2.39| 29 | | 4|4:35|1:08|3.11| 30 | | 10|1:25|0:30|4.38| 31 | 32 | 33 | -------------------------------------------------------------------------------- /python/sentinel/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Sentinel download script 3 | 4 | This script is an example of how to find and download large quantities of Sentinel-2 images using Python and the [sentinelsat library](https://sentinelsat.readthedocs.io). 5 | 6 | The script works only for the Finnish **Finhub API**, or other national mirrors that use the same API. It does, however, not work for the Copernicus Data Space Ecosystem. Check the [CSC Earth Observation guide](https://docs.csc.fi/support/tutorials/gis/eo_guide/) for alternative ways of downloading Sentinel data from the CDSE. 7 | 8 | Another option for a similar task is to use [STAC](../STAC). 9 | 10 | ### Running 11 | On a local computer, just install the sentinelsat library first. 12 | 13 | In Puhti, sentinelsat is included in the [geoconda module](https://docs.csc.fi/apps/geoconda/), which must be loaded before running the script. 14 | 15 | ``` 16 | module load geoconda 17 | python sentinelsat_download_from_finhub.py 18 | ``` 19 | 20 | You can run the script simply on the login node for smaller amounts of data. 21 | 22 | For bigger amounts of data you can use [screen](https://linuxize.com/post/how-to-use-linux-screen/) or an [interactive session](https://docs.csc.fi/computing/running/interactive-usage/). 23 | 24 | ### Unzipping 25 | 26 | As the Python unzipping is a little complicated with files over 1GB, we recommend using bash commands to unzip the files. 27 | 28 | Unzip all files to the current directory 29 | `unzip '*.zip'` 30 | 31 | Unzip all files to the current directory and delete them at the same time 32 | `find . 
-depth -name '*.zip' -execdir unzip -n {} \; -delete` 33 | 34 | ## Things to consider 35 | 36 | * Finhub API has data limited to Nordics only, as well as only some chosen data products. For more options, check the Copernicus Data Space ecosystem 37 | * If the area of interest is in the middle of two UTM zones, the script often downloads the same image in two different projections. You can specify the UTM zone if you do not want to download duplicates 38 | -------------------------------------------------------------------------------- /python/sentinel/helsinki.geojson: -------------------------------------------------------------------------------- 1 | { 2 | "type": "FeatureCollection", 3 | "features": [ 4 | { 5 | "type": "Feature", 6 | "properties": {}, 7 | "geometry": { 8 | "type": "Polygon", 9 | "coordinates": [ 10 | [ 11 | [ 12 | 24.792, 13 | 60.136 14 | ], 15 | [ 16 | 24.792, 17 | 60.231 18 | ], 19 | [ 20 | 25.064, 21 | 60.231 22 | ], 23 | [ 24 | 25.064, 25 | 60.136 26 | ], 27 | [ 28 | 24.792, 29 | 60.136 30 | ] 31 | ] 32 | ] 33 | } 34 | } 35 | ] 36 | } -------------------------------------------------------------------------------- /python/sentinel/sentinelsat_download_from_finhub.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | An example script for downloading Sentinel data from FinHub with sentinelsat Python library. 4 | 5 | https://finhub.nsdc.fmi.fi 6 | """ 7 | 8 | import sentinelsat 9 | from datetime import date 10 | import pandas as pd 11 | import time 12 | 13 | ### Set your credentials 14 | finhub_user = 'a' 15 | finhub_pwd = 'a' 16 | finhub_url = 'https://finhub.nsdc.fmi.fi' 17 | 18 | ### Open API connection 19 | finhub_api = sentinelsat.SentinelAPI(finhub_user, finhub_pwd, finhub_url) 20 | 21 | ### Search by polygon (WGS84), time, and query keywords 22 | footprint = sentinelsat.geojson_to_wkt(sentinelsat.read_geojson(r'helsinki.geojson')) 23 | startDate = date(2020,1,1) 24 | endDate = date(2020,7,30) 25 | cloudcoverage = (0, 20) 26 | platformname = 'Sentinel-2' 27 | producttype = 'S2MSI1C' 28 | # producttype='S2MSI2A' #for L2 images 29 | area_relation = "Contains" # footprint has to be fully inside the image. Other options "Intersects", "IsWithin" 30 | 31 | ### If your area is between two UTM zones, this script often downloads two versions of the same image 32 | ### Uncomment and add e.g "T35" to only focus on one UTM zone 33 | utm_zone = "" 34 | 35 | ### Image output directory 36 | directory_path = r'sentinel_temp' 37 | 38 | ### Help setting to see product names in full lenghth 39 | pd.set_option('display.max_colwidth', None) 40 | pd.set_option('display.expand_frame_repr', False) 41 | 42 | def calculateTotalSize(size_column): 43 | total_size = 0 44 | for i in size_column: 45 | if "MB" in i: 46 | total_size += float(i.replace(" MB",""))/1000 47 | if "GB" in i: 48 | total_size += float(i.replace(" GB","")) 49 | return round(total_size,2) 50 | 51 | def queryAndDownload(): 52 | 53 | finhub_products = finhub_api.query(footprint, date=(startDate, endDate), platformname=platformname, 54 | cloudcoverpercentage=cloudcoverage, producttype=producttype, 55 | area_relation=area_relation) 56 | 57 | ### Checking, if any results were found 58 | if (len(finhub_products) == 0): 59 | finhub_hasresults = False 60 | print('No products found from Finhub. 
Terminating') 61 | else: 62 | finhub_hasresults = True 63 | 64 | if finhub_hasresults: 65 | finhub_df = finhub_api.to_dataframe(finhub_products) 66 | if utm_zone: 67 | finhub_df = finhub_df[finhub_df['title'].str.contains(utm_zone)] 68 | 69 | finhub_id_to_download = finhub_df.uuid.tolist() 70 | print(f'{len(finhub_id_to_download)} image(s) will be downloaded from Finhub repository') 71 | print(finhub_df.title.to_string(index=False)) 72 | 73 | print(f'All together {calculateTotalSize(finhub_df["size"])} GB will be downloaded') 74 | 75 | ### Download files 76 | if (finhub_hasresults): 77 | finhub_api.download_all(finhub_id_to_download, directory_path=directory_path) 78 | 79 | 80 | def main(): 81 | queryAndDownload() 82 | 83 | 84 | if __name__ == '__main__': 85 | start_time = time.time() 86 | main() 87 | print("The Download script finished in " + str((time.time() - start_time) / 60) + " minutes") 88 | 89 | -------------------------------------------------------------------------------- /python/sentinel_without_credentials/README.md: -------------------------------------------------------------------------------- 1 | ## Allas Sentinel bucket content without AWS credentials 2 | 3 | This is an example script of how to get contents from public buckets in Allas containing Sentinel-2 data (without credentials). 4 | 5 | ### Running 6 | 7 | On CSC's supercomputer Puhti, you can use the [geoconda module](https://docs.csc.fi/apps/geoconda/) which includes the boto3 library: 8 | ``` 9 | module load geoconda 10 | python get_open_sentinel_buckets.py 11 | ``` 12 | 13 | On local machine install the required library: boto3 14 | 15 | ``` 16 | pip install boto3 17 | ``` 18 | 19 | 20 | ### Results 21 | 22 | The script prints out the first bucket's name, and the contents of the first 5 SAFEs in that bucket. The bucket's contents are accessible in the get_contents function. 23 | The image URLs could also be used directly with e.g. `rasterio` package. 24 | -------------------------------------------------------------------------------- /python/zonal_stats/raster-stats/README.md: -------------------------------------------------------------------------------- 1 | * `zonal_stats_serial.py` is the basic version, here the work is done on one core in serial mode. 2 | * `zonal_stats_parallel.py` is the parallel version, where processing of polygons is split to several cores. For parallelization `multiprocessing` library is used. 3 | * `zonal-stats-stac-parallel.py`is the parallel version, where statistics is calculated for several rastes found via STAC. For parallelization `dask` delayed functions are used. 4 | 5 | Additionally batch job scripts are provided, for running this script on CSC's Puhti supercluster. For submitting the job to Puhti: 6 | `sbatch batch_job_XX.sh` 7 | -------------------------------------------------------------------------------- /python/zonal_stats/raster-stats/batch_job-stac-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_2000599 # Choose the project to be billed 3 | #SBATCH --time=02:00:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 5 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 6 | #SBATCH --mem-per-cpu=3G # Minimum memory required per allocated CPU. Default units are megabytes. 7 | #SBATCH --partition=small # Which queue to use. 
Defines maximum time, memory, tasks, nodes and local storage for job 8 | 9 | # Load the geoconda module which has Python with Dask, Xarray and STAC libraries 10 | module load geoconda/.3.12.9_conda_conda 11 | 12 | # Run the Python code. 13 | python zonal-stats-stac-parallel.py -------------------------------------------------------------------------------- /python/zonal_stats/raster-stats/batch_job_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -A 3 | #SBATCH --time=00:05:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 5 | #SBATCH --mem=1000 # Real memory required per node. 6 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 7 | 8 | module load geoconda 9 | python zonal_stats_parallel.py 10 | -------------------------------------------------------------------------------- /python/zonal_stats/raster-stats/batch_job_serial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -A 3 | #SBATCH --time=00:10:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --mem-per-cpu=1000 # Minimum memory required per usable allocated CPU. Default units are megabytes. 5 | #SBATCH --partition=test # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 6 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 7 | 8 | module load geoconda 9 | srun python zonal_stats_serial.py 10 | -------------------------------------------------------------------------------- /python/zonal_stats/raster-stats/zonal_stats_serial.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple example Python script for zonal_stats Python function. 3 | https://pythonhosted.org/rasterstats/ 4 | 5 | Some notes about input datasets: 6 | Raster: 7 | * If all zones together cover almost all raster data and the raster data is not too big, then the fastest is to read raster dataset to memory 8 | in the beginning of the script. Just make sure to reserve enogh memory. This causes also least disk readings and is in general the preferred way. 9 | * If the zones cover only some part raster data or if the raster is too big for memory, then direct read from disk might be better. 10 | See the comments in script, how to modify the code to read directly from disk. If reading data from disk, make sure that the raster has a format that can be paritally read (for example GeoTiff) and that it is has inner tiling (https://gdal.org/drivers/raster/gtiff.html -> TILED) for optimal reading. 
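A tiled copy can be prepared with GDAL, for example (a sketch, assuming an untiled input.tif): gdal_translate -co TILED=YES -co COMPRESS=DEFLATE input.tif input_tiled.tif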
11 | In this case, consider also moving the raster to local disk on the computing node: 12 | https://docs.csc.fi/computing/disk/#compute-nodes 13 | 14 | Author: Elias Annila, Kylli Ek, CSC 15 | Date: 27.01.2022, updated 13.3.2025 16 | """ 17 | 18 | from rasterstats import zonal_stats 19 | import geopandas as gpd 20 | import rasterio 21 | import time 22 | 23 | # Set the processing area, leave out if you want to process the whole file 24 | x_min = 350000.0 25 | y_min = 6700000.0 26 | buffer = 200000 27 | x_max = x_min + buffer 28 | y_max = y_min + buffer 29 | 30 | bbox_3067 = (x_min, y_min, x_max, y_max) 31 | 32 | # File paths: 33 | # Raster you want to use to compute zonal stastics from 34 | raster_file = '/appl/data/geo/mml/dem10m/dem10m_direct.vrt' 35 | # If running the code outside Puhti, get the data from Paituli. 36 | # https://www.nic.funet.fi/index/geodata/mml/dem10m/dem10m_direct.vrt 37 | 38 | # Polygons file 39 | polygons_file = '/appl/data/geo/ruokavirasto/kasvulohkot/2020/LandUse_ExistingLandUse_GSAAAgriculturalParcel.gpkg' 40 | # If running the code outside Puhti, get the data from Paituli. 41 | # polygons_file = 'https://www.nic.funet.fi/index/geodata/ruokavirasto/kasvulohkot/2020/LandUse_ExistingLandUse_GSAAAgriculturalParcel.gpkg' 42 | 43 | # Statistics calculated for each zone 44 | statistics = ['mean'] 45 | #statistics = ['count', 'min' ,'mean', 'max','majority'] 46 | 47 | def main(): 48 | 49 | # Read the vector polygons, leave out bbox, if you want to process the whole file 50 | zones = gpd.read_file(polygons_file , layer="KASVULOHKO", bbox=bbox_3067) 51 | 52 | # zonal_stats does not directly work with rasterio opened file, but needs data and transformation variables 53 | with rasterio.open(raster_file) as src: 54 | # If you want to use the whole raster file, leave out the window part. 55 | raster = src.read(indexes=1, window=rasterio.windows.from_bounds(x_min, y_min, x_max, y_max, src.transform)) 56 | results = zonal_stats(zones.geometry, raster, affine=src.transform, stats=statistics) 57 | 58 | # If you need to read the file from disk. 59 | # results = zonal_stats(zones.geometry, raster_file, stats=statistics) 60 | 61 | #Join the results back to geopandas dataframe 62 | for stat in statistics: 63 | results_as_list = [d[stat] for d in results] 64 | zones[stat] = results_as_list 65 | 66 | if __name__ == '__main__': 67 | t0 = time.time() 68 | main() 69 | t1 = time.time() 70 | total = t1-t0 71 | print("Everything done, took: " + str(round(total, 0))+"s") 72 | -------------------------------------------------------------------------------- /python/zonal_stats/xarray-spatial/xarray-spatial-batch-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --time=00:15:00 # Maximum duration of the job. Upper limit depends on partition. 4 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 5 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 6 | #SBATCH --mem-per-cpu=3G # Minimum memory required per allocated CPU. Default units are megabytes. 7 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 8 | 9 | # Load the geoconda module which has Python with Dask, Xarray and STAC libraries 10 | module load geoconda 11 | 12 | # Run the Python code. 
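# Note: the Python script below sizes its Dask client from this reservation; it reads the
# number of usable CPUs with os.sched_getaffinity(0), so --cpus-per-task above determines
# the number of Dask workers.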
13 | python xarray-spatial-zonal-stats.py 14 | -------------------------------------------------------------------------------- /python/zonal_stats/xarray-spatial/xarray-spatial-zonal-stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # Created 13.3.2025, by Kylli Ek, CSC 4 | 5 | import geopandas as gpd 6 | from geocube.api.core import make_geocube 7 | from xrspatial import zonal_stats 8 | import rioxarray 9 | from dask.distributed import Client, Lock 10 | import os, time 11 | 12 | # Set the processing area, leave out if you want to process the whole file 13 | x_min = 350000.0 14 | y_min = 6700000.0 15 | buffer = 200000 16 | x_max = x_min + buffer 17 | y_max = y_min + buffer 18 | 19 | bbox_3067 = (x_min, y_min, x_max, y_max) 20 | 21 | # File paths: 22 | # Raster you want to use to compute zonal stastics from 23 | raster_file = '/appl/data/geo/mml/dem10m/dem10m_direct.vrt' 24 | # If running the code outside Puhti, get the data from Paituli. 25 | # raster_file = 'https://www.nic.funet.fi/index/geodata/mml/dem10m/dem10m_direct.vrt' 26 | 27 | # Polygons file 28 | polygons_file = '/appl/data/geo/ruokavirasto/kasvulohkot/2020/LandUse_ExistingLandUse_GSAAAgriculturalParcel.gpkg' 29 | # If running the code outside Puhti, get the data from Paituli. 30 | # polygons_file = 'https://www.nic.funet.fi/index/geodata/ruokavirasto/kasvulohkot/2020/LandUse_ExistingLandUse_GSAAAgriculturalParcel.gpkg' 31 | 32 | def main(): 33 | # Get the number of workers 34 | no_of_workers = len(os.sched_getaffinity(0)) 35 | # Create Dask Client for parallel processing 36 | client = Client(n_workers=no_of_workers) 37 | 38 | # Read the raster file, to read as Dasked backed Xarray DataArray 39 | # Notice the use of `chunks=True`. 40 | dem10m = rioxarray.open_rasterio(raster_file, chunks=True) 41 | # Crop the raster file to processing area, leave out if you want to process the whole file 42 | dem10m_clip = dem10m.rio.clip_box(minx=x_min, miny=y_min, maxx=x_max, maxy=y_max) 43 | 44 | # Read the vector polygons, leave out bbox, if you want to process the whole file 45 | polygons = gpd.read_file(polygons_file , layer="KASVULOHKO", bbox=bbox_3067) 46 | polygons['ID'] = polygons.PERUSLOHKOTUNNUS.astype(int) 47 | 48 | # Create Xarray DataArray similar to the raster data 49 | out_grid = make_geocube( 50 | vector_data=polygons, 51 | measurements=["ID"], 52 | like=dem10m_clip 53 | ) 54 | 55 | # Write the rasterized polygons to the disk, so that Dasked backed Xarray DataArray could be created of them. 
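# Passing lock=Lock(name="rio") makes rioxarray funnel any parallel (Dask) writes
# through a single named distributed lock, so workers do not write to the GeoTIFF concurrently.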
56 | out_grid["ID"].rio.to_raster("fields.tif", lock=Lock(name="rio")) 57 | 58 | # Read the rasterized polygons back in as Dasked backed Xarray DataArray 59 | fields = rioxarray.open_rasterio("fields.tif", chunks=True) 60 | 61 | # Caclulate the zonal statistics 62 | zonal_stats_values = zonal_stats(fields[0], dem10m_clip[0], stats_funcs=['mean']).compute() 63 | 64 | # Join the results back to the original zones data 65 | polygons_result = polygons.merge(zonal_stats_values.compute(), left_on='ID', right_on='zone', how='left') 66 | 67 | # With Dask, it is important to use the main function 68 | if __name__ == "__main__": 69 | start = time.time() 70 | main() 71 | end = time.time() 72 | print("Script completed in " + str(end - start) + " seconds") 73 | -------------------------------------------------------------------------------- /snap/01_simple_job/CreateStackGraph.xml: -------------------------------------------------------------------------------- 1 | 2 | 1.0 3 | 4 | CreateStack 5 | 6 | 7 | 8 | 9 | 10 | 11 | NEAREST_NEIGHBOUR 12 | Master 13 | Product Geolocation 14 | 15 | 16 | 17 | Write 18 | 19 | 20 | 21 | 22 | /scratch/project_200XXXX/snap/output/S1A_IW_GRDH_1SDH_20171002T154536_20171002T154601_018636_01F6C8_90C0_Stack.tif 23 | GeoTIFF-BigTIFF 24 | 25 | 26 | 27 | ProductSet-Reader 28 | 29 | 30 | /scratch/project_200XXXX/snap/input/S1A_IW_GRDH_1SDH_20171002T154536_20171002T154601_018636_01F6C8_90C0.tif,/scratch/project_200XXXX/snap/input/S1A_IW_GRDH_1SDH_20171014T154537_20171014T154602_018811_01FC18_2318.tif 31 | 32 | 33 | -------------------------------------------------------------------------------- /snap/01_simple_job/snap_batch_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --account=project_200xxxx # Choose the project to be billed 3 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 4 | #SBATCH --time=04:00:00 # Maximum duration of the job. Upper limit depends on partition. 5 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 6 | #SBATCH --cpus-per-task=2 # How many processors work on one task. Upper limit depends on number of CPUs per node. 7 | #SBATCH --mem=60000 # Real memory required per node. 8 | #SBATCH --gres=nvme:50 # How much local disk to reserve. Default units are gigabytes. 9 | 10 | #The last row resrves 50G of local fast disk on the compute node, it will be used for SNAP and JAVA cache, set by snap_add_userdir. 11 | 12 | module load snap 13 | source snap_add_userdir $LOCAL_SCRATCH 14 | gpt /scratch/project_200XXXX/scripts/CreateStackGraph.xml -q 2 -c 40G -J-Xmx55G -e 15 | 16 | # Match values in gpt command with job reservation: 17 | # -q 2 with --cpus-per-task=2 18 | # -J-Xmx55G with --mem=60000, use for job a few Gb less than reserved 19 | # -c 40G with -J-Xmx55G, use ~75 % of available memory for data cache, depends on task.. 
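#
# As a hypothetical illustration (the values below are assumptions, not from this repository),
# a smaller reservation would be matched the same way: with --cpus-per-task=4 and
# --mem=16000 one could use roughly:
# gpt CreateStackGraph.xml -q 4 -c 10G -J-Xmx14G -e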
-------------------------------------------------------------------------------- /snap/02_array_job/resample_and_lai.xml: -------------------------------------------------------------------------------- 1 | 2 | 1.0 3 | 4 | Resample 5 | 6 | ${sourceProduct} 7 | 8 | 9 | 10 | 11 | 12 | 10 13 | Nearest 14 | First 15 | First 16 | true 17 | 18 | 19 | 20 | BiophysicalOp 21 | 22 | 23 | 24 | 25 | true 26 | false 27 | false 28 | false 29 | false 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /snap/02_array_job/snap_array_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH --output=out_%A_%a.txt # File to write the standard output to. 3 | #SBATCH --error=err_%A_%a.txt # File to write the standard error to. 4 | #SBATCH --account=project_200xxxx # Choose the project to be billed 5 | #SBATCH --partition=small # Which queue to use. Defines maximum time, memory, tasks, nodes and local storage for job 6 | #SBATCH --time=02:00:00 # Maximum duration of the job. Upper limit depends on partition. 7 | #SBATCH --ntasks=1 # Number of tasks. Upper limit depends on partition. 8 | #SBATCH --cpus-per-task=4 # How many processors work on one task. Upper limit depends on number of CPUs per node. 9 | #SBATCH --mem=8000 # Real memory required per node. 10 | #SBATCH --array=1-3 # Indices to specify what array index values should be used. Multiple values may be specified using a comma separated list or a range of values separated by -. 11 | #SBATCH --gres=nvme:10 # How much local disk to reserve. Default units are gigabytes. 12 | 13 | ### Load SNAP module 14 | module load snap 15 | 16 | ### For looping through all the files: 17 | 18 | ### Make a list of input files. This folder has 3 S2L2 images 19 | readlink -f /appl/data/geo/sentinel/s2_example_data/L2A/S2* > image_path_list.txt 20 | 21 | ### Select the inputfile row by row 22 | image_path=$(sed -n ${SLURM_ARRAY_TASK_ID}p image_path_list.txt) 23 | 24 | ### Parse image basename to be used in output filename 25 | image_filename="$(basename -- $image_path)" 26 | 27 | ### Assign an output_folder 28 | output_folder=/scratch/project_2000599/snap/output/ 29 | 30 | # Set custom SNAP user dir 31 | source snap_add_userdir $LOCAL_SCRATCH/cache_"$SLURM_ARRAY_TASK_ID" 32 | 33 | ### -q is num of cores, -t is target file, -SsourceProduct is the xml inside each SAFE folder 34 | gpt resample_and_lai.xml -q 4 -c 5G -J-Xmx7G -t ${output_folder}/${image_filename}_LAI.tif -SsourceProduct=${image_path}/MTD_MSIL2A.xml -e 35 | 36 | # Match values in gpt command with job reservation: 37 | # -q 4 with --cpus-per-task=4 38 | # -J-Xmx7G with --mem=8000, use for job a few Gb less than reserved 39 | # -c 5G with -J-Xmx7G, use ~75 % of available memory for data cache, depends on task.. -------------------------------------------------------------------------------- /snap/README.md: -------------------------------------------------------------------------------- 1 | # SNAP GPT in Puhti 2 | 3 | Examples: 4 | 5 | * [Simple job with one GPT graph](01_simple_job). This example is S1 stacking with several input files. 6 | * [Array job with one GPT graph for 3 images](02_array_job). This is an example of running the SNAP graph for multiple Sentinel-2 Level 3 images with an [array job](https://docs.csc.fi/computing/running/array-jobs/). It resamples the bands and calculates LAI (Leaf area index) for one image. 
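Before editing the graph XML files, the parameters of the operators they use can be checked with gpt's built-in help, for example:

```
module load snap
gpt -h CreateStack
gpt -h Resample
gpt -h BiophysicalOp
```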
7 | 8 | 9 | Both examples include 2 files: 10 | 11 | * .xml-file - the SNAP Graph that defines the processing workflow. 12 | * .sh-file - the batch job script that makes resource (time, memory, cores) reservations to Puhti and starts the gpt command. The batch job file [is submitted to the Puhti queuing system](https://docs.csc.fi/computing/running/submitting-jobs/) 13 | 14 | ``` 15 | sbatch snap_batch_job.sh 16 | OR 17 | sbatch snap_array_job.sh 18 | ``` -------------------------------------------------------------------------------- /supercomputer_installations/ames-stereo_3.2.0.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - nasa-ames-stereo-pipeline 3 | - usgs-astrogeology 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - stereo-pipeline 8 | 9 | -------------------------------------------------------------------------------- /supercomputer_installations/arcgis-python-api-2.1.0.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - esri 3 | dependencies: 4 | - arcgis_learn=2.1.0 5 | - python=3.9 6 | 7 | -------------------------------------------------------------------------------- /supercomputer_installations/geoconda/geoconda_3.10.6.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | # - esri 4 | dependencies: 5 | - python>=3.10 6 | - gdal 7 | # - arcgis 8 | - boto3 9 | - bottleneck 10 | - cartopy 11 | - cfgrib 12 | # - copc-lib 13 | - dask 14 | - dask-geopandas 15 | - dask-jobqueue 16 | - dask-labextension 17 | - dask-ml 18 | - descartes 19 | - earthengine-api 20 | - geoalchemy2 21 | - geopandas 22 | - graphviz 23 | - gstools 24 | - hvplot 25 | - imbalanced-learn 26 | - jinja2 27 | - jupyterlab 28 | - jupyter-git 29 | - laspy 30 | - laxpy 31 | - lidar 32 | - lxml 33 | - metpy 34 | - natsort 35 | - ncview 36 | - netcdf4 37 | - networkx 38 | - osmnx 39 | - owslib 40 | - pdal 41 | - pip 42 | - plotly 43 | - py6S 44 | - pygmt 45 | - pyicu 46 | - pyntcloud 47 | - pyogrio 48 | - pyproj 49 | - pysal 50 | - pystac-client 51 | - python-cdo 52 | - python-graphviz 53 | - python-igraph 54 | - python-keystoneclient 55 | - python-pdal 56 | - python-swiftclient 57 | - python-wget 58 | - rasterio 59 | - rasterstats 60 | - rioxarray 61 | - rtree 62 | - satpy 63 | - scikit-image 64 | - scikit-learn 65 | - sentinelsat 66 | - shapely 67 | - spyder 68 | - stackstac 69 | - wrf-python 70 | - xarray 71 | - xarray-spatial 72 | - xesmf 73 | - pip: 74 | - laszip 75 | # - open3d 76 | 77 | 78 | -------------------------------------------------------------------------------- /supercomputer_installations/geoconda/geoconda_3.10.9.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | # - esri 4 | dependencies: 5 | - python>=3.10 6 | - gdal 7 | # - arcgis 8 | - boto3 9 | - bottleneck 10 | - cartopy 11 | - cfgrib 12 | - copc-lib 13 | - dask 14 | - dask-geopandas 15 | - dask-jobqueue 16 | - dask-labextension 17 | - dask-ml 18 | - descartes 19 | - earthengine-api 20 | - geoalchemy2 21 | - geopandas 22 | - geopy 23 | - graphviz 24 | - gstools 25 | - hvplot 26 | - imageio 27 | - imbalanced-learn 28 | - jinja2 29 | - jupyterlab 30 | - jupyterlab-git 31 | - keplergl 32 | - laspy 33 | - laxpy 34 | - leafmap 35 | - lidar 36 | - lxml 37 | - metpy 38 | - natsort 39 | - ncview 40 | - netcdf4 41 | - networkx 42 | - osmnx 43 | - owslib 44 | - pdal 45 | - pip 46 | - plotly 47 | - 
py6S 48 | - pydeck 49 | - pygmt 50 | - pyicu 51 | - pyntcloud 52 | - pyogrio 53 | - pyproj 54 | - pysal 55 | - pystac-client 56 | - python-cdo 57 | - python-graphviz 58 | - python-igraph 59 | - python-keystoneclient 60 | - python-pdal 61 | - python-swiftclient 62 | - python-wget 63 | - rasterio 64 | - rasterstats 65 | - rioxarray 66 | - rtree 67 | - satpy 68 | - scikit-image 69 | - scikit-learn 70 | - sentinelsat 71 | - shapely 72 | - spyder 73 | - stackstac 74 | - tifffile 75 | - wrf-python 76 | - xarray 77 | - xarray-spatial 78 | - xarray_leaflet 79 | - xesmf 80 | - pip: 81 | - laszip 82 | - open3d 83 | - git+https://github.com/ernstste/landsatlinks.git 84 | 85 | -------------------------------------------------------------------------------- /supercomputer_installations/geoconda/geoconda_3.11.0.yml: -------------------------------------------------------------------------------- 1 | name: geoconda-3.11 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.11 6 | - boto3 7 | - bottleneck 8 | - cartopy 9 | - cfgrib 10 | # - copc-lib Not available for Python 3.12 nor 3.11 11 | - dask 12 | - dask-geopandas 13 | - dask-jobqueue 14 | - dask-labextension 15 | - dask-ml 16 | - descartes 17 | - earthaccess 18 | - earthengine-api 19 | - geoalchemy2 20 | - geopandas 21 | - geopy 22 | - git 23 | - graphviz 24 | - gstools 25 | - hvplot 26 | - h3pandas 27 | - imageio 28 | - imbalanced-learn 29 | - jinja2 30 | - jupyterlab 31 | - jupyterlab-git 32 | - keplergl 33 | - laspy 34 | - lazrs-python 35 | - laxpy 36 | - leafmap 37 | - lxml 38 | - metpy 39 | - movingpandas 40 | - nbclassic 41 | - natsort 42 | - ncview 43 | - netcdf4 44 | - networkx 45 | - openeo 46 | # - osmnx New version coming later in the year, now would require big downgrade of geopandas 47 | - owslib 48 | - pcraster 49 | - pdal 50 | - pip 51 | - plotly 52 | - psy-view 53 | - py6s 54 | - pydeck 55 | - pygmt 56 | - pyicu 57 | - pyntcloud 58 | - pyogrio 59 | - pyproj 60 | - pysal 61 | - pystac-client 62 | - python-cdo 63 | - python-graphviz 64 | - python-igraph 65 | - python-keystoneclient 66 | - python-pdal 67 | - python-swiftclient 68 | - python-wget 69 | - rasterio 70 | - rasterstats 71 | - richdem 72 | - rioxarray 73 | - r5py 74 | - satpy 75 | - scalene 76 | - scikit-image 77 | - scikit-learn 78 | - sentinelsat 79 | - sentinelhub 80 | - shapely 81 | - spyder 82 | - stackstac 83 | - wrf-python 84 | - xarray 85 | - xarray-spatial 86 | - xarray_leaflet 87 | - xesmf 88 | - pip: 89 | - lidar 90 | - laszip 91 | - open3d 92 | - git+https://github.com/ernstste/landsatlinks.git 93 | # - git+https://github.com/pangeo-data/xesmf.git 94 | - git+https://github.com/mayrajeo/geo2ml.git 95 | -------------------------------------------------------------------------------- /supercomputer_installations/geoconda/whitebox_tools_postinstall/download_wbt: -------------------------------------------------------------------------------- 1 | python start_wbt.py 2 | -------------------------------------------------------------------------------- /supercomputer_installations/geoconda/whitebox_tools_postinstall/start_wbt.py: -------------------------------------------------------------------------------- 1 | import whitebox 2 | whitebox.WhiteboxTools() 3 | 4 | --------------------------------------------------------------------------------