├── docker_cygnss_deployment ├── .env ├── Dockerfile ├── requirements.txt └── docker-compose.yml ├── app_write_test.txt ├── Workflow.png ├── .gitmodules ├── deployment ├── mongodb-configmap.yaml ├── mongodb-secret.yaml ├── prefect-agent-deployment.yaml ├── streamlit-deployment.yaml ├── prefect-orion-deployment.yaml ├── mongodb-deployment.yaml └── mongo-express-deployment.yaml ├── set_up_infrastructure.sh ├── .gitignore ├── LICENSE.md ├── download_training_data.py ├── Usage.md ├── README.md ├── dashboard.py ├── Preprocessing.py ├── plots.py ├── API.py ├── notebooks ├── DailyAnalysis.ipynb └── Preprocessing.ipynb ├── prefect-deploy.py └── environment.yml /docker_cygnss_deployment/.env: -------------------------------------------------------------------------------- 1 | UID=201207 2 | GID=201207 3 | -------------------------------------------------------------------------------- /app_write_test.txt: -------------------------------------------------------------------------------- 1 | app_write_test/tmp/tmpxv3x4wj2prefect/tmp/tmpxv3x4wj2prefect -------------------------------------------------------------------------------- /Workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hereon-KSN/cygnss-deployment/HEAD/Workflow.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "externals/gfz_cygnss"] 2 | path = externals/gfz_cygnss 3 | url = https://gitlab.dkrz.de/aim/2020-03-gfz-remote-sensing.git 4 | -------------------------------------------------------------------------------- /deployment/mongodb-configmap.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: mongodb-configmap 5 | data: 6 | database_url: mongodb://root:example@mongodb:27017/ 7 | 
-------------------------------------------------------------------------------- /deployment/mongodb-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: mongodb-secret 5 | type: Opaque 6 | data: 7 | mongo-root-username: dXNlcg== 8 | mongo-root-password: ZXhhbXBsZQ== 9 | -------------------------------------------------------------------------------- /set_up_infrastructure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | git clone --recurse-submodules https://gitlab.dkrz.de/aim/cygnss-deployment 4 | 5 | cd cygnss-deployment/docker_cygnss_deployment 6 | 7 | docker-compose up --build 8 | -------------------------------------------------------------------------------- /docker_cygnss_deployment/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | FROM continuumio/miniconda3 3 | WORKDIR app/ 4 | COPY requirements.txt . 
5 | RUN pip install --upgrade pip 6 | RUN conda install -c conda-forge cartopy 7 | RUN conda install xarray=0.20.1 8 | RUN pip install -r requirements.txt 9 | -------------------------------------------------------------------------------- /docker_cygnss_deployment/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.17.0 2 | scikit-learn==1.2.1 3 | pandas==1.5.3 4 | numpy==1.23.4 5 | requests==2.28.2 6 | Pillow==9.4.0 7 | pymongo==4.3.3 8 | mlflow 9 | matplotlib==3.6.3 10 | scipy==1.10.0 11 | h5py==3.8.0 12 | netcdf4==1.6.2 13 | torch==1.13.1 14 | seaborn==0.12.2 15 | pytorch-lightning==1.5.10 16 | cdsapi==0.5.1 17 | podaac-data-subscriber==1.12.0 18 | global-land-mask==1.0.0 19 | prefect==2.6.8 20 | sqlalchemy 21 | dask==2023.1.1 22 | shutils 23 | -------------------------------------------------------------------------------- /deployment/prefect-agent-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: prefect-agent-deployment 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: prefect-agent 10 | template: 11 | metadata: 12 | labels: 13 | app: prefect-agent 14 | spec: 15 | containers: 16 | - name: prefect-agent 17 | image: streamlit:v1 18 | imagePullPolicy: IfNotPresent 19 | resources: 20 | limits: 21 | memory: "8000Mi" 22 | cpu: "1000m" 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | torchserve-example/mnist/model-store/mnist.mar 2 | torchserve-example/mnist/app/__pycache__/* 3 | torchserve-example/mnist/app/static/test_data/* 4 | saved_models/* 5 | data/* 6 | notebooks/.ipynb_checkpoints/* 7 | __pycache__/* 8 | cycnss_frauke.sqlite-journal 9 | cycnss_test_frauke.sqlite 10 | notebooks/lightning_logs/* 11 | utils/* 12 | lightning_logs/ 13 | 
mlruns/* 14 | mlruns.db 15 | utils/mathematics.py 16 | utils/__pycache__/* 17 | plots/* 18 | docker_cygnss_deployment/volumes/ 19 | annotated_raw_data/* 20 | raw_data/* 21 | dev_data/* 22 | prediction/* 23 | 2022-cygnss-deployment/* 24 | -------------------------------------------------------------------------------- /deployment/streamlit-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: streamlit-deployment 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: streamlit 10 | template: 11 | metadata: 12 | labels: 13 | app: streamlit 14 | spec: 15 | containers: 16 | - name: streamlit 17 | image: streamlit:v1 18 | imagePullPolicy: IfNotPresent 19 | #resources: 20 | # limits: 21 | # memory: "8000Mi" 22 | # cpu: "1000m" 23 | ports: 24 | - containerPort: 8501 25 | -------------------------------------------------------------------------------- /deployment/prefect-orion-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: prefect-orion-deployment 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: prefect-orion 10 | template: 11 | metadata: 12 | labels: 13 | app: prefect-orion 14 | spec: 15 | containers: 16 | - name: prefect-orion 17 | image: prefecthq/prefect:2.6.8-python3.11 18 | imagePullPolicy: IfNotPresent 19 | resources: 20 | limits: 21 | memory: "700Mi" 22 | cpu: "500m" 23 | ports: 24 | - containerPort: 4200 25 | -------------------------------------------------------------------------------- /deployment/mongodb-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: mongodb-deployment 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: mongodb 10 | template: 11 | metadata: 12 | labels: 13 
| app: mongodb 14 | spec: 15 | containers: 16 | - name: mongodb 17 | image: mongo:6.0.3 18 | imagePullPolicy: IfNotPresent 19 | ports: 20 | - containerPort: 27017 21 | volumeMounts: 22 | - mountPath: /data/db 23 | name: mongodb 24 | env: 25 | - name: MONGO_INITDB_ROOT_USERNAME 26 | valueFrom: 27 | secretKeyRef: 28 | name: mongodb-secret 29 | key: mongo-root-username 30 | - name: MONGO_INITDB_ROOT_PASSWORD 31 | valueFrom: 32 | secretKeyRef: 33 | name: mongodb-secret 34 | key: mongo-root-password 35 | volumes: 36 | - name: mongodb -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022-2023 Frauke Albrecht, Caroline Arnold, Harsh Grover (DKRZ-AIM) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /deployment/mongo-express-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: mongodb-express-deployment 5 | labels: 6 | app: mongodb-express 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: mongodb-express 12 | template: 13 | metadata: 14 | labels: 15 | app: mongodb-express 16 | spec: 17 | containers: 18 | - name: mongodb-express 19 | image: mongo-express:1.0.0-alpha.4 20 | imagePullPolicy: IfNotPresent 21 | ports: 22 | - containerPort: 8081 23 | volumeMounts: 24 | - mountPath: /data/db 25 | name: mongodb 26 | env: 27 | - name: ME_CONFIG_MONGODB_ADMINUSERNAME 28 | valueFrom: 29 | secretKeyRef: 30 | name: mongodb-secret 31 | key: mongo-root-username 32 | - name: ME_CONFIG_MONGODB_ADMINPASSWORD 33 | valueFrom: 34 | secretKeyRef: 35 | name: mongodb-secret 36 | key: mongo-root-password 37 | - name: ME_CONFIG_MONGODB_SERVER 38 | valueFrom: 39 | configMapKeyRef: 40 | name: mongodb-configmap 41 | key: database_url 42 | volumes: 43 | - name: mongodb -------------------------------------------------------------------------------- /download_training_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from API import download_raw_data 3 | from datetime import datetime, timedelta, date 4 | from Preprocessing import pre_processing 5 | 6 | def download_data(year, month, day, raw_data_root): 7 | # Using API calls 8 | download_raw_data(year, month, day, raw_data_root=raw_data_root) 9 | 10 | def main(offset): 11 | 12 | # Define the date and pass it to the individual tasks 13 | download_date = date.today() - timedelta(days=int(offset)) 14 | date_ = download_date.strftime("%Y-%m-%d") 15 | 16 | raw_data_root = '/work/ka1176/shared_data/2020-03/raw_data_v3-1' 17 | annotated_raw_data_root = 
'/work/ka1176/shared_data/2020-03/annotated_raw_data_v3-1' 18 | 19 | print("*"*50) 20 | print(" Download date", date_) 21 | print("*"*50) 22 | 23 | # Download data for the past 10th day from today, today - 10th day 24 | download_data(download_date.year, download_date.month, download_date.day, raw_data_root) 25 | 26 | # annotate data 27 | # create filtered hdf5 from preprocessing 28 | pre_processing(download_date.year, download_date.month, download_date.day, dev_data_dir='/scratch/k/k202141/', 29 | raw_data_root=raw_data_root, annotated_raw_data_root=annotated_raw_data_root) 30 | 31 | if __name__ == "__main__": 32 | 33 | main(sys.argv[1]) 34 | -------------------------------------------------------------------------------- /Usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | ## In Script 4 | 5 | ```bash 6 | cd ~/cygnss-deployment 7 | 8 | # download CyGNSS data 9 | python API.py 10 | 11 | # download ERA5 data and annotate CyGNSS data with wind speed labels 12 | # preprocess (filter) to create hdf5 13 | python Preprocessing.py 14 | 15 | # Inference 16 | PYTHONPATH="./externals/gfz_cygnss/":${PYTHONPATH} 17 | export PYTHONPATH 18 | 19 | python ./externals/gfz_cygnss/gfz_202003/training/cygnssnet.py --load-model-path ./externals/gfz_cygnss/trained_models/ygambdos_yykDM.ckpt --data ./dev_data --save-y-true --prediction-output-path ./prediction/current_predictions.h5 20 | ``` 21 | 22 | ## In Jupyter notebook 23 | 24 | ### Kernel 25 | 26 | Create `conda` environment using 27 | 28 | ```bash 29 | conda env create --file docker/kernel-env-cuda11.yaml 30 | 31 | conda activate cygnss-d 32 | 33 | # some packages were not installed correctly 34 | conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch 35 | conda install pytorch-lightning -c conda-forge 36 | pip install global-land-mask 37 | ``` 38 | Create Jupyterhub kernel from this environment following 
https://docs.dkrz.de/doc/software%26services/jupyterhub/kernels.html 39 | 40 | ### Setup for preprocessing 41 | 42 | #### Earthdata 43 | 44 | - Retrieve user ID and create `.netrc` as described in ... 45 | - change the persmission of the file: chmod og-rwx ~/.netrc 46 | 47 | #### ERA5 48 | 49 | Retrieve user ID and API key and create `cdsapi` as described in ... 50 | -------------------------------------------------------------------------------- /docker_cygnss_deployment/docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Use root/example as user/password credentials 2 | version: '3.1' 3 | services: 4 | mongodb: 5 | image: mongo:6.0.3 6 | container_name: mongodb 7 | restart: always 8 | volumes: 9 | - mongodbdata:/data/db 10 | environment: 11 | MONGO_INITDB_ROOT_USERNAME: root 12 | MONGO_INITDB_ROOT_PASSWORD: example 13 | networks: 14 | - backend 15 | 16 | mongo-express: 17 | image: mongo-express:1.0.0-alpha.4 18 | container_name: mongo-express 19 | restart: always 20 | ports: 21 | - 8081:8081 22 | volumes: 23 | - mongodbdata:/data/db 24 | environment: 25 | ME_CONFIG_MONGODB_ADMINUSERNAME: root 26 | ME_CONFIG_MONGODB_ADMINPASSWORD: example 27 | ME_CONFIG_MONGODB_URL: mongodb://root:example@mongodb:27017/ 28 | networks: 29 | - backend 30 | 31 | 32 | streamlit: 33 | user: "${UID}:${GID}" 34 | build: . 
35 | restart: always 36 | volumes: 37 | - "./../:/app/" 38 | - /home/k/k202156/.netrc:/.netrc 39 | - /home/k/k202156/.cdsapirc:/.cdsapirc 40 | ports: 41 | - "8501:8501" 42 | - "5000:5000" 43 | - "80:80" 44 | # command: bash -c "streamlit run dashboard.py" 45 | command: bash -c "python prefect-deploy.py && streamlit run dashboard.py --server.port=80 && mlflow ui --backend-store-uri sqlite:///mlruns.db -p 5000" 46 | env_file: 47 | - .env 48 | environment: 49 | PREFECT_API_URL: http://orion:4200/api 50 | depends_on: 51 | - mongodb 52 | networks: 53 | - backend 54 | 55 | orion: 56 | image: prefecthq/prefect:2.6.8-python3.11 57 | restart: always 58 | ports: 59 | - "4200:4200" 60 | volumes: 61 | - prefect:/root/.prefect 62 | entrypoint: ["prefect", "orion", "start"] 63 | environment: 64 | PREFECT_ORION_API_HOST: 0.0.0.0 65 | PREFECT_LOGGING_SERVER_LEVEL: WARNING 66 | PREFECT_API_URL: http://localhost:4200/api 67 | #PREFECT_ORION_DATABASE_CONNECTION_URL: sqlite+aiosqlite:////root/.prefect/orion.db 68 | 69 | depends_on: 70 | - mongodb 71 | networks: 72 | - backend 73 | 74 | 75 | prefect-agent: 76 | user: "${UID}:${GID}" 77 | restart: always 78 | build: . 
79 | entrypoint: ["prefect", "agent", "start", "-q", "demo"] 80 | volumes: 81 | - "./../:/app/" 82 | - ${HOME}/.netrc:/.netrc 83 | - ${HOME}/.cdsapirc:/.cdsapirc 84 | environment: 85 | PREFECT_API_URL: http://orion:4200/api 86 | PREFECT_LOGGING_LEVEL: DEBUG 87 | env_file: 88 | - .env 89 | depends_on: 90 | - orion 91 | networks: 92 | - backend 93 | 94 | 95 | networks: 96 | backend: 97 | driver: bridge 98 | 99 | volumes: 100 | mongodbdata: 101 | driver: local 102 | prefect: 103 | 104 | 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web Interface for Wind Speed Prediction 2 | 3 | ### About 4 | 5 | The objective of this repository is to deploy a pre-trained *CyGNSSnet* to predict global ocean wind speed in near time. The results are shown on a web interface, which provides different illustrations of the predicted wind speed and its error compared to [ERA5 windspeed](https://www.ecmwf.int/en/forecasts/datasets/reanalysis-datasets/era5) data. 6 | 7 | *CyGNSSnet* is a neural net developed to predict wind speed from [CYGNSS](https://podaac.jpl.nasa.gov/dataset/CYGNSS_L2_V3.0)(**Cy**clone **G**lobal **N**avigation **S**atellite **S**ystem) data. The code for *CyGNSSnet* itself is not public. For more information or if you need to access it contact Caroline Arnold (arnold@dkrz.de) or the Helmholtz AI consultant team for Earth and Environment (consultant-helmholtz.ai@dkrz.de). For more information on *CyGNSSnet*, see [Asgarimehr et al, Remote Sensing of Environment (2022)](https://doi.org/10.1016/j.rse.2021.112801) 8 | ### Workflow 9 | 10 | ![Workflow](/Workflow.png) 11 | 12 | 13 | 14 | ### Quick start 15 | 16 | To start the deployment run ```sh set_up_infrastructure.sh```. 17 | 18 | This clones the git repository and starts the deployment using docker-compose. 19 | Make sure you have docker and docker-compose installed. 
20 | 21 | If you have already cloned the git repository move to the directory ```docker_cygnss_deployment``` and run 22 | 23 | ``` 24 | docker-compose up 25 | ``` 26 | 27 | To stop the container, run the following command: 28 | ``` 29 | docker-compose -f ./docker-compose.yml down --remove-orphans 30 | ``` 31 | 32 | Note: In order to run it you need access to the external submodule containing the CyGNSSnet. 33 | 34 | The deployment is scheduled using prefect. It is executed every day and downloads the CyGNSS data for the current date minus 10 days. Then the predictions are calculated, stored in a mongodb database and displayed on a streamlit dashboard. 35 | 36 | To access the streamlit dashboard: http://localhost:8501 37 | 38 | To access the mongodb database: http://localhost:8081 39 | 40 | To access the prefect ui: http://localhost:4200 41 | 42 | 43 | ### Repository Structure 44 | 45 | ``` 46 | API.py: download CyGNSS data 47 | Preprocessing.py: download ERA5 data and preprocess data 48 | dashboard.py: streamlit dashboard 49 | plots.py: helper functions to create the plots for the streamlit dashboard 50 | prefect-deploy.py: Deployment scheduled for every day 51 | externals/: folder with CyGNSSnet code 52 | notebooks/: folder with some notebooks that were created during the development 53 | docker_cygnss_deployment/: folder with docker files to start deployment 54 | ``` 55 | 56 | ## Data source 57 | 58 | - CYGNSS. CYGNSS Level 2 Science Data Record Version 3.1. Ver. 3.1. PO.DAAC, CA, USA. accessed 2022/2023 at 10.5067/CYGNS-L2X31 59 | - Copernicus Climate Change Service (C3S) (2017): ERA5: Fifth generation of ECMWF atmospheric reanalyses of the global climate. Copernicus Climate Change Service Climate Data Store (CDS), 2022/2023. 
https://cds.climate.copernicus.eu/cdsapp#!/home 60 | -------------------------------------------------------------------------------- /dashboard.py: -------------------------------------------------------------------------------- 1 | #import libraries 2 | import streamlit as st 3 | import pandas as pd 4 | import numpy as np 5 | import requests 6 | from sklearn.ensemble import RandomForestClassifier 7 | import json 8 | import datetime 9 | from datetime import timedelta 10 | 11 | import streamlit as st 12 | from pymongo import MongoClient, errors 13 | from PIL import Image 14 | import requests 15 | from io import BytesIO 16 | 17 | 18 | 19 | def user_input_features(): 20 | option = st.sidebar.selectbox( 21 | 'What would you like to see?', ('Results', 'About us')) 22 | date_ = st.sidebar.date_input("For which date you want to see the results", datetime.date.today() - timedelta(days=12), min_value = datetime.date(2021,1,1), max_value = datetime.date.today() - timedelta(days=12)) 23 | 24 | 25 | return date_, option 26 | 27 | # Initialize connection. 28 | # Uses st.experimental_singleton to only run once. 29 | @st.experimental_singleton 30 | def init_connection(): 31 | client = MongoClient('mongodb://root:example@mongodb:27017/') 32 | return client 33 | 34 | 35 | @st.experimental_memo(ttl=600) 36 | def get_data(date_): 37 | cygnss = client.cygnss 38 | from_date = date_ 39 | criteria = {"event_date": {"$eq": from_date}} 40 | items = cygnss.cygnss_collection.find(criteria) 41 | items = list(items) # make hashable for st.experimental_memo 42 | return items 43 | 44 | 45 | date_, option = user_input_features() 46 | 47 | 48 | # Pull data from the collection. 49 | # Uses st.experimental_memo to only rerun when the query changes or after 10 min. 
50 | # Initializing connection 51 | client = init_connection() 52 | 53 | date_ = date_.strftime("%Y-%m-%d") 54 | 55 | # drop database if exists, just to not clutter it with multiple values of data 56 | # client.drop_database('cygnss') 57 | items = get_data(date_) 58 | 59 | if option == 'About us': 60 | 61 | 62 | st.write(""" 63 | # About US""") 64 | 65 | st.write("The objective of this website is to use a pre-trained CyGNSSnet \ 66 | to predict global ocean wind speed in near time. The results are shown on a web interface, \ 67 | which provides different illustrations of the predicted wind speed and its error compared to ERA5 windspeed data.\ 68 | CyGNSSnet is a neural net developed to predict wind speed from CYGNSS(Cyclone Global Navigation Satellite System) data.\ 69 | The code for CyGNSSnet itself is not public. For more information or if you need to access it contact Caroline Arnold (arnold@dkrz.de)\ 70 | or the Helmholtz AI consultant team for Earth and Environment (consultant-helmholtz.ai@dkrz.de). For more information on CyGNSSnet,\ 71 | see Asgarimehr et al, Remote Sensing of Environment (2022)") 72 | 73 | if option == 'Results': 74 | 75 | 76 | # Display results. 77 | if len(items) == 0: 78 | st.write(f" Data does not exist for this date. Choose a different date please!") 79 | 80 | else: 81 | # Creating UI 82 | # st.subheader('User Input parameters') 83 | 84 | st.write(""" 85 | # Results """) 86 | 87 | # app heading 88 | st.write(""" 89 | # Ocean Wind Speed""") 90 | 91 | st.write('Date:', date_) 92 | 93 | 94 | y_bins = ["up to 4m/s", "up to 8m/s", "up to 12m/s", 95 | "up to 16m/s", "up to 20m/s", "up to 100m/s"] 96 | for item in items: # @harsh can this be more than 1 item? 
97 | st.write(f"Total RMSE is: {item['rmse']:.3f} m/s ") 98 | d = {'Windspeed': y_bins, 'RMSE': item['bin_rmse'], 'Bias': item['bin_bias'], 99 | 'Counts': [int(i) for i in item['bin_counts']]} 100 | df = pd.DataFrame(data=d) 101 | # hide first column (index) of the table 102 | hide_table_row_index = """ 103 | 107 | """ 108 | st.markdown(hide_table_row_index, unsafe_allow_html=True) 109 | st.table(data=df) 110 | 111 | for item in items: 112 | #response = requests.get(item['image_url']) 113 | # Image.open(BytesIO(response.content)) 114 | scatter = Image.open(item['scatterplot_path']) 115 | st.markdown(f"## Scatterplot: ERA5 wind speed - model prediction") 116 | st.image(scatter, caption="Scatterplot") 117 | 118 | histo = Image.open(item['histogram_path']) 119 | st.markdown(f"## Histogram: ERA5 wind speed and predicted wind speed") 120 | st.image(histo, caption="Histogram") 121 | 122 | #era_avg = Image.open(item['era_average_path']) 123 | # st.markdown(f"## ERA 5 Average") 124 | #st.image(era_avg, caption="ERA5 average") 125 | 126 | #rmse_avg = Image.open(item['rmse_average_path']) 127 | # st.markdown(f"## RMSE Average") 128 | #st.image(rmse_avg, caption="RMSE average") 129 | 130 | today_longavg = Image.open(item['today_longrunavg_path']) 131 | st.markdown(f"## RMSE - Today and Longrun Average") 132 | st.image(today_longavg, caption="RMSE - Today and Longrun Average") 133 | 134 | today_long_bias = Image.open(item['today_long_bias_path']) 135 | st.markdown(f"## BIAS - Today and Longrun Average") 136 | st.image(today_long_bias, caption="Bias - Today and Longrun Average") 137 | 138 | sample_counts = Image.open(item['sample_counts_path']) 139 | st.markdown(f"## Sample Counts") 140 | st.image(sample_counts, caption="Sample Counts") 141 | 142 | rmse_bins_era = Image.open(item['rmse_bins_era_path']) 143 | st.markdown(f"## RMSE for different Windspeed Bins") 144 | st.image(rmse_bins_era, caption="RMSE for different Windspeed Bins") 145 | 146 | bias_bins_era = 
Image.open(item['bias_bins_era_path']) 147 | st.markdown(f"## Bias for different Windspeed Bins") 148 | st.image(bias_bins_era, caption="Bias for different Windspeed Bins") 149 | 150 | -------------------------------------------------------------------------------- /Preprocessing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Preprocessing CyGNSS data 5 | 6 | import os 7 | import sys 8 | from datetime import datetime, date, timedelta 9 | import argparse 10 | 11 | sys.path.append('externals/gfz_cygnss/') 12 | from gfz_202003.preprocessing import preprocess as prep 13 | #sys.path.append('externals/gfz_cygnss/gfz_202003') 14 | #from preprocessing import preprocess as prep 15 | 16 | import numpy as np 17 | import xarray as xr 18 | import hashlib 19 | 20 | def pre_processing(year, month, day, dev_data_dir='/app/dev_data', raw_data_root='/app/raw_data', annotated_raw_data_root='/app/annotated_raw_data'): 21 | ''' 22 | Preprocessing routines for CyGNSSnet 23 | 24 | (1) Annotate CyGNSS raw data with windspeed labels from ERA5 25 | (2) Filter and generate hdf5 file 26 | 27 | Folder structure: 28 | 29 | * raw_data 30 | * annotated_raw_data 31 | * dev_data : filtered, one file test_data.h5 32 | 33 | Parameters: 34 | year, month, day - preprocess the data downloaded for that day 35 | dev_data_dir - directory to store the filtered data for that day 36 | raw_data_root - where to find the downloaded raw data 37 | annotated_raw_data_root - where to store the annotated raw data 38 | 39 | Returns: 40 | h5_file - path to the filtered data for that day 41 | ''' 42 | 43 | raw_data_sub = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").strftime("%Y/%j") 44 | 45 | raw_data_dir = os.path.join(raw_data_root, raw_data_sub) 46 | annotated_raw_data_dir = os.path.join(annotated_raw_data_root, raw_data_sub) 47 | era5_data = os.path.join(raw_data_dir, 'ERA5_windspeed.nc') 48 | 49 | if not 
os.path.isdir(annotated_raw_data_dir): 50 | os.makedirs(annotated_raw_data_dir, exist_ok=True) 51 | 52 | if not os.path.isdir(dev_data_dir): 53 | os.makedirs(dev_data_dir, exist_ok=True) 54 | 55 | start_date = datetime(year, month, day).strftime("%Y-%m-%dT%H:%M:%SZ") 56 | end_date = (datetime(year, month, day) + timedelta(1)).strftime("%Y-%m-%dT%H:%M:%SZ") 57 | 58 | for cygnss_file in os.listdir(raw_data_dir): 59 | if cygnss_file.startswith('cyg') and cygnss_file.endswith('.nc'): 60 | print("annotating", cygnss_file) 61 | 62 | pcf = os.path.join(raw_data_dir, cygnss_file) 63 | phf = os.path.join(annotated_raw_data_dir, cygnss_file.replace('.nc', '.md5')) 64 | 65 | print("create hash", phf) 66 | 67 | if os.path.exists(phf): 68 | print("-- hash exists, skip") 69 | continue 70 | 71 | annotate_dataset(pcf, era5_data, save_dataset=True) 72 | 73 | hmd5 = hash_large_file(pcf) 74 | with open(phf, 'w') as hf: 75 | hf.write(hmd5) 76 | 77 | dday = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").strftime("%j") # need that later 78 | 79 | args = argparse.Namespace(raw_data_dir=annotated_raw_data_root, 80 | output_dir=dev_data_dir, 81 | v_map=['brcs', 'eff_scatter', 'raw_counts', 'power_analog'], 82 | n_valid_days=0, 83 | n_test_days=1, 84 | n_processes=1, 85 | only_merge=False, 86 | use_land_data=False, 87 | is_ml_ops=True, 88 | version='v3.1', 89 | day=dday, 90 | year=year, 91 | reduce_mode='') 92 | 93 | prep.generate_input_data(args) 94 | 95 | def hash_large_file(file): 96 | ''' 97 | Read a large file in chunks and compute the MD5 checksum 98 | 99 | Parameters: 100 | file - the file to be hashed 101 | 102 | Returns: 103 | hash(file) 104 | ''' 105 | with open(file,'rb') as f: 106 | file_hash = hashlib.md5() 107 | while chunk := f.read(8192): 108 | file_hash.update(chunk) 109 | 110 | print(file_hash.hexdigest()) 111 | return file_hash.hexdigest() 112 | 113 | def annotate_dataset(cygnss_file, era5_file, save_dataset=False): 114 | ''' 115 | Annotate a given CyGNSS dataset 
with ERA5 windspeed labels and save to disk 116 | 117 | The ERA5 grid is padded to mimic periodic boundary conditions. 118 | 119 | Annotate additional ERA5 parameters (GPM_precipitation) 120 | 121 | TODO: hash 122 | 123 | Parameters: 124 | cygnss_file : path to CyGNSS dataset 125 | era5_file : path to orresponding ERA5 dataset 126 | save_dataset : if True, save dataset to disk in annotated_raw_data_dir (default: False) 127 | 128 | Returns: 129 | Annotated CyGNSS dataset 130 | ''' 131 | 132 | # necessary because lazy loading prohibits overwriting the netcdf files at the end of this section 133 | with xr.open_dataset(cygnss_file) as data: 134 | cygnss_ds = data.load() 135 | 136 | with xr.open_dataset(era5_file) as data: 137 | era5_ds = data.load() 138 | 139 | # needs to be shifted by 180 for compatibility with CyGNSS 140 | era5_ds = era5_ds.assign_coords(longitude=era5_ds.coords['longitude'] + 180) 141 | 142 | # pad to the right (> 360 deg lon) 143 | era5_r = era5_ds.where(era5_ds.longitude < 10, drop=True) 144 | # pad to the left (< 0 deg lon) 145 | era5_l = era5_ds.where(era5_ds.longitude > 350, drop=True) 146 | # shift coordinate outside bounding box 147 | era5_r = era5_r.assign_coords(longitude=era5_r.coords['longitude'] + 360) 148 | era5_l = era5_l.assign_coords(longitude=era5_l.coords['longitude'] - 360) 149 | 150 | padded_ds = xr.merge([era5_l, era5_ds, era5_r]) 151 | 152 | interp_ds = padded_ds.interp(longitude=cygnss_ds.sp_lon, latitude=cygnss_ds.sp_lat, time=cygnss_ds.ddm_timestamp_utc, method='nearest') 153 | 154 | cygnss_ds['ERA5_u10'] = interp_ds['u10'] 155 | cygnss_ds['ERA5_v10'] = interp_ds['v10'] 156 | cygnss_ds['GPM_precipitation'] = interp_ds['tp'] 157 | 158 | tmp_attrs = cygnss_ds['ERA5_u10'].attrs 159 | tmp_attrs['long_name'] = cygnss_ds['ERA5_u10'].long_name + ' (interpolated)' 160 | cygnss_ds['ERA5_u10'].attrs = tmp_attrs 161 | 162 | tmp_attrs = cygnss_ds['ERA5_v10'].attrs 163 | tmp_attrs['long_name'] = cygnss_ds['ERA5_v10'].long_name + ' 
(interpolated)' 164 | cygnss_ds['ERA5_v10'].attrs = tmp_attrs 165 | 166 | cygnss_ds = cygnss_ds.drop_vars(['longitude', 'latitude', 'time']) 167 | 168 | # dummy values only for preprocessing routine 169 | cygnss_ds['ERA5_mdts'] = -9999 170 | cygnss_ds['ERA5_mdww'] = -9999 171 | cygnss_ds['ERA5_swh'] = -9999 172 | cygnss_ds['ERA5_shts'] = -9999 173 | cygnss_ds['ERA5_shww'] = -9999 174 | cygnss_ds['ERA5_p140121'] = -9999 175 | cygnss_ds['ERA5_p140124'] = -9999 176 | cygnss_ds['ERA5_p140127'] = -9999 177 | 178 | # additional condition - check for quality flag here 179 | cygnss_ds = cygnss_ds.where(cygnss_ds['quality_flags'] == 4, drop=True) 180 | 181 | if save_dataset: 182 | cygnss_ds.to_netcdf(cygnss_file.replace('raw_data', 'annotated_raw_data')) 183 | 184 | return cygnss_ds 185 | 186 | if __name__=='__main__': 187 | pre_processing() 188 | -------------------------------------------------------------------------------- /plots.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from matplotlib import lines, colors, ticker 7 | import seaborn as sns 8 | import cartopy.crs as ccrs 9 | from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter 10 | from mpl_toolkits.axes_grid1 import AxesGrid 11 | import itertools 12 | plt.switch_backend('agg') 13 | 14 | deg = 1 # grid resolution (publication: 1) 15 | 16 | grid_lon = np.arange(-180, 181, deg) 17 | grid_lat = np.arange(-90, 91, deg) 18 | 19 | def average_to_grid2(lon, lat, var, resolution=1, fill_value=-1): 20 | ''' 21 | Grid a time-dependent variable in lon/lat and average over all counts 22 | 23 | lon - time series of lon coordinate (1D) (0...360) 24 | lat - time series of lat coordinate (1D) 25 | var - time series of variable (1D) 26 | resolution - target grid resolution (default: 1 deg) 27 | fill_value - a value that can be used for filling (i.e. 
def average_to_grid2(lon, lat, var, resolution=1, fill_value=-1):
    '''
    Grid a time-dependent variable in lon/lat and average over all counts.

    Parameters:
        lon        - time series of lon coordinate (1D) (0...360)
        lat        - time series of lat coordinate (1D)
        var        - time series of variable (1D)
        resolution - target grid resolution in degrees (default: 1)
        fill_value - sentinel value that must not occur in var; cells that
                     never receive a sample are converted to NaN on return

    Returns:
        2D gridded arrays for lon, lat, count-averaged var
    '''
    assert len(lon) == len(lat)
    assert len(lon) == len(var)

    grid_lon = np.arange(0, 360 + resolution, resolution)
    grid_lat = np.arange(-90, 90 + resolution, resolution)[::-1]  # top left is +lat

    ix_lon = np.digitize(lon, grid_lon)
    ix_lat = np.digitize(lat, grid_lat)

    xx, yy = np.meshgrid(grid_lon, grid_lat, indexing='ij')
    gridded_var = np.empty(xx.shape, dtype='float')
    gridded_var[:] = fill_value

    # average var over every occupied (lon, lat) cell
    ij = itertools.product(np.unique(ix_lon), np.unique(ix_lat))
    for i, j in ij:
        cond = (ix_lon == i) & (ix_lat == j)
        gridded_var[i, j] = np.mean(var[cond])

    # cells still holding the sentinel never received a sample -> NaN
    gridded_var[gridded_var == fill_value] = None

    return xx, yy, gridded_var

def make_scatterplot(y_true, y_pred, date_):
    '''Hexbin density scatter of predicted vs. ERA5 wind speed, saved to /app/plots.'''
    ymin = 2.5
    ymax = 25.0

    fig = plt.figure()
    ax = fig.add_subplot(111)

    img = ax.hexbin(y_true, y_pred, cmap='viridis', norm=colors.LogNorm(vmin=1, vmax=25000), mincnt=1)
    clb = plt.colorbar(img)
    clb.set_ticks([1, 10, 100, 1000, 10000])
    clb.set_ticklabels([r'$1$', r'$10$', r'$10^2$', r'$10^3$', r'$10^4$'])
    clb.set_label('Samples in bin')
    clb.ax.tick_params()

    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Predicted wind speed (m/s)')

    # identity line (perfect prediction)
    ax.plot(np.linspace(0, 30), np.linspace(0, 30), 'w:')

    ax.set_ylim(ymin, 25)
    ax.set_xlim(ymin, 25)

    ax.set_xticks([5, 10, 15, 20, 25])
    ax.set_xticklabels([5, 10, 15, 20, 25])
    ax.set_yticks([5, 10, 15, 20, 25])
    ax.set_yticklabels([5, 10, 15, 20, 25])

    fig.tight_layout()
    plt.savefig(f'/app/plots/scatter_{date_}.png')

def make_histogram(y_true, y_pred, date_):
    '''Overlaid histograms of true (ERA5) and predicted wind speed, saved to /app/plots.'''
    fig = plt.figure()
    ax = fig.add_subplot(111)

    sns.histplot(y_true, ax=ax, color='C7', label='ERA5 wind speed (m/s)')
    sns.histplot(y_pred, ax=ax, color='C2', label='Predicted wind speed (m/s)')

    ax.legend(fontsize=12)

    ax.set_xticks([5, 10, 15, 20, 25])
    ax.set_xticklabels([5, 10, 15, 20, 25])
    ax.set_xlabel('ERA5 wind speed (m/s)')

    plt.savefig(f'/app/plots/histo_{date_}.png')

def era_average(y_true, sp_lon, sp_lat, date_):
    '''Gridded world map of the average ERA5 wind speed, saved to /app/plots.'''
    xx, yy, gridded_y_true = average_to_grid2(sp_lon[:], sp_lat[:], y_true[:], resolution=deg)
    proj = ccrs.PlateCarree(180)

    fig, ax = plt.subplots(1, 1, figsize=(6, 4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))
    cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_y_true[:].T, levels=60, transform=proj, antialiased=False, cmap='magma')
    ax.coastlines()
    gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')
    gl.top_labels = False
    gl.right_labels = False
    clb = plt.colorbar(cmap, ax=ax, orientation='horizontal', shrink=1, label='Average ERA5 wind speed (m/s)')

    clb.set_ticks(np.arange(2.5, 18, 2.5))
    clb.ax.tick_params(labelsize=8)

    gl.xlabel_style = {'size': 8, 'color': 'black'}
    gl.ylabel_style = {'size': 8, 'color': 'black'}

    plt.savefig(f'/app/plots/era_average_{date_}.png')

def rmse_average(y_true, y_pred, sp_lon, sp_lat, date_=None):
    '''
    Gridded world map of the average absolute error |y_pred - y_true|.

    date_ - date string used in the output filename. If None (previous call
            signature), the figure is created but not written -- before this
            fix the plot was *always* discarded, even though save_to_db in
            prefect-deploy.py records a /app/plots/rmse_average_{date_}.png
            path. Passing date_ now actually produces that file.
    '''
    xx, yy, gridded_rmse = average_to_grid2(sp_lon[:], sp_lat[:], np.abs(y_pred[:] - y_true[:]), resolution=deg)
    proj = ccrs.PlateCarree(180)
    fig, ax = plt.subplots(1, 1, figsize=(6, 4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))
    cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_rmse[:].T, levels=60, transform=proj, antialiased=False, cmap='viridis')
    ax.coastlines()
    gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')
    gl.top_labels = False
    gl.right_labels = False
    clb = plt.colorbar(cmap, ax=ax, orientation='horizontal', shrink=1, label='Average RMSE (m/s)')

    clb.set_ticks(np.arange(0, np.nanmax(gridded_rmse) + 1, 1.0))
    clb.ax.tick_params(labelsize=8)

    gl.xlabel_style = {'size': 8, 'color': 'black'}
    gl.ylabel_style = {'size': 8, 'color': 'black'}

    if date_ is not None:
        plt.savefig(f'/app/plots/rmse_average_{date_}.png')

def today_longrunavg(df_mockup, y_bins, date_):
    '''Grouped bar plot comparing today's RMSE per bin against the long-running average.'''
    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(111)

    sns.barplot(data=df_mockup, x='bins', y='rmse', hue='time', ax=ax)
    ax.legend()

    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('RMSE (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/today_longrunavg_{date_}.png')

def today_longrunavg_bias(df_mockup, y_bins, date_):
    '''Grouped bar plot comparing today's bias per bin against the long-running average.'''
    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(111)

    sns.barplot(data=df_mockup, x='bins', y='bias', hue='time', ax=ax)
    ax.legend()

    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Bias (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/today_long_bias_{date_}.png')

def sample_counts(df_rmse, y_bins, date_):
    '''Bar plot of the number of samples per ERA5 wind speed bin.'''
    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(111)
    sns.barplot(data=df_rmse, x='bins', y='counts', ax=ax)
    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Sample counts')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/sample_counts_{date_}.png')

def rmse_bins_era(df_rmse, y_bins, date_):
    '''Bar plot of RMSE per ERA5 wind speed bin.'''
    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(111)
    sns.barplot(data=df_rmse, x='bins', y='rmse', ax=ax)
    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('RMSE (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/rmse_bins_era_{date_}.png')
def rmse_bins_era(df_rmse, y_bins, date_):
    '''Bar plot of RMSE per ERA5 wind speed bin, saved to /app/plots.'''
    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(111)
    sns.barplot(data=df_rmse, x='bins', y='rmse', ax=ax)
    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('RMSE (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/rmse_bins_era_{date_}.png')

def bias_bins_era(df_rmse, y_bins, date_):
    '''Bar plot of prediction bias per ERA5 wind speed bin, saved to /app/plots.'''
    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(111)
    sns.barplot(data=df_rmse, x='bins', y='bias', ax=ax)
    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Bias (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/bias_bins_era_{date_}.png')

# ---------------------------------------------------------------- API.py ----
import xarray as xr
import numpy as np
import os
import sys
from datetime import date, timedelta, datetime

from subscriber import podaac_access as pa
import cdsapi
from urllib.error import HTTPError
from urllib.request import urlretrieve
import logging
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

def download_raw_data(year, month, day, raw_data_root='/app/raw_data'):
    '''
    Download raw data using API

    * CyGNSS data
    * ERA5 data

    For compliance with the CyGNSSnet preprocessing routines, the data is stored in

    > {raw_data_root}/{year}/{day-of-year}

    Parameters:
        year, month, day - download data from the full day specified
        raw_data_root - root of path to store the data
    '''
    # %j = zero-padded day of year, as expected by the preprocessing routines
    raw_data_sub = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").strftime("%Y/%j")

    raw_data_dir = os.path.join(raw_data_root, raw_data_sub)

    print('Downloading data in this directory: ', raw_data_dir)

    start_date = datetime(year, month, day).strftime("%Y-%m-%dT%H:%M:%SZ")
    end_date = (datetime(year, month, day) + timedelta(1)).strftime("%Y-%m-%dT%H:%M:%SZ")

    print(f'--start-date {start_date}')
    print(f'--end-date {end_date}')

    # PODAAC data
    adapted_podaac_downloader(start_date, end_date, raw_data_dir)

    # ERA5 data
    era5_downloader(year, month, day, raw_data_dir)

def era5_downloader(year, month, day, raw_data_dir):
    '''
    ERA5 data downloader from Copernicus

    We need to download all the time steps of the current day, as well as the
    time step midnight on the following day. These are merged.

    Parameters:
        year, month, day - download data from the full day specified
        raw_data_dir - path to store the data
    '''
    print("Start ERA5 download")
    target_data = os.path.join(raw_data_dir, 'ERA5_windspeed.nc')
    era5_data = os.path.join(raw_data_dir, 'ERA5_today.nc')
    tomorrow_era5_data = os.path.join(raw_data_dir, 'ERA5_tomorrow.nc')
    cds = cdsapi.Client()

    # Retrieve today's data (all 24 hourly time steps)
    cds.retrieve(
        'reanalysis-era5-single-levels',
        {
            'product_type': 'reanalysis',
            'format': 'netcdf',
            'variable': [
                '10m_u_component_of_wind', '10m_v_component_of_wind',
                'total_precipitation',
            ],
            'year': year,
            'month': month,
            'day': day,
            'time': [
                '00:00', '01:00', '02:00',
                '03:00', '04:00', '05:00',
                '06:00', '07:00', '08:00',
                '09:00', '10:00', '11:00',
                '12:00', '13:00', '14:00',
                '15:00', '16:00', '17:00',
                '18:00', '19:00', '20:00',
                '21:00', '22:00', '23:00'
            ],
            'area': [
                50, -180, -50, 180,
            ],
        },
        era5_data)

    # Retrieve the first time steps of the following day
    tomorrow = datetime(year, month, day) + timedelta(1)

    cds.retrieve(
        'reanalysis-era5-single-levels',
        {
            'product_type': 'reanalysis',
            'format': 'netcdf',
            'variable': [
                '10m_u_component_of_wind', '10m_v_component_of_wind',
                'total_precipitation',
            ],
            'year': tomorrow.year,
            'month': tomorrow.month,
            'day': tomorrow.day,
            'time': [
                '00:00', '01:00'
            ],
            'area': [
                50, -180, -50, 180,
            ],
        },
        tomorrow_era5_data)

    # Merge both days into the single target file
    with xr.open_dataset(era5_data) as f1, xr.open_dataset(tomorrow_era5_data) as f2:
        era5_ds = xr.merge([f1.load(), f2.load()])
        era5_ds.to_netcdf(target_data)

    print('SUCCESS: Retrieved ERA5 data')

def adapted_podaac_downloader(start_date, end_date, data_path):
    '''
    PODAAC data downloader adapted for CyGNSSnet

    Adapted from the run routine in
    https://github.com/podaac/data-subscriber/blob/main/subscriber/podaac_data_downloader.py

    Parameters:
        start_date - download start date in ISO format
        end_date - download end date in ISO format
        data_path - path to store the data
    '''
    # Default values
    page_size = 2000
    edl = pa.edl
    cmr = pa.cmr
    token_url = pa.token_url

    pa.setup_earthdata_login_auth(edl)
    token = pa.get_token(token_url)
    print('Completed PODAAC authentification')

    provider = 'POCLOUD'
    #search_cycles = args.search_cycles [None ?]
    short_name = 'CYGNSS_L1_V3.1'
    extensions = None
    #process_cmd = args.process_cmd [empty ?]

    download_limit = None
    ts_shift = timedelta(hours=0)

    verbose = True
    force = False

    if not os.path.isdir(data_path):
        print("NOTE: Making new data directory at " + data_path + "(This is the first run.)")
        os.makedirs(data_path, exist_ok=True)

    temporal_range = pa.get_temporal_range(start_date, end_date,
                                           datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))  # noqa E501
    params = [
        ('page_size', page_size),
        ('sort_key', "-start_date"),
        ('provider', provider),
        ('ShortName', short_name),
        ('temporal', temporal_range),
    ]
    print("Temporal Range: " + temporal_range)

    # TODO bbox

    #if args.bbox is not None:
    #    params.append(('bounding_box', args.bbox))

    # If 401 is raised, refresh token and try one more time
    try:
        results = pa.get_search_results(params, verbose)
    except HTTPError as e:
        if e.code == 401:
            token = pa.refresh_token(token, 'podaac-subscriber')
            # BUGFIX: params is a list of (key, value) tuples; the original
            # `params['token'] = token` raised "list indices must be integers"
            # and made the retry path always fail
            params.append(('token', token))
            results = pa.get_search_results(params, verbose)
        else:
            raise e

    if verbose:
        print(str(results['hits']) + " granules found for " + short_name)  # noqa E501

    downloads_all = []
    downloads_data = [[u['URL'] for u in r['umm']['RelatedUrls'] if
                       u['Type'] == "GET DATA" and ('Subtype' not in u or u['Subtype'] != "OPENDAP DATA")] for r in
                      results['items']]
    downloads_metadata = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "EXTENDED METADATA"] for r in
                          results['items']]
    checksums = pa.extract_checksums(results)

    for f in downloads_data:
        downloads_all.append(f)
    for f in downloads_metadata:
        downloads_all.append(f)

    downloads = [item for sublist in downloads_all for item in sublist]

    if len(downloads) >= page_size:
        logging.warning("Only the most recent " + str(
            page_size) + " granules will be downloaded; try adjusting your search criteria (suggestion: reduce time period or spatial region of search) to ensure you retrieve all granules.")

    # filter list based on extension
    if not extensions:
        extensions = pa.extensions
    filtered_downloads = []
    for f in downloads:
        for extension in extensions:
            if f.lower().endswith(extension):
                filtered_downloads.append(f)

    downloads = filtered_downloads

    print("Found " + str(len(downloads)) + " total files to download")
    if verbose:
        print("Downloading files with extensions: " + str(extensions))

    # NEED TO REFACTOR THIS, A LOT OF STUFF in here
    # Finish by downloading the files to the data directory in a loop.
    # Overwrite `.update` with a new timestamp on success.
    success_cnt = failure_cnt = skip_cnt = 0
    for f in downloads:
        try:
            output_path = os.path.join(data_path, os.path.basename(f))

            # decide if we should actually download this file (e.g. we may already have the latest version)
            if(os.path.exists(output_path) and not force and pa.checksum_does_match(output_path, checksums)):
                print(str(datetime.now()) + " SKIPPED: " + f)
                skip_cnt += 1
                continue

            urlretrieve(f, output_path)
            #pa.process_file(process_cmd, output_path, args)
            print(str(datetime.now()) + " SUCCESS: " + f)
            success_cnt = success_cnt + 1

            #if limit is set and we're at or over it, stop downloading
            if download_limit and success_cnt >= download_limit:
                break

        except Exception:
            logging.warning(str(datetime.now()) + " FAILURE: " + f, exc_info=True)
            failure_cnt = failure_cnt + 1

    print("Downloaded Files: " + str(success_cnt))
    print("Failed Files: " + str(failure_cnt))
    print("Skipped Files: " + str(skip_cnt))
    pa.delete_token(token_url, token)
    print("END\n\n")

if __name__=='__main__':
    download_data_date = date.today() - timedelta(days=10)
    download_raw_data(year = download_data_date.year, month = download_data_date.month, day = download_data_date.day)
"execution_count": null, 27 | "id": "dcf836dd-cf50-43af-bd72-34f151b9b006", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "f_pred = h5py.File('/work/ka1176/caroline/gitlab/cygnss-deployment/prediction/current_predictions.h5', 'r')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "ac9c112a-931d-46e4-9e54-bf8fa164ed9a", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "f_pred.keys()" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "44e45c03-df95-4b80-868d-ebc01a1d6642", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "y_true = f_pred['y_true'][:]\n", 52 | "y_pred = f_pred['y_pred'][:]\n", 53 | "sp_lon = f_pred['sp_lon'][:]\n", 54 | "sp_lat = f_pred['sp_lat'][:]" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "479629bd-1a80-46fb-af33-e806db8be955", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "rmse = mean_squared_error(y_true, y_pred, squared=False)\n", 65 | "\n", 66 | "print(f'Overall root mean square error (RMSE): {rmse:.4f} m/s')" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "88ef8364-a0b7-4c92-a9b2-2d1485a4c54d", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "y_bins = [4, 8, 12, 16, 20, 100]\n", 77 | "y_ix = np.digitize(y_true, y_bins, right=False)\n", 78 | "\n", 79 | "all_rmse = np.zeros(len(y_bins))\n", 80 | "all_bias = np.zeros(len(y_bins))\n", 81 | "all_counts = np.zeros(len(y_bins))" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "794d4be3-e785-4c9a-ac2b-bd5eb7ad795e", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "for i, yy in enumerate(y_bins):\n", 92 | " if np.any(y_ix==i):\n", 93 | " rmse = mean_squared_error(y_true[y_ix==i], y_pred[y_ix==i], squared=False)\n", 94 | " all_rmse[i] = rmse\n", 95 | " all_bias[i] = np.mean(y_pred[y_ix==i] - 
y_true[y_ix==i])\n", 96 | " all_counts[i] = np.sum(y_ix==i)\n", 97 | " print(f'RMSE in bin {i} (up to {yy} m/s): {rmse:.4f} m/s')\n", 98 | " else:\n", 99 | " all_rmse[i] = None\n", 100 | " all_bias[i] = None\n", 101 | " all_counts[i] = 0\n", 102 | " print(f\"--- No samples in bin {i} (up to {yy} m/s)\")\n", 103 | " \n", 104 | "df_rmse = pd.DataFrame(dict(rmse=all_rmse, bias=all_bias, bins=y_bins, counts=all_counts))" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "718f101f-8c42-4ade-9d5c-e0650490ca9b", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "9512aff8-2d0f-48a7-944d-8fa9e0c9cbf0", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "sns.set_style('whitegrid')\n", 123 | "sns.set_context('talk')" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "f7281124-4ae0-4075-a0f4-5cc526144e36", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "fig=plt.figure()\n", 134 | "ax=fig.add_subplot(111)\n", 135 | "\n", 136 | "sns.histplot(y_true, ax=ax, color='C7', label='ERA5 wind speed (m/s)')\n", 137 | "sns.histplot(y_pred, ax=ax, color='C2', label='Predicted wind speed (m/s)')\n", 138 | "\n", 139 | "ax.legend(fontsize=12)\n", 140 | "\n", 141 | "ax.set_xticks([5, 10, 15, 20, 25])\n", 142 | "ax.set_xticklabels([5, 10, 15, 20, 25])\n", 143 | "ax.set_xlabel('ERA5 wind speed (m/s)')\n", 144 | "\n", 145 | "plt.show()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "bdf3083d-4881-4552-9174-766235fef0a6", 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "ymin = 2.5\n", 156 | "ymax = 25.0\n", 157 | "\n", 158 | "fig=plt.figure()\n", 159 | "ax=fig.add_subplot(111)\n", 160 | "\n", 161 | "img=ax.hexbin(y_true, y_pred, cmap='viridis', norm=colors.LogNorm(vmin=1, vmax=25000), mincnt=1)\n", 162 | 
"clb=plt.colorbar(img)\n", 163 | "clb.set_ticks([1, 10, 100, 1000, 10000])\n", 164 | "clb.set_ticklabels([r'$1$', r'$10$', r'$10^2$', r'$10^3$', r'$10^4$'])\n", 165 | "clb.set_label('Samples in bin')\n", 166 | "clb.ax.tick_params()\n", 167 | "\n", 168 | "ax.set_xlabel('ERA5 wind speed (m/s)')\n", 169 | "ax.set_ylabel('Predicted wind speed (m/s)')\n", 170 | "\n", 171 | "ax.plot(np.linspace(0, 30), np.linspace(0, 30), 'r:')\n", 172 | "\n", 173 | "ax.set_ylim(ymin, 25)\n", 174 | "ax.set_xlim(ymin, 25)\n", 175 | "\n", 176 | "ax.set_xticks([5, 10, 15, 20, 25])\n", 177 | "ax.set_xticklabels([5, 10, 15, 20, 25])\n", 178 | "ax.set_yticks([5, 10, 15, 20, 25])\n", 179 | "ax.set_yticklabels([5, 10, 15, 20, 25])\n", 180 | "\n", 181 | "fig.tight_layout()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "0460d844-6142-497a-9710-312e2c3da617", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "import cartopy.crs as ccrs\n", 192 | "from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter\n", 193 | "from mpl_toolkits.axes_grid1 import AxesGrid" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "a2bc579a-bf63-43ba-8547-6e0173f8903c", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "deg = 1 # grid resolution (publication: 1)\n", 204 | "\n", 205 | "xx, yy, gridded_y_true = mat.average_to_grid2(sp_lon[:], sp_lat[:], y_true[:], resolution=deg)\n", 206 | "xx, yy, gridded_y_pred = mat.average_to_grid2(sp_lon[:], sp_lat[:], y_pred[:], resolution=deg)\n", 207 | "xx, yy, gridded_rmse = mat.average_to_grid2(sp_lon[:], sp_lat[:], np.abs(y_pred[:] - y_true[:]), resolution=deg)\n", 208 | "xx, yy, gridded_bias = mat.average_to_grid2(sp_lon[:], sp_lat[:], y_pred[:] - y_true[:], resolution=deg)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "id": "db6e3e4b-1127-4013-9041-27a4f74412d4", 215 | "metadata": {}, 216 | 
"outputs": [], 217 | "source": [ 218 | "grid_lon = np.arange(-180, 181, deg)\n", 219 | "grid_lat = np.arange(-90, 91, deg)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "id": "2844ffa6-b5f0-4f26-b11d-1c111929b59d", 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "proj = ccrs.PlateCarree(180)\n", 230 | "fig, ax = plt.subplots(1, 1, figsize=(6,4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))\n", 231 | "cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_y_true[:].T, levels=60, transform=proj, antialiased=False, cmap='magma')\n", 232 | "ax.coastlines()\n", 233 | "gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')\n", 234 | "gl.top_labels = False\n", 235 | "gl.right_labels= False\n", 236 | "clb = plt.colorbar(cmap, ax=ax, orientation='horizontal', shrink=1, label='Average ERA5 wind speed (m/s)')\n", 237 | "\n", 238 | "clb.set_ticks(np.arange(2.5, 18, 2.5))\n", 239 | "clb.ax.tick_params(labelsize=8)\n", 240 | "\n", 241 | "gl.xlabel_style = {'size': 8, 'color': 'black'}\n", 242 | "gl.ylabel_style = {'size': 8, 'color': 'black'}\n", 243 | "\n", 244 | "plt.show()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "id": "e28af2cb-7400-460e-982e-f03f33ebf67c", 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "proj = ccrs.PlateCarree(180)\n", 255 | "fig, ax = plt.subplots(1, 1, figsize=(6,4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))\n", 256 | "cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_rmse[:].T, levels=60, transform=proj, antialiased=False, cmap='viridis')\n", 257 | "ax.coastlines()\n", 258 | "gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')\n", 259 | "gl.top_labels = False\n", 260 | "gl.right_labels= False\n", 261 | "clb = plt.colorbar(cmap, ax=ax, 
orientation='horizontal', shrink=1, label='Average RMSE (m/s)')\n", 262 | "\n", 263 | "clb.set_ticks(np.arange(0, np.nanmax(gridded_rmse)+1, 1.0))\n", 264 | "clb.ax.tick_params(labelsize=8)\n", 265 | "\n", 266 | "gl.xlabel_style = {'size': 8, 'color': 'black'}\n", 267 | "gl.ylabel_style = {'size': 8, 'color': 'black'}\n", 268 | "\n", 269 | "plt.show()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "937f9d18-12fa-436a-baea-f92af46d5e87", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "CyGNSS Deployment", 284 | "language": "python", 285 | "name": "cygnss-d" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.9.13" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 5 302 | } 303 | -------------------------------------------------------------------------------- /prefect-deploy.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pydoc import cli 3 | import sys 4 | import shutil 5 | import time 6 | import pandas as pd 7 | import numpy as np 8 | import h5py 9 | import torch 10 | from torch.utils.data import DataLoader, Dataset 11 | import pytorch_lightning as pl 12 | from pytorch_lightning.callbacks.model_summary import ModelSummary 13 | from sklearn.metrics import mean_squared_error 14 | from collections import namedtuple 15 | import xarray 16 | import mlflow 17 | from prefect import flow, task 18 | import streamlit as st 19 | # TODO Fix these imports 20 | # from prefect.deployments import DeploymentSpec 21 | #from prefect.flow_runners import SubprocessFlowRunner 22 | from prefect.orion.schemas.schedules import 
from prefect.orion.schemas.schedules import IntervalSchedule, CronSchedule
from prefect.deployments import Deployment
from prefect.filesystems import RemoteFileSystem
from prefect.infrastructure import DockerContainer
from prefect.task_runners import SequentialTaskRunner
from pymongo import MongoClient, errors
from API import download_raw_data
from datetime import datetime, timedelta, date
sys.path.append('/app/externals/gfz_cygnss/')
sys.path.append('/app/externals/gfz_cygnss/gfz_202003')
sys.path.append('/app/externals/gfz_cygnss/gfz_202003/training')

from cygnssnet import ImageNet, DenseNet, CyGNSSNet, CyGNSSDataset, CyGNSSDataModule
from plots import make_scatterplot, make_histogram, era_average, rmse_average, today_longrunavg, today_longrunavg_bias, sample_counts, rmse_bins_era, bias_bins_era
from Preprocessing import pre_processing

@task
def download_data(year, month, day):
    '''Download CyGNSS + ERA5 raw data for the given day via API calls.'''
    download_raw_data(year, month, day)

@task
def get_data(client):
    '''Print the stored RMSE of every document in the cygnss collection.'''
    cygnss = client.cygnss
    items = cygnss.cygnss_collection.find()
    items = list(items)  # make hashable for st.experimental_memo
    for item in items:
        print(f"RMSE is: {item['rmse']}")

@task
def drop_database(client):
    '''Drop the entire cygnss database (destructive; used for resets).'''
    client.drop_database('cygnss')

@task
@st.experimental_singleton
def save_to_db(domain, port, y_pred, rmse, date_, rmse_time):
    '''
    Persist the day's predictions, metrics and plot paths to MongoDB.

    Parameters:
        domain, port - MongoDB host and port
        y_pred       - predicted wind speeds (numpy array)
        rmse         - overall RMSE (numpy scalar)
        date_        - date string keying this run's plot filenames
        rmse_time    - per-bin metrics DataFrame (columns: rmse, bias, counts)

    Connection errors are caught and logged rather than raised.
    '''
    try:
        print('entering mongo db connection')

        # NOTE(review): credentials are hard-coded; should come from the
        # mongodb-secret / environment instead
        client = MongoClient(
            host = [ str(domain) + ":" + str(port) ],
            serverSelectionTimeoutMS = 3000, # 3 second timeout
            username = "root",
            password = "example",
        )

        # uncomment and if you wanna clear out the data
        #client.drop_database('cygnss')

        # print the version of MongoDB server if connection successful
        print ("server version:", client.server_info()["version"])
        data = {
            "rmse": rmse.tolist(),
            "bin_rmse": rmse_time["rmse"].tolist(),
            "bin_bias": rmse_time["bias"].tolist(),
            "bin_counts": rmse_time["counts"].tolist(),
            "event_date": date_,
            "scatterplot_path": f"/app/plots/scatter_{date_}.png",
            "histogram_path": f"/app/plots/histo_{date_}.png",
            "era_average_path": f"/app/plots/era_average_{date_}.png",
            "rmse_average_path": f"/app/plots/rmse_average_{date_}.png",
            "today_longrunavg_path": f"/app/plots/today_longrunavg_{date_}.png",
            "today_long_bias_path": f"/app/plots/today_long_bias_{date_}.png",
            "sample_counts_path": f"/app/plots/sample_counts_{date_}.png",
            "rmse_bins_era_path": f"/app/plots/rmse_bins_era_{date_}.png",
            "bias_bins_era_path": f"/app/plots/bias_bins_era_{date_}.png",
            "y_pred": y_pred.tolist()
        }

        cygnss_collection = client["cygnss"].cygnss_collection

        cygnss_collection = cygnss_collection.insert_many([data])

        print(f"Multiple tutorials: {cygnss_collection.inserted_ids}")

    except errors.ServerSelectionTimeoutError as err:
        # set the client and DB name list to 'None' and `[]` if exception
        client = None
        # catch pymongo.errors.ServerSelectionTimeoutError
        print (err)

@task
def get_hyper_params(model_path, model, data_path):
    '''
    Load hyperparameters from a model checkpoint and point them at data_path.

    Returns:
        args        - namedtuple of hyperparameters
        col_idx_lat - index of 'sp_lat' in the evaluation-parameter columns
        col_idx_lon - index of 'sp_lon' in the evaluation-parameter columns
    '''
    # Note for future: for fixed model write h_params in config file
    checkpoint = torch.load(os.path.join(model_path, model),
                            map_location=torch.device("cpu"))
    checkpoint['hyper_parameters']["data"] = data_path
    checkpoint['hyper_parameters']["num_workers"] = 1
    col_idx_lat = checkpoint["hyper_parameters"]["v_par_eval"].index('sp_lat')
    col_idx_lon = checkpoint["hyper_parameters"]["v_par_eval"].index('sp_lon')
    args = namedtuple("ObjectName", checkpoint['hyper_parameters'].keys())\
        (*checkpoint['hyper_parameters'].values())
    return args, col_idx_lat, col_idx_lon

@task
def get_backbone(args, input_shapes):
    '''Instantiate the CyGNSSNet backbone selected by args.model ('cnn' or 'dense').'''
    if args.model == 'cnn':
        return ImageNet(args, input_shapes)
    if args.model == 'dense':
        return DenseNet(args, input_shapes)
    # BUGFIX: the original fell through and implicitly returned None for any
    # other model string, causing an opaque failure downstream; fail fast
    raise ValueError(f"Unknown model type: {args.model!r} (expected 'cnn' or 'dense')")

@task
def make_predictions(test_loader, model):
    '''Run the model over the test loader and return predictions as a 1D numpy array.'''
    trainer = pl.Trainer(enable_progress_bar=False)
    trainer.test(model=model, dataloaders=test_loader)
    y_pred = trainer.predict(model=model, dataloaders=[test_loader])
    y_pred = torch.cat(y_pred).detach().cpu().numpy().squeeze()
    return y_pred

@task
def rmse_bins(y_true, y_pred, y_bins):
    '''
    Compute RMSE, bias and sample counts per wind speed bin.

    Bins are the upper edges in y_bins; empty bins get NaN metrics and a
    count of 0. Returns a DataFrame with columns rmse, bias, bins, counts.
    '''
    # Find the indices for the windspeed bins - below 12 m/s, below 16 m/s, above 16 m/s
    y_ix = np.digitize(y_true, y_bins, right=False)

    all_rmse = np.zeros(len(y_bins))
    all_bias = np.zeros(len(y_bins))
    all_counts = np.zeros(len(y_bins))

    for i, yy in enumerate(y_bins):
        if np.any(y_ix == i):
            rmse = mean_squared_error(y_true[y_ix == i], y_pred[y_ix == i], squared=False)
            all_rmse[i] = rmse
            all_bias[i] = np.mean(y_pred[y_ix == i] - y_true[y_ix == i])
            all_counts[i] = np.sum(y_ix == i)
        else:
            all_rmse[i] = None
            all_bias[i] = None
            all_counts[i] = 0
    df_rmse = pd.DataFrame(dict(rmse=all_rmse, bias=all_bias, bins=y_bins, counts=all_counts))
    return df_rmse

@task
def rmse_over_time(y_bins, df_rmse):
    '''
    Append mock-up "long-running average" rows to today's per-bin metrics.

    NOTE(review): the long-running average is random mock data
    (np.random.rand), not a real historical aggregate.
    '''
    df_rmse["time"] = "today"

    df_mockup = pd.DataFrame(dict(bins=y_bins,
                                  rmse=df_rmse["rmse"] + np.random.rand(len(y_bins)) - 0.5,
                                  bias=df_rmse["bias"] + np.random.rand(len(y_bins)) - 0.5,
                                  counts=df_rmse["counts"] * 1000))
    df_mockup["time"] = "long-running average"

    df_mockup = pd.concat([df_rmse, df_mockup], ignore_index=True)
    return df_mockup

@task
def make_plots(y, y_pred, date_, df_mockup, df_rmse, y_bins):
    '''Generate and save all daily diagnostic plots to /app/plots.'''
    make_scatterplot(y, y_pred, date_)
    make_histogram(y, y_pred, date_)
    #era_average(y, sp_lon, sp_lat, date_)
    #rmse_average(y, y_pred, sp_lon, sp_lat, date_)
    today_longrunavg(df_mockup, y_bins, date_)
    today_longrunavg_bias(df_mockup, y_bins, date_)
    sample_counts(df_rmse, y_bins, date_)
    rmse_bins_era(df_rmse, y_bins, date_)
    bias_bins_era(df_rmse, y_bins, date_)

@task
def remove():
    '''Delete the day's downloaded/derived data directories to free disk space.'''
    shutil.rmtree("/app/raw_data", ignore_errors=False, onerror=None)
    shutil.rmtree("/app/annotated_raw_data", ignore_errors=False, onerror=None)
    shutil.rmtree("/app/dev_data", ignore_errors=False, onerror=None)
'/app/externals/gfz_cygnss/trained_models/' 223 | model = 'ygambdos_yykDM.ckpt' 224 | h5_file = h5py.File(os.path.join(data_path, 'test_data.h5'), 'r', rdcc_nbytes=0) 225 | 226 | mlflow.set_tracking_uri("sqlite:///mlruns.db") # TODO: change this to other db 227 | mlflow.set_experiment("cygnss") 228 | 229 | 230 | # get hyperparameters 231 | args, col_idx_lat, col_idx_lon = get_hyper_params.submit(model_path, model, data_path).result() 232 | 233 | cdm = CyGNSSDataModule(args) 234 | cdm.setup(stage='test') 235 | input_shapes = cdm.get_input_shapes(stage='test') 236 | backbone = get_backbone.submit(args, input_shapes).result() 237 | 238 | # load model 239 | cygnss_model = CyGNSSNet.load_from_checkpoint(os.path.join(model_path, model), 240 | map_location=torch.device('cpu'), 241 | args=args, 242 | backbone=backbone) 243 | cygnss_model.eval() 244 | 245 | test_loader = cdm.test_dataloader() 246 | # make predictions 247 | y_pred = make_predictions(test_loader, cygnss_model) 248 | 249 | # get true labels 250 | dataset = CyGNSSDataset('test', args) 251 | y = dataset.y 252 | 253 | # calculate rmse 254 | y_bins = [4, 8, 12, 16, 20, 100] 255 | df_rmse = rmse_bins.submit(y, y_pred, y_bins).result() 256 | df_mockup = rmse_over_time.submit(y_bins, df_rmse).result() 257 | with mlflow.start_run(): 258 | rmse = mean_squared_error(y, y_pred, squared=False) 259 | mlflow.log_metric('rmse', rmse) 260 | 261 | # make plots 262 | sp_lat = test_loader.dataset.v_par_eval[:, col_idx_lat] 263 | sp_lon = test_loader.dataset.v_par_eval[:, col_idx_lon] 264 | make_plots(y, y_pred, date_, df_mockup, df_rmse, y_bins) 265 | DOMAIN = 'mongodb' 266 | PORT = 27017 267 | 268 | # Save results to the mongo database 269 | save_to_db(domain=DOMAIN, port=PORT, y_pred=y_pred, \ 270 | rmse=rmse, date_=date_, rmse_time=df_rmse) 271 | 272 | # delete dowloaded and annotated files 273 | remove() 274 | 275 | if __name__ == "__main__": 276 | 277 | deployment = Deployment.build_from_flow( 278 | schedule = 
CronSchedule(cron='0 3 * * *', timezone='Europe/Berlin'), 279 | flow=main, 280 | name="cygnss", 281 | work_queue_name="demo" 282 | ) 283 | deployment.apply() 284 | # main() 285 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: cygnss-d 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=5.1=1_gnu 9 | - absl-py=1.2.0=pyhd8ed1ab_0 10 | - aiohttp=3.8.1=py39hb9d737c_1 11 | - aiosignal=1.2.0=pyhd8ed1ab_0 12 | - async-timeout=4.0.2=pyhd8ed1ab_0 13 | - attrs=22.1.0=pyh71513ae_1 14 | - blas=1.0=mkl 15 | - blinker=1.4=py_1 16 | - bottleneck=1.3.5=py39h7deecbd_0 17 | - brotli=1.0.9=h5eee18b_7 18 | - brotli-bin=1.0.9=h5eee18b_7 19 | - brotlipy=0.7.0=py39h27cfd23_1003 20 | - bzip2=1.0.8=h7b6447c_0 21 | - c-ares=1.18.1=h7f8727e_0 22 | - ca-certificates=2022.9.24=ha878542_0 23 | - cachetools=5.2.0=pyhd8ed1ab_0 24 | - cartopy=0.18.0=py39h0d9ca2b_1 25 | - certifi=2022.6.15.1=pyhd8ed1ab_0 26 | - cffi=1.15.1=py39h74dc2b5_0 27 | - cftime=1.5.1.1=py39hce1f21e_0 28 | - colorama=0.4.5=pyhd8ed1ab_0 29 | - cryptography=37.0.1=py39h9ce1e76_0 30 | - cudatoolkit=11.3.1=h2bc3f7f_2 31 | - curl=7.84.0=h5eee18b_0 32 | - cycler=0.11.0=pyhd3eb1b0_0 33 | - dbus=1.13.18=hb2f20db_0 34 | - expat=2.4.4=h295c915_0 35 | - ffmpeg=4.3=hf484d3e_0 36 | - fftw=3.3.9=h27cfd23_1 37 | - fontconfig=2.13.1=h6c09931_0 38 | - fonttools=4.25.0=pyhd3eb1b0_0 39 | - freetype=2.11.0=h70c0345_0 40 | - fsspec=2022.11.0=pyhd8ed1ab_0 41 | - future=0.18.2=py39h06a4308_1 42 | - geos=3.8.0=he6710b0_0 43 | - giflib=5.2.1=h7b6447c_0 44 | - glib=2.69.1=h4ff587b_1 45 | - gmp=6.2.1=h295c915_3 46 | - gnutls=3.6.15=he1e5248_0 47 | - google-auth=2.11.0=pyh6c4a22f_0 48 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 49 | - gst-plugins-base=1.14.0=h8213a91_2 50 | - gstreamer=1.14.0=h28cd5cc_2 51 | - h5py=3.7.0=py39h737f45e_0 52 
| - hdf4=4.2.13=h3ca952b_2 53 | - hdf5=1.10.6=h3ffc7dd_1 54 | - icu=58.2=he6710b0_3 55 | - idna=3.3=pyhd3eb1b0_0 56 | - intel-openmp=2021.4.0=h06a4308_3561 57 | - jpeg=9e=h7f8727e_0 58 | - kiwisolver=1.4.2=py39h295c915_0 59 | - krb5=1.19.2=hac12032_0 60 | - lame=3.100=h7b6447c_0 61 | - lcms2=2.12=h3be6417_0 62 | - ld_impl_linux-64=2.38=h1181459_1 63 | - lerc=3.0=h295c915_0 64 | - libbrotlicommon=1.0.9=h5eee18b_7 65 | - libbrotlidec=1.0.9=h5eee18b_7 66 | - libbrotlienc=1.0.9=h5eee18b_7 67 | - libclang=10.0.1=default_hb85057a_2 68 | - libcurl=7.84.0=h91b91d3_0 69 | - libdeflate=1.8=h7f8727e_5 70 | - libedit=3.1.20210910=h7f8727e_0 71 | - libev=4.33=h7f8727e_1 72 | - libevent=2.1.12=h8f2d780_0 73 | - libffi=3.3=he6710b0_2 74 | - libgcc-ng=11.2.0=h1234567_1 75 | - libgfortran-ng=11.2.0=h00389a5_1 76 | - libgfortran5=11.2.0=h1234567_1 77 | - libgomp=11.2.0=h1234567_1 78 | - libiconv=1.16=h7f8727e_2 79 | - libidn2=2.3.2=h7f8727e_0 80 | - libllvm10=10.0.1=hbcb73fb_5 81 | - libnetcdf=4.8.1=h42ceab0_1 82 | - libnghttp2=1.46.0=hce63b2e_0 83 | - libpng=1.6.37=hbc83047_0 84 | - libpq=12.9=h16c4e8d_3 85 | - libprotobuf=3.15.8=h780b84a_1 86 | - libssh2=1.10.0=h8f2d780_0 87 | - libstdcxx-ng=11.2.0=h1234567_1 88 | - libtasn1=4.16.0=h27cfd23_0 89 | - libtiff=4.4.0=hecacb30_0 90 | - libunistring=0.9.10=h27cfd23_0 91 | - libuuid=1.0.3=h7f8727e_2 92 | - libwebp=1.2.2=h55f646e_0 93 | - libwebp-base=1.2.2=h7f8727e_0 94 | - libxcb=1.15=h7f8727e_0 95 | - libxkbcommon=1.0.1=hfa300c1_0 96 | - libxml2=2.9.14=h74e7548_0 97 | - libxslt=1.1.35=h4e12654_0 98 | - libzip=1.8.0=h5cef20c_0 99 | - lz4-c=1.9.3=h295c915_1 100 | - markdown=3.4.1=pyhd8ed1ab_0 101 | - markupsafe=2.1.1=py39hb9d737c_1 102 | - matplotlib=3.5.2=py39h06a4308_0 103 | - matplotlib-base=3.5.2=py39hf590b9c_0 104 | - mkl=2021.4.0=h06a4308_640 105 | - mkl-service=2.4.0=py39h7f8727e_0 106 | - mkl_fft=1.3.1=py39hd3c417c_0 107 | - mkl_random=1.2.2=py39h51133e4_0 108 | - multidict=6.0.2=py39hb9d737c_1 109 | - munkres=1.1.4=py_0 110 | - 
ncurses=6.3=h5eee18b_3 111 | - netcdf4=1.5.7=py39ha0f2276_1 112 | - nettle=3.7.3=hbbd107a_1 113 | - ninja=1.10.2=h06a4308_5 114 | - ninja-base=1.10.2=hd09550d_5 115 | - nspr=4.33=h295c915_0 116 | - nss=3.74=h0370c37_0 117 | - numexpr=2.8.3=py39h807cd23_0 118 | - numpy=1.22.3=py39he7a7128_0 119 | - numpy-base=1.22.3=py39hf524024_0 120 | - oauthlib=3.2.1=pyhd8ed1ab_0 121 | - openh264=2.1.1=h4ff587b_0 122 | - openssl=1.1.1s=h7f8727e_0 123 | - packaging=21.3=pyhd3eb1b0_0 124 | - pandas=1.4.3=py39h6a678d5_0 125 | - pcre=8.45=h295c915_0 126 | - pillow=9.2.0=py39hace64e9_1 127 | - pip=22.1.2=py39h06a4308_0 128 | - ply=3.11=py39h06a4308_0 129 | - proj=6.2.1=hc80f0dc_0 130 | - pyasn1=0.4.8=py_0 131 | - pycparser=2.21=pyhd3eb1b0_0 132 | - pyjwt=2.4.0=pyhd8ed1ab_0 133 | - pyopenssl=22.0.0=pyhd3eb1b0_0 134 | - pyparsing=3.0.9=py39h06a4308_0 135 | - pyqt=5.15.7=py39h6a678d5_1 136 | - pyqt5-sip=12.11.0=py39h6a678d5_1 137 | - pyshp=2.3.1=pyhd8ed1ab_0 138 | - pysocks=1.7.1=py39h06a4308_0 139 | - python=3.9.13=haa1d7c7_1 140 | - python-dateutil=2.8.2=pyhd3eb1b0_0 141 | - python_abi=3.9=2_cp39 142 | - pytorch=1.12.1=py3.9_cuda11.3_cudnn8.3.2_0 143 | - pytorch-mutex=1.0=cuda 144 | - pytz=2022.1=py39h06a4308_0 145 | - pyu2f=0.1.5=pyhd8ed1ab_0 146 | - pyyaml=6.0=py39hb9d737c_4 147 | - qt-main=5.15.2=h327a75a_7 148 | - qt-webengine=5.15.9=hd2b0992_4 149 | - qtwebkit=5.212=h4eab89a_4 150 | - readline=8.1.2=h7f8727e_1 151 | - requests=2.28.1=py39h06a4308_0 152 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0 153 | - rsa=4.9=pyhd8ed1ab_0 154 | - scipy=1.7.3=py39h6c91a56_2 155 | - seaborn=0.11.2=pyhd3eb1b0_0 156 | - shapely=1.8.4=py39h81ba7c5_0 157 | - sip=6.6.2=py39h6a678d5_0 158 | - six=1.16.0=pyhd3eb1b0_1 159 | - sqlite=3.39.2=h5082296_0 160 | - tensorboard=2.10.0=pyhd8ed1ab_2 161 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 162 | - tk=8.6.12=h1ccaba5_0 163 | - toml=0.10.2=pyhd3eb1b0_0 164 | - torchaudio=0.12.1=py39_cu113 165 | - torchmetrics=0.9.3=pyhd8ed1ab_0 166 | - 
torchvision=0.13.1=py39_cu113 167 | - tornado=6.1=py39hb9d737c_3 168 | - tqdm=4.64.1=pyhd8ed1ab_0 169 | - typing-extensions=4.3.0=py39h06a4308_0 170 | - typing_extensions=4.3.0=py39h06a4308_0 171 | - werkzeug=2.2.2=pyhd8ed1ab_0 172 | - wheel=0.37.1=pyhd3eb1b0_0 173 | - xarray=0.20.1=pyhd3eb1b0_1 174 | - xz=5.2.5=h7f8727e_1 175 | - yaml=0.2.5=h7f98852_2 176 | - zipp=3.8.1=pyhd8ed1ab_0 177 | - zlib=1.2.12=h5eee18b_3 178 | - zstd=1.5.2=ha4553b6_0 179 | - pip: 180 | - aiobotocore==2.4.0 181 | - aiofiles==22.1.0 182 | - aiohttp-cors==0.7.0 183 | - aioitertools==0.11.0 184 | - aiosqlite==0.17.0 185 | - alembic==1.8.1 186 | - altair==4.2.0 187 | - anyio==3.6.1 188 | - appdirs==1.4.4 189 | - argon2-cffi==21.3.0 190 | - argon2-cffi-bindings==21.2.0 191 | - asgi-lifespan==1.0.1 192 | - astor==0.8.1 193 | - asttokens==2.0.8 194 | - asyncpg==0.26.0 195 | - azure-core==1.25.1 196 | - azure-storage-blob==12.13.1 197 | - backcall==0.2.0 198 | - beautifulsoup4==4.11.1 199 | - bleach==5.0.1 200 | - blessed==1.19.1 201 | - boto3==1.24.75 202 | - botocore==1.27.59 203 | - cdsapi==0.5.1 204 | - charset-normalizer==2.1.1 205 | - click==8.0.4 206 | - cloudpickle==2.2.0 207 | - colorful==0.5.4 208 | - commonmark==0.9.1 209 | - contextlib2==21.6.0 210 | - coolname==1.1.0 211 | - croniter==1.3.7 212 | - dask==2022.9.1 213 | - databricks-cli==0.17.3 214 | - debugpy==1.6.3 215 | - decorator==5.1.1 216 | - defusedxml==0.7.1 217 | - distlib==0.3.6 218 | - distributed==2022.9.1 219 | - docker==5.0.3 220 | - entrypoints==0.4 221 | - executing==1.0.0 222 | - fastapi==0.85.0 223 | - fastjsonschema==2.16.1 224 | - filelock==3.8.0 225 | - flask==2.2.2 226 | - frozenlist==1.3.1 227 | - gitdb==4.0.9 228 | - gitpython==3.1.27 229 | - global-land-mask==1.0.0 230 | - google-api-core==2.10.1 231 | - google-cloud-core==2.3.2 232 | - google-cloud-storage==2.5.0 233 | - google-crc32c==1.5.0 234 | - google-resumable-media==2.3.3 235 | - googleapis-common-protos==1.56.4 236 | - gpustat==1.0.0 237 | - 
greenlet==1.1.3 238 | - griffe==0.21.0 239 | - grpcio==1.43.0 240 | - gunicorn==20.1.0 241 | - h11==0.12.0 242 | - heapdict==1.0.1 243 | - httpcore==0.15.0 244 | - httpx==0.23.0 245 | - hyperopt==0.1.2 246 | - importlib-metadata==4.12.0 247 | - intake==0.6.6 248 | - ipykernel==6.15.2 249 | - ipython==8.5.0 250 | - ipython-genutils==0.2.0 251 | - ipywidgets==8.0.2 252 | - isodate==0.6.1 253 | - itsdangerous==2.1.2 254 | - jedi==0.18.1 255 | - jinja2==3.1.2 256 | - jmespath==1.0.1 257 | - joblib==1.2.0 258 | - json-tricks==3.15.5 259 | - jsonpatch==1.32 260 | - jsonpointer==2.3 261 | - jsonschema==4.16.0 262 | - jupyter==1.0.0 263 | - jupyter-client==7.3.5 264 | - jupyter-console==6.4.4 265 | - jupyter-core==4.11.1 266 | - jupyterlab-pygments==0.2.2 267 | - jupyterlab-widgets==3.0.3 268 | - kubernetes==24.2.0 269 | - llvmlite==0.39.1 270 | - locket==1.0.0 271 | - lxml==4.9.1 272 | - mako==1.2.2 273 | - matplotlib-inline==0.1.6 274 | - mistune==2.0.4 275 | - mlflow==2.0.1 276 | - msgpack==1.0.4 277 | - msrest==0.7.1 278 | - nbclient==0.6.8 279 | - nbconvert==7.0.0 280 | - nbformat==5.4.0 281 | - nest-asyncio==1.5.5 282 | - networkx==2.8.6 283 | - nexusformat==0.7.7 284 | - nni==2.9 285 | - notebook==6.4.12 286 | - numba==0.56.4 287 | - nvidia-ml-py==11.495.46 288 | - opencensus==0.11.0 289 | - opencensus-context==0.1.3 290 | - orjson==3.8.0 291 | - pandocfilters==1.5.0 292 | - parso==0.8.3 293 | - partd==1.3.0 294 | - pathspec==0.10.1 295 | - pendulum==2.1.2 296 | - pexpect==4.8.0 297 | - pickleshare==0.7.5 298 | - platformdirs==2.5.2 299 | - podaac-data-subscriber==1.12.0 300 | - pooch==1.6.0 301 | - prefect==2.4.0 302 | - prettytable==3.4.1 303 | - prometheus-client==0.13.1 304 | - prometheus-flask-exporter==0.20.3 305 | - prompt-toolkit==3.0.31 306 | - protobuf==3.20.2 307 | - psutil==5.9.2 308 | - ptyprocess==0.7.0 309 | - pure-eval==0.2.2 310 | - py-spy==0.3.14 311 | - pyarrow==9.0.0 312 | - pyasn1-modules==0.2.8 313 | - pydantic==1.10.2 314 | - pydeck==0.8.0b3 
315 | - pydeprecate==0.3.1 316 | - pygments==2.13.0 317 | - pymongo==4.2.0 318 | - pympler==1.0.1 319 | - pyrsistent==0.18.1 320 | - python-slugify==6.1.2 321 | - pythonwebhdfs==0.2.3 322 | - pytorch-lightning==1.5.10 323 | - pytz-deprecation-shim==0.1.0.post0 324 | - pytzdata==2020.1 325 | - pyzmq==23.2.1 326 | - qtconsole==5.3.2 327 | - qtpy==2.2.0 328 | - querystring-parser==1.2.4 329 | - ray==2.0.0 330 | - readchar==4.0.3 331 | - responses==0.21.0 332 | - rfc3986==1.5.0 333 | - rich==12.5.1 334 | - s3fs==2022.11.0 335 | - s3transfer==0.6.0 336 | - schema==0.7.5 337 | - scikit-learn==1.1.2 338 | - semver==2.13.0 339 | - send2trash==1.8.0 340 | - setuptools==59.5.0 341 | - shap==0.41.0 342 | - simplejson==3.17.6 343 | - sklearn==0.0 344 | - slack-sdk==3.18.3 345 | - slicer==0.0.7 346 | - smart-open==6.2.0 347 | - smmap==5.0.0 348 | - sniffio==1.3.0 349 | - sortedcontainers==2.4.0 350 | - soupsieve==2.3.2.post1 351 | - sqlalchemy==1.4.41 352 | - sqlparse==0.4.2 353 | - stack-data==0.5.0 354 | - starlette==0.20.4 355 | - streamlit==1.12.2 356 | - tabulate==0.8.10 357 | - tblib==1.7.0 358 | - tenacity==8.0.1 359 | - tensorboard-data-server==0.6.1 360 | - terminado==0.15.0 361 | - text-unidecode==1.3 362 | - threadpoolctl==3.1.0 363 | - tinycss2==1.1.1 364 | - toolz==0.12.0 365 | - traitlets==5.4.0 366 | - typeguard==2.13.3 367 | - typer==0.6.1 368 | - tzdata==2022.2 369 | - tzlocal==4.2 370 | - urllib3==1.26.12 371 | - uvicorn==0.18.3 372 | - validators==0.20.0 373 | - virtualenv==20.16.5 374 | - watchdog==2.1.9 375 | - wcwidth==0.2.5 376 | - webencodings==0.5.1 377 | - websocket-client==1.4.1 378 | - websockets==10.3 379 | - widgetsnbextension==4.0.3 380 | - wrapt==1.14.1 381 | - yarl==1.8.1 382 | - zict==2.2.0 383 | prefix: /home/harsh/anaconda3/envs/cygnss-d 384 | -------------------------------------------------------------------------------- /notebooks/Preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 
| "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "261e2e39-ae3e-4b92-8dc5-b163f61eea25", 6 | "metadata": {}, 7 | "source": [ 8 | "# Preprocessing CyGNSS data\n", 9 | "\n", 10 | "Data is downloaded from NASA EarthCloud as described in the `APIs` notebook. For the expected format for CyGNSSnet, additional preprocessing steps are necessary." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 10, 16 | "id": "084a2e3e-9f9f-4844-9e28-c60e30314494", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import os\n", 21 | "import sys\n", 22 | "sys.path.append('../externals/gfz_cygnss/')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 11, 28 | "id": "06128178", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# !pip install tenacity" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 12, 38 | "id": "c0bbb084-5e0b-41a9-a337-684f832d6f85", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "ename": "TypeError", 43 | "evalue": " is not a generic class", 44 | "output_type": "error", 45 | "traceback": [ 46 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 47 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 48 | "Input \u001b[0;32mIn [12]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgfz_202003\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocessing\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m preprocess \u001b[38;5;28;01mas\u001b[39;00m prep\n", 49 | "File \u001b[0;32m~/Downloads/DKRZ/MLOps/2020-03-gfz-remote-sensing/gfz_202003/preprocessing/preprocess.py:9\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrandom\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m 
\u001b[38;5;21;01margparse\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mxr\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdatetime\u001b[39;00m\n", 50 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/__init__.py:1\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m testing, tutorial\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbackends\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 3\u001b[0m load_dataarray,\n\u001b[1;32m 4\u001b[0m load_dataset,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 8\u001b[0m save_mfdataset,\n\u001b[1;32m 9\u001b[0m )\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbackends\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrasterio_\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m open_rasterio\n", 51 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/testing.py:9\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m 
\u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m duck_array_ops, formatting, utils\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataarray\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataArray\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n", 52 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/core/duck_array_ops.py:26\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m take, tensordot, transpose, unravel_index \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m where \u001b[38;5;28;01mas\u001b[39;00m _where\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dask_array_compat, dask_array_ops, dtypes, npcompat, nputils\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnputils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m nanfirst, nanlast\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpycompat\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m cupy_array_type, dask_array_type, is_duck_dask_array\n", 53 | "File 
\u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/core/npcompat.py:72\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtyping\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_dtype_like\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _DTypeLikeNested, _ShapeLike, _SupportsDType\n\u001b[1;32m 51\u001b[0m \u001b[38;5;66;03m# Xarray requires a Mapping[Hashable, dtype] in many places which\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;66;03m# conflics with numpys own DTypeLike (with dtypes for fields).\u001b[39;00m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# https://numpy.org/devdocs/reference/typing.html#numpy.typing.DTypeLike\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;66;03m# This is a copy of this DTypeLike that allows only non-Mapping dtypes.\u001b[39;00m\n\u001b[1;32m 55\u001b[0m DTypeLikeSave \u001b[38;5;241m=\u001b[39m Union[\n\u001b[1;32m 56\u001b[0m np\u001b[38;5;241m.\u001b[39mdtype,\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# default data type (float64)\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 59\u001b[0m \u001b[38;5;66;03m# array-scalar types and generic types\u001b[39;00m\n\u001b[1;32m 60\u001b[0m Type[Any],\n\u001b[1;32m 61\u001b[0m \u001b[38;5;66;03m# character codes, type strings or comma-separated fields, e.g., 'float64'\u001b[39;00m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m 63\u001b[0m \u001b[38;5;66;03m# (flexible_dtype, itemsize)\u001b[39;00m\n\u001b[1;32m 64\u001b[0m Tuple[_DTypeLikeNested, \u001b[38;5;28mint\u001b[39m],\n\u001b[1;32m 65\u001b[0m \u001b[38;5;66;03m# (fixed_dtype, shape)\u001b[39;00m\n\u001b[1;32m 66\u001b[0m Tuple[_DTypeLikeNested, _ShapeLike],\n\u001b[1;32m 67\u001b[0m \u001b[38;5;66;03m# (base_dtype, new_dtype)\u001b[39;00m\n\u001b[1;32m 68\u001b[0m 
Tuple[_DTypeLikeNested, _DTypeLikeNested],\n\u001b[1;32m 69\u001b[0m \u001b[38;5;66;03m# because numpy does the same?\u001b[39;00m\n\u001b[1;32m 70\u001b[0m List[Any],\n\u001b[1;32m 71\u001b[0m \u001b[38;5;66;03m# anything with a dtype attribute\u001b[39;00m\n\u001b[0;32m---> 72\u001b[0m \u001b[43m_SupportsDType\u001b[49m\u001b[43m[\u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m]\u001b[49m,\n\u001b[1;32m 73\u001b[0m ]\n\u001b[1;32m 74\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[1;32m 75\u001b[0m \u001b[38;5;66;03m# fall back for numpy < 1.20, ArrayLike adapted from numpy.typing._array_like\u001b[39;00m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Protocol\n", 54 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/typing.py:261\u001b[0m, in \u001b[0;36m_tp_cache..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 259\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m \u001b[38;5;66;03m# All real errors (not unhashable args) are raised below.\u001b[39;00m\n\u001b[0;32m--> 261\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", 55 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/typing.py:897\u001b[0m, in \u001b[0;36mGeneric.__class_getitem__\u001b[0;34m(cls, params)\u001b[0m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 894\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mParameters to 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m[...] must all be unique\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 895\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 896\u001b[0m \u001b[38;5;66;03m# Subscripting a regular Generic subclass.\u001b[39;00m\n\u001b[0;32m--> 897\u001b[0m \u001b[43m_check_generic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 898\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _GenericAlias(\u001b[38;5;28mcls\u001b[39m, params)\n", 56 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/typing_extensions.py:95\u001b[0m, in \u001b[0;36m_check_generic\u001b[0;34m(cls, parameters, elen)\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m elen \u001b[38;5;129;01mis\u001b[39;00m _marker:\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mcls\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__parameters__\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__parameters__:\n\u001b[0;32m---> 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m is not a generic class\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 96\u001b[0m elen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__parameters__)\n\u001b[1;32m 97\u001b[0m alen \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mlen\u001b[39m(parameters)\n", 57 | "\u001b[0;31mTypeError\u001b[0m: is not a generic class" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "from gfz_202003.preprocessing import preprocess as prep" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "39eac3de-096b-4b73-8491-232d3e0667b0", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "import numpy as np\n", 73 | "import h5py\n", 74 | "from matplotlib import pyplot as plt\n", 75 | "import seaborn as sns\n", 76 | "\n", 77 | "import datetime\n", 78 | "import xarray as xr\n", 79 | "\n", 80 | "import argparse" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "id": "3cad0fa3-0ba5-4b35-ba86-29f7bee68e71", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "import cdsapi" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "7c8c7e2a-ea30-499e-a259-73aae365be5d", 96 | "metadata": {}, 97 | "source": [ 98 | "## Download raw CyGNSS data\n", 99 | "\n", 100 | "The CyGNSSnet preprocessing routine expects the raw data files ordered as \n", 101 | "\n", 102 | "> `$raw_data_dir///cyg*.nc`\n", 103 | "\n", 104 | "Data is always downloaded for one full day for all spacecraft, generating 8 `netcdf` files per day of observations. Below is a routine to specify a date range, followed by downloading the corresponding data and storing it in the appropriate subfolders." 
105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 6, 110 | "id": "efcfbe84-843d-4550-b22f-fbfaad434694", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "raw_data_root = '/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/raw_data'\n", 115 | "dev_data_root = '/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/dev_data'" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "2fdc36c1-d1e1-4bab-8a04-a91f8759637f", 121 | "metadata": {}, 122 | "source": [ 123 | "Select a test day and prepared the input parameters for the provided download script" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 7, 129 | "id": "f47387cf-999d-44a5-9c15-8cf2c7886e07", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "year = 2021\n", 134 | "month = 3\n", 135 | "day = 17" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "id": "871eddb6-6022-4273-8dba-93c911f78598", 141 | "metadata": {}, 142 | "source": [ 143 | "Downloaded target directory in the expected format `year/day-of-year`" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 8, 149 | "id": "84e34efb-f067-4964-89cb-f6ccb556e681", 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/raw_data/2021/168\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "raw_data_sub = datetime.datetime.strptime(f\"{year}-{month}-{day}\", \"%Y-%m-%d\").strftime(\"%Y/%j\")\n", 162 | "\n", 163 | "raw_data_dir = os.path.join(raw_data_root, raw_data_sub)\n", 164 | "\n", 165 | "print(raw_data_dir)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "id": "cad54be9-afc6-43b9-a841-2bfefddc81f5", 171 | "metadata": {}, 172 | "source": [ 173 | "Start and end date of download range in the required format. 
The end date is midnight of the next day; this way only
https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg06.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n", 224 | "[2022-09-12 16:01:48,483] {podaac_access.py:446} WARNING - Computed checksum 9b3100d23550d03cb85056609ecddd5b does not match expected checksum a8851840f3a4bbdc8499ea2f17d5119b\n", 225 | "[2022-09-12 16:02:39,804] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:02:39.804552 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg08.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n", 226 | "[2022-09-12 16:02:41,684] {podaac_access.py:446} WARNING - Computed checksum fdaaa0486c6932b1a62c087edaecd64f does not match expected checksum a08d25babf87b328b96a850bfacbcc53\n", 227 | "[2022-09-12 16:03:31,252] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:03:31.252143 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg02.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n", 228 | "[2022-09-12 16:03:33,101] {podaac_access.py:446} WARNING - Computed checksum 881d6ad8374fea406dc72b27775e124f does not match expected checksum 7eef541250b6f137d8ace0e99e12eaf2\n", 229 | "[2022-09-12 16:04:15,389] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:04:15.389899 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg03.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n", 230 | "[2022-09-12 16:04:17,154] {podaac_access.py:446} WARNING - Computed checksum cf78c6b618423cf8410b43eeddfb5c63 does not match expected checksum 25dd31a5b59b5444a509ead3a359a8a5\n", 231 | "[2022-09-12 16:05:04,669] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:05:04.669819 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg04.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n", 232 | "[2022-09-12 
16:05:06,367] {podaac_access.py:446} WARNING - Computed checksum 3dc2ce38484b3438c18d5491d6a68984 does not match expected checksum e7ae44462212498cab741a6dbd4624e8\n", 233 | "[2022-09-12 16:06:03,144] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:06:03.144241 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg07.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n", 234 | "[2022-09-12 16:06:04,762] {podaac_access.py:446} WARNING - Computed checksum 8cc2e314df20dec61110dc4290da3cc1 does not match expected checksum 32fddfe78b55e4ee302cf37fa7d0bf9b\n", 235 | "[2022-09-12 16:07:04,082] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:07:04.082807 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg01.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n", 236 | "[2022-09-12 16:07:04,082] {podaac_data_downloader.py:287} INFO - Downloaded Files: 7\n", 237 | "[2022-09-12 16:07:04,082] {podaac_data_downloader.py:288} INFO - Failed Files: 0\n", 238 | "[2022-09-12 16:07:04,083] {podaac_data_downloader.py:289} INFO - Skipped Files: 0\n", 239 | "[2022-09-12 16:07:05,046] {podaac_access.py:122} INFO - CMR token successfully deleted\n", 240 | "[2022-09-12 16:07:05,047] {podaac_data_downloader.py:299} INFO - END\n", 241 | "\n", 242 | "\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "%env PYTHONPATH=/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/data-subscriber\n", 248 | "!python /home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/data-subscriber/subscriber/podaac_data_downloader.py -c CYGNSS_L1_V3.1 -d $raw_data_dir --start-date $start_date --end-date $end_date" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "id": "520bceeb-7e10-4802-96c7-96995aa933e2", 254 | "metadata": {}, 255 | "source": [ 256 | "## Download raw ERA5 data\n", 257 | "\n", 258 | "The preprocessing pipeline requires the ERA5 windspeed labels. 
Download the raw ERA5 data for the same timespan." 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 1, 264 | "id": "d2511baa-ade0-43bf-8e9b-953251c164fe", 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "ename": "NameError", 269 | "evalue": "name 'os' is not defined", 270 | "output_type": "error", 271 | "traceback": [ 272 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 273 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 274 | "Input \u001b[0;32mIn [1]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m era5_data \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(raw_data_dir, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mERA5_windspeed.nc\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", 275 | "\u001b[0;31mNameError\u001b[0m: name 'os' is not defined" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "era5_data = os.path.join(raw_data_dir, 'ERA5_windspeed.nc')" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "id": "b8e06265-ffa4-4b9c-a5de-b8c9151a9387", 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "cds = cdsapi.Client()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 2, 296 | "id": "3c9e2d27-609c-454b-8131-10427c89ab9d", 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "ename": "NameError", 301 | "evalue": "name 'cds' is not defined", 302 | "output_type": "error", 303 | "traceback": [ 304 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 305 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 306 | "Input \u001b[0;32mIn [2]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcds\u001b[49m\u001b[38;5;241m.\u001b[39mretrieve(\n\u001b[1;32m 2\u001b[0m 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mreanalysis-era5-single-levels\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 3\u001b[0m {\n\u001b[1;32m 4\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mproduct_type\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mreanalysis\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mformat\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnetcdf\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 6\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mvariable\u001b[39m\u001b[38;5;124m'\u001b[39m: [\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10m_u_component_of_wind\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10m_v_component_of_wind\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 8\u001b[0m ],\n\u001b[1;32m 9\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124myear\u001b[39m\u001b[38;5;124m'\u001b[39m: year,\n\u001b[1;32m 10\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmonth\u001b[39m\u001b[38;5;124m'\u001b[39m: month,\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mday\u001b[39m\u001b[38;5;124m'\u001b[39m: day,\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtime\u001b[39m\u001b[38;5;124m'\u001b[39m: [\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m00:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m01:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m02:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m03:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m04:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m05:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 15\u001b[0m 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m06:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m07:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m08:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 16\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m09:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m11:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 17\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m12:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m13:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m14:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 18\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m15:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m16:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m17:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 19\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m18:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m19:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m20:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 20\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m21:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m22:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m23:00\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 21\u001b[0m ],\n\u001b[1;32m 22\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124marea\u001b[39m\u001b[38;5;124m'\u001b[39m: [\n\u001b[1;32m 23\u001b[0m \u001b[38;5;241m40\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m180\u001b[39m, 
\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m40\u001b[39m, \u001b[38;5;241m180\u001b[39m,\n\u001b[1;32m 24\u001b[0m ],\n\u001b[1;32m 25\u001b[0m },\n\u001b[1;32m 26\u001b[0m era5_data)\n", 307 | "\u001b[0;31mNameError\u001b[0m: name 'cds' is not defined" 308 | ] 309 | } 310 | ], 311 | "source": [ 312 | "cds.retrieve(\n", 313 | " 'reanalysis-era5-single-levels',\n", 314 | " {\n", 315 | " 'product_type': 'reanalysis',\n", 316 | " 'format': 'netcdf',\n", 317 | " 'variable': [\n", 318 | " '10m_u_component_of_wind', '10m_v_component_of_wind',\n", 319 | " ],\n", 320 | " 'year': year,\n", 321 | " 'month': month,\n", 322 | " 'day': day,\n", 323 | " 'time': [\n", 324 | " '00:00', '01:00', '02:00',\n", 325 | " '03:00', '04:00', '05:00',\n", 326 | " '06:00', '07:00', '08:00',\n", 327 | " '09:00', '10:00', '11:00',\n", 328 | " '12:00', '13:00', '14:00',\n", 329 | " '15:00', '16:00', '17:00',\n", 330 | " '18:00', '19:00', '20:00',\n", 331 | " '21:00', '22:00', '23:00'\n", 332 | " ],\n", 333 | " 'area': [\n", 334 | " 40, -180, -40, 180,\n", 335 | " ],\n", 336 | " },\n", 337 | " era5_data)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 3, 343 | "id": "4df67e88-6fd4-48f9-8a2c-921d51fe1c13", 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "ename": "NameError", 348 | "evalue": "name 'xr' is not defined", 349 | "output_type": "error", 350 | "traceback": [ 351 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 352 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 353 | "Input \u001b[0;32mIn [3]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m era5_ds \u001b[38;5;241m=\u001b[39m \u001b[43mxr\u001b[49m\u001b[38;5;241m.\u001b[39mopen_dataset(era5_data)\n\u001b[1;32m 2\u001b[0m era5_ds\n", 354 | "\u001b[0;31mNameError\u001b[0m: name 'xr' is not defined" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "era5_ds = xr.open_dataset(era5_data)\n", 360 | 
"era5_ds" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": "c958a80b-5845-4a98-8372-1fdc03954a00", 366 | "metadata": {}, 367 | "source": [ 368 | "## Annotate raw CyGNSS data with windspeed labels" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "id": "ac2b8784-afcd-48ce-9e09-a6b245ae6132", 374 | "metadata": {}, 375 | "source": [ 376 | "We need to create the data variables `ERA5_u10` and `ERA5_v10` and attach them to the CyGNSS raw data." 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 22, 382 | "id": "ca049baa-b554-40f1-9269-27469c614a76", 383 | "metadata": {}, 384 | "outputs": [ 385 | { 386 | "data": { 387 | "text/plain": [ 388 | "['cyg07.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n", 389 | " 'ERA5_windspeed.nc',\n", 390 | " 'cyg02.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n", 391 | " 'cyg04.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n", 392 | " 'cyg01.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n", 393 | " 'cyg06.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n", 394 | " 'CYGNSS_L1_V3.1.citation.txt',\n", 395 | " 'cyg03.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n", 396 | " 'cyg08.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc']" 397 | ] 398 | }, 399 | "execution_count": 22, 400 | "metadata": {}, 401 | "output_type": "execute_result" 402 | } 403 | ], 404 | "source": [ 405 | "os.listdir(raw_data_dir)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "id": "50a4f34c-1a20-4580-9721-6838de1626a7", 411 | "metadata": {}, 412 | "source": [ 413 | "Check units for spacetime coordinates\n", 414 | "* Longitude\n", 415 | " * ERA5: -180 ... 0 ... +180\n", 416 | " * CyGNSS: 0 ... 180 ... 360\n", 417 | "* Latitude\n", 418 | " * ERA5 & CyGNSS: -40 ... 0 ... 
+40\n", 419 | "* Timestamp\n", 420 | "\n", 421 | "\n", 422 | "--> Need to shift the ERA5 longitude coordinate by 180" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "id": "96ecbfa9-4a93-4821-b65c-05fc8697f8d5", 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "def annotate_dataset(cygnss_file, era5_file, save_dataset=False):\n", 433 | " '''\n", 434 | " Annotate a given CyGNSS dataset with ERA5 windspeed labels and save to disk\n", 435 | " \n", 436 | " Parameters:\n", 437 | " cygnss_file : path to CyGNSS dataset\n", 438 | " era5_file : path to corresponding ERA5 dataset\n", 439 | " save_dataset : if True, save dataset to disk overwriting cygnss_file (default: False)\n", 440 | " \n", 441 | " Returns:\n", 442 | " Annotated CyGNSS dataset\n", 443 | " '''\n", 444 | " \n", 445 | " # necessary because lazy loading prohibits overwriting the netcdf files at the end of this section\n", 446 | " with xr.open_dataset(cygnss_file) as data:\n", 447 | " cygnss_ds = data.load()\n", 448 | " \n", 449 | " with xr.open_dataset(era5_file) as data:\n", 450 | " era5_ds = data.load()\n", 451 | " \n", 452 | " # needs to be shifted by 180 for compatibility with CyGNSS\n", 453 | " era5_ds = era5_ds.assign_coords(longitude=era5_ds.coords['longitude'] + 180)\n", 454 | " \n", 455 | " interp_ds = era5_ds.interp(longitude=cygnss_ds.sp_lon, latitude=cygnss_ds.sp_lat, time=cygnss_ds.ddm_timestamp_utc)\n", 456 | " \n", 457 | " cygnss_ds['ERA5_u10'] = interp_ds['u10']\n", 458 | " cygnss_ds['ERA5_v10'] = interp_ds['v10']\n", 459 | "\n", 460 | " tmp_attrs = cygnss_ds['ERA5_u10'].attrs\n", 461 | " tmp_attrs['long_name'] = cygnss_ds['ERA5_u10'].long_name + ' (interpolated)'\n", 462 | " cygnss_ds['ERA5_u10'].attrs = tmp_attrs\n", 463 | "\n", 464 | " tmp_attrs = cygnss_ds['ERA5_v10'].attrs\n", 465 | " tmp_attrs['long_name'] = cygnss_ds['ERA5_v10'].long_name + ' (interpolated)'\n", 466 | " cygnss_ds['ERA5_v10'].attrs = tmp_attrs\n", 467 | " \n", 468 
| " cygnss_ds = cygnss_ds.drop_vars(['longitude', 'latitude', 'time'])\n", 469 | " \n", 470 | " # dummy values only for preprocessing routine\n", 471 | " cygnss_ds['GPM_precipitation'] = -9999\n", 472 | " cygnss_ds['ERA5_mdts'] = -9999\n", 473 | " cygnss_ds['ERA5_mdww'] = -9999\n", 474 | " cygnss_ds['ERA5_swh'] = -9999\n", 475 | " cygnss_ds['ERA5_shts'] = -9999\n", 476 | " cygnss_ds['ERA5_shww'] = -9999\n", 477 | " cygnss_ds['ERA5_p140121'] = -9999\n", 478 | " cygnss_ds['ERA5_p140124'] = -9999\n", 479 | " cygnss_ds['ERA5_p140127'] = -9999\n", 480 | " \n", 481 | " if save_dataset:\n", 482 | " cygnss_ds.to_netcdf(cygnss_file)\n", 483 | " \n", 484 | " return cygnss_ds" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "id": "3e15d7b1-9d6d-4e1a-9ba6-ea28b9943f28", 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "for cygnss_file in os.listdir(raw_data_dir):\n", 495 | " if cygnss_file.startswith('cyg') and cygnss_file.endswith('.nc'):\n", 496 | " print(cygnss_file)\n", 497 | " annotate_dataset(os.path.join(raw_data_dir, cygnss_file), era5_data, save_dataset=True)" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "id": "41fa8522-6306-45f9-ad34-609a30995765", 503 | "metadata": {}, 504 | "source": [ 505 | "## Check raw data" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "id": "d099d73c-53e5-4939-bf93-99105225a0e7", 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "from importlib import reload\n", 516 | "reload(prep)\n", 517 | "raw_ds = prep.open_mfdataset(os.path.join(raw_data_dir, cygnss_file))\n", 518 | "\n", 519 | "raw_ds" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "id": "b4b7fbce-05e4-418f-bf29-a294846947d0", 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "filtered_ds = prep.apply_quality_filter(raw_ds, is_ml_ops=True)\n", 530 | "filtered_ds" 531 | ] 532 | }, 533 | { 534 | 
"cell_type": "code", 535 | "execution_count": null, 536 | "id": "66d21d51-a157-4823-98bc-9160b19627a8", 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "os.listdir('/work/ka1176/shared_data/2020-03/raw_data/2021/014/')" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "id": "c05f455c-a047-49ac-8169-6eee7f0ee38e", 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "bu = raw_ds['ddm_brcs_uncert']\n", 551 | "qf = raw_ds['quality_flags']\n", 552 | "st = raw_ds['nst_att_status']\n", 553 | "fom = raw_ds['prn_fig_of_merit']\n", 554 | "les = raw_ds['ddm_les']\n", 555 | "rxg = raw_ds['sp_rx_gain']\n", 556 | "nsca = raw_ds['nbrcs_scatter_area']\n", 557 | "lsca = raw_ds['les_scatter_area']\n", 558 | "lat = raw_ds['sp_lat']\n", 559 | "lon = raw_ds['sp_lon']\n", 560 | "ws = raw_ds['windspeed']" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "id": "1cb9d80b-5400-4a2c-878b-624fe05040b9", 566 | "metadata": {}, 567 | "source": [ 568 | "For now, use only the quality flag == 4" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": null, 574 | "id": "289ceac1-d478-4276-868b-9f246c8222c2", 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "quality = (bu<1) & (qf == 4) & (st == 0) & (fom > 3) & (rxg > 0) & (les >= 0)" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "id": "8f472924-fcb5-4639-8609-cda8b4fb4a51", 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "np.sum((bu<1) & (st==0)).compute()" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "id": "fb390261-935f-4573-87f8-50b926f873a0", 594 | "metadata": {}, 595 | "source": [ 596 | "## Created processed data" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "id": "5e078c5c-068a-43e5-b1dc-23806c0226e5", 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "raw_ds = 
prep.open_mfdataset(os.path.join(raw_data_dir, 'cyg06*.nc'), channels=[0,1,2,3])" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "id": "1bbd55a2-ac14-4dbb-8aa5-9c64c64ebbd1", 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [ 616 | "dev_data_dir = '/work/ka1176/shared_data/2022-cygnss-deployment/dev_data/'" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "id": "3fef8930-282e-4751-8678-18db21ee13f9", 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "for ff in os.listdir('/work/ka1176/shared_data/2022-cygnss-deployment/raw_data/2021/168/'):\n", 627 | " tmp = xr.open_dataset(os.path.join('/work/ka1176/shared_data/2022-cygnss-deployment/raw_data/2021/168/', ff))\n", 628 | " if not 'ERA5_u10' in tmp.keys():\n", 629 | " print(ff)" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "id": "93ebd942-72d9-4344-8496-329f8a9c73c9", 636 | "metadata": {}, 637 | "outputs": [], 638 | "source": [ 639 | "tmp" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "id": "c24d957d-2db5-4fd4-a88d-e18a53457384", 646 | "metadata": {}, 647 | "outputs": [], 648 | "source": [ 649 | "reload(prep)\n", 650 | "args = argparse.Namespace(raw_data_dir='/work/ka1176/shared_data/2022-cygnss-deployment/raw_data/',\n", 651 | " output_dir=dev_data_dir,\n", 652 | " v_map=['brcs'],\n", 653 | " n_valid_days=0,\n", 654 | " n_test_days=1,\n", 655 | " n_processes=1,\n", 656 | " only_merge=False,\n", 657 | " use_land_data=False,\n", 658 | " is_ml_ops=True,\n", 659 | " version='v3.1',\n", 660 | " day=dday,\n", 661 | " year=year,\n", 662 | " reduce_mode='')\n", 663 | "\n", 664 | "prep.generate_input_data(args)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "markdown", 669 | "id": "7ec9da62-5cd7-4f1d-b4eb-a11367748f5d", 670 | "metadata": {}, 671 | "source": [ 672 | "## Check the new CyGNSS data v3.1" 673 | ] 674 | }, 675 | { 676 | 
"cell_type": "code", 677 | "execution_count": null, 678 | "id": "f6e400c4-a661-409e-8add-19734e6e954c", 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "TODO annotate the samples with date (year month day etc)" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "id": "11d01ad2-7a19-458d-817d-efadea14d643", 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "!conda list env" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "id": "00e4867b-2e29-4967-a193-ef2356a151f0", 699 | "metadata": {}, 700 | "outputs": [], 701 | "source": [] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "id": "3ebc77ff-5f86-4962-a058-9fc359cf5b3f", 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [] 710 | } 711 | ], 712 | "metadata": { 713 | "kernelspec": { 714 | "display_name": "CyGNSS Deployment", 715 | "language": "python", 716 | "name": "cygnss-d" 717 | }, 718 | "language_info": { 719 | "codemirror_mode": { 720 | "name": "ipython", 721 | "version": 3 722 | }, 723 | "file_extension": ".py", 724 | "mimetype": "text/x-python", 725 | "name": "python", 726 | "nbconvert_exporter": "python", 727 | "pygments_lexer": "ipython3", 728 | "version": "3.9.13" 729 | } 730 | }, 731 | "nbformat": 4, 732 | "nbformat_minor": 5 733 | } 734 | --------------------------------------------------------------------------------