├── docker_cygnss_deployment
├── .env
├── Dockerfile
├── requirements.txt
└── docker-compose.yml
├── app_write_test.txt
├── Workflow.png
├── .gitmodules
├── deployment
├── mongodb-configmap.yaml
├── mongodb-secret.yaml
├── prefect-agent-deployment.yaml
├── streamlit-deployment.yaml
├── prefect-orion-deployment.yaml
├── mongodb-deployment.yaml
└── mongo-express-deployment.yaml
├── set_up_infrastructure.sh
├── .gitignore
├── LICENSE.md
├── download_training_data.py
├── Usage.md
├── README.md
├── dashboard.py
├── Preprocessing.py
├── plots.py
├── API.py
├── notebooks
├── DailyAnalysis.ipynb
└── Preprocessing.ipynb
├── prefect-deploy.py
└── environment.yml
/docker_cygnss_deployment/.env:
--------------------------------------------------------------------------------
1 | UID=201207
2 | GID=201207
3 |
--------------------------------------------------------------------------------
/app_write_test.txt:
--------------------------------------------------------------------------------
1 | app_write_test/tmp/tmpxv3x4wj2prefect/tmp/tmpxv3x4wj2prefect
--------------------------------------------------------------------------------
/Workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hereon-KSN/cygnss-deployment/HEAD/Workflow.png
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "externals/gfz_cygnss"]
2 | path = externals/gfz_cygnss
3 | url = https://gitlab.dkrz.de/aim/2020-03-gfz-remote-sensing.git
4 |
--------------------------------------------------------------------------------
/deployment/mongodb-configmap.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: mongodb-configmap
5 | data:
6 | database_url: mongodb://root:example@mongodb:27017/
7 |
--------------------------------------------------------------------------------
/deployment/mongodb-secret.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Secret
3 | metadata:
4 | name: mongodb-secret
5 | type: Opaque
6 | data:
   7 |   mongo-root-username: dXNlcg==
   8 |   mongo-root-password: ZXhhbXBsZQ==
9 |
--------------------------------------------------------------------------------
/set_up_infrastructure.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | git clone --recurse-submodules https://gitlab.dkrz.de/aim/cygnss-deployment
4 |
5 | cd cygnss-deployment/docker_cygnss_deployment
6 |
7 | docker-compose up --build
8 |
--------------------------------------------------------------------------------
/docker_cygnss_deployment/Dockerfile:
--------------------------------------------------------------------------------
# Single-stage image based on miniconda (the previous python:3.9 stage was
# unused: a later FROM discards everything before it, so it only slowed builds).
FROM continuumio/miniconda3
# absolute path; docker-compose mounts the project at /app
WORKDIR /app
COPY requirements.txt .
RUN pip install --upgrade pip
# cartopy and a pinned xarray come from conda-forge; the rest via pip
RUN conda install -c conda-forge cartopy
RUN conda install xarray=0.20.1
RUN pip install -r requirements.txt
9 |
--------------------------------------------------------------------------------
/docker_cygnss_deployment/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.17.0
2 | scikit-learn==1.2.1
3 | pandas==1.5.3
4 | numpy==1.23.4
5 | requests==2.28.2
6 | Pillow==9.4.0
7 | pymongo==4.3.3
8 | mlflow
9 | matplotlib==3.6.3
10 | scipy==1.10.0
11 | h5py==3.8.0
12 | netcdf4==1.6.2
13 | torch==1.13.1
14 | seaborn==0.12.2
15 | pytorch-lightning==1.5.10
16 | cdsapi==0.5.1
17 | podaac-data-subscriber==1.12.0
18 | global-land-mask==1.0.0
19 | prefect==2.6.8
20 | sqlalchemy
21 | dask==2023.1.1
  22 | # NOTE: removed 'shutils' - 'shutil' is part of the Python standard library; the PyPI 'shutils' package is unrelated
23 |
--------------------------------------------------------------------------------
/deployment/prefect-agent-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: prefect-agent-deployment
5 | spec:
6 | replicas: 1
7 | selector:
8 | matchLabels:
9 | app: prefect-agent
10 | template:
11 | metadata:
12 | labels:
13 | app: prefect-agent
14 | spec:
15 | containers:
16 | - name: prefect-agent
17 | image: streamlit:v1
18 | imagePullPolicy: IfNotPresent
19 | resources:
20 | limits:
21 | memory: "8000Mi"
22 | cpu: "1000m"
23 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | torchserve-example/mnist/model-store/mnist.mar
2 | torchserve-example/mnist/app/__pycache__/*
3 | torchserve-example/mnist/app/static/test_data/*
4 | saved_models/*
5 | data/*
6 | notebooks/.ipynb_checkpoints/*
7 | __pycache__/*
8 | cycnss_frauke.sqlite-journal
9 | cycnss_test_frauke.sqlite
10 | notebooks/lightning_logs/*
11 | utils/*
12 | lightning_logs/
13 | mlruns/*
14 | mlruns.db
15 | utils/mathematics.py
16 | utils/__pycache__/*
17 | plots/*
18 | docker_cygnss_deployment/volumes/
19 | annotated_raw_data/*
20 | raw_data/*
21 | dev_data/*
22 | prediction/*
23 | 2022-cygnss-deployment/*
24 |
--------------------------------------------------------------------------------
/deployment/streamlit-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: streamlit-deployment
5 | spec:
6 | replicas: 1
7 | selector:
8 | matchLabels:
9 | app: streamlit
10 | template:
11 | metadata:
12 | labels:
13 | app: streamlit
14 | spec:
15 | containers:
16 | - name: streamlit
17 | image: streamlit:v1
18 | imagePullPolicy: IfNotPresent
19 | #resources:
20 | # limits:
21 | # memory: "8000Mi"
22 | # cpu: "1000m"
23 | ports:
24 | - containerPort: 8501
25 |
--------------------------------------------------------------------------------
/deployment/prefect-orion-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: prefect-orion-deployment
5 | spec:
6 | replicas: 1
7 | selector:
8 | matchLabels:
9 | app: prefect-orion
10 | template:
11 | metadata:
12 | labels:
13 | app: prefect-orion
14 | spec:
15 | containers:
16 | - name: prefect-orion
17 | image: prefecthq/prefect:2.6.8-python3.11
18 | imagePullPolicy: IfNotPresent
19 | resources:
20 | limits:
21 | memory: "700Mi"
22 | cpu: "500m"
23 | ports:
24 | - containerPort: 4200
25 |
--------------------------------------------------------------------------------
/deployment/mongodb-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: mongodb-deployment
5 | spec:
6 | replicas: 1
7 | selector:
8 | matchLabels:
9 | app: mongodb
10 | template:
11 | metadata:
12 | labels:
13 | app: mongodb
14 | spec:
15 | containers:
16 | - name: mongodb
17 | image: mongo:6.0.3
18 | imagePullPolicy: IfNotPresent
19 | ports:
20 | - containerPort: 27017
21 | volumeMounts:
22 | - mountPath: /data/db
23 | name: mongodb
24 | env:
25 | - name: MONGO_INITDB_ROOT_USERNAME
26 | valueFrom:
27 | secretKeyRef:
28 | name: mongodb-secret
29 | key: mongo-root-username
30 | - name: MONGO_INITDB_ROOT_PASSWORD
31 | valueFrom:
32 | secretKeyRef:
33 | name: mongodb-secret
34 | key: mongo-root-password
35 | volumes:
36 | - name: mongodb
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022-2023 Frauke Albrecht, Caroline Arnold, Harsh Grover (DKRZ-AIM)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/deployment/mongo-express-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: mongodb-express-deployment
5 | labels:
6 | app: mongodb-express
7 | spec:
8 | replicas: 1
9 | selector:
10 | matchLabels:
11 | app: mongodb-express
12 | template:
13 | metadata:
14 | labels:
15 | app: mongodb-express
16 | spec:
17 | containers:
18 | - name: mongodb-express
19 | image: mongo-express:1.0.0-alpha.4
20 | imagePullPolicy: IfNotPresent
21 | ports:
22 | - containerPort: 8081
23 | volumeMounts:
24 | - mountPath: /data/db
25 | name: mongodb
26 | env:
27 | - name: ME_CONFIG_MONGODB_ADMINUSERNAME
28 | valueFrom:
29 | secretKeyRef:
30 | name: mongodb-secret
31 | key: mongo-root-username
32 | - name: ME_CONFIG_MONGODB_ADMINPASSWORD
33 | valueFrom:
34 | secretKeyRef:
35 | name: mongodb-secret
36 | key: mongo-root-password
  37 |         - name: ME_CONFIG_MONGODB_SERVER
  38 |           # mongo-express expects a hostname here, not a full connection URL
  39 |           value: mongodb
42 | volumes:
43 | - name: mongodb
--------------------------------------------------------------------------------
/download_training_data.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from API import download_raw_data
3 | from datetime import datetime, timedelta, date
4 | from Preprocessing import pre_processing
5 |
def download_data(year, month, day, raw_data_root):
    '''Download raw CyGNSS data for one day using the API calls in API.py.'''
    download_raw_data(year, month, day, raw_data_root=raw_data_root)

def main(offset):
    '''
    Download and preprocess CyGNSS training data for a past day.

    Parameters:
    offset - number of days to go back from today (int or int-like string)
    '''
    # Define the date and pass it to the individual tasks
    download_date = date.today() - timedelta(days=int(offset))
    date_ = download_date.strftime("%Y-%m-%d")

    raw_data_root = '/work/ka1176/shared_data/2020-03/raw_data_v3-1'
    annotated_raw_data_root = '/work/ka1176/shared_data/2020-03/annotated_raw_data_v3-1'

    print("*"*50)
    print(" Download date", date_)
    print("*"*50)

    # Download data for the requested past day (typically today - 10 days)
    download_data(download_date.year, download_date.month, download_date.day, raw_data_root)

    # annotate data
    # create filtered hdf5 from preprocessing
    pre_processing(download_date.year, download_date.month, download_date.day, dev_data_dir='/scratch/k/k202141/',
                   raw_data_root=raw_data_root, annotated_raw_data_root=annotated_raw_data_root)

if __name__ == "__main__":
    # fail with a clear usage message instead of an IndexError when the
    # offset argument is missing
    if len(sys.argv) != 2:
        sys.exit(f"Usage: {sys.argv[0]} <offset-days>")
    main(sys.argv[1])
34 |
--------------------------------------------------------------------------------
/Usage.md:
--------------------------------------------------------------------------------
1 | # Usage
2 |
3 | ## In Script
4 |
5 | ```bash
   6 | cd ~/cygnss-deployment
7 |
8 | # download CyGNSS data
9 | python API.py
10 |
11 | # download ERA5 data and annotate CyGNSS data with wind speed labels
12 | # preprocss (filter) to create hdf5
13 | python Preprocessing.py
14 |
15 | # Inference
16 | PYTHONPATH="./externals/gfz_cygnss/":${PYTHONPATH}
17 | export PYTHONPATH
18 |
19 | python ./externals/gfz_cygnss/gfz_202003/training/cygnssnet.py --load-model-path ./externals/gfz_cygnss/trained_models/ygambdos_yykDM.ckpt --data ./dev_data --save-y-true --prediction-output-path ./prediction/current_predictions.h5
20 | ```
21 |
22 | ## In Jupyter notebook
23 |
24 | ### Kernel
25 |
26 | Create `conda` environment using
27 |
28 | ```bash
29 | conda env create --file docker/kernel-env-cuda11.yaml
30 |
31 | conda activate cygnss-d
32 |
33 | # some packages were not installed correctly
34 | conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
35 | conda install pytorch-lightning -c conda-forge
36 | pip install global-land-mask
37 | ```
38 | Create Jupyterhub kernel from this environment following https://docs.dkrz.de/doc/software%26services/jupyterhub/kernels.html
39 |
40 | ### Setup for preprocessing
41 |
42 | #### Earthdata
43 |
44 | - Retrieve user ID and create `.netrc` as described in ...
  45 | - change the permission of the file: chmod og-rwx ~/.netrc
46 |
47 | #### ERA5
48 |
49 | Retrieve user ID and API key and create `cdsapi` as described in ...
50 |
--------------------------------------------------------------------------------
/docker_cygnss_deployment/docker-compose.yml:
--------------------------------------------------------------------------------
1 | # Use root/example as user/password credentials
2 | version: '3.1'
3 | services:
4 | mongodb:
5 | image: mongo:6.0.3
6 | container_name: mongodb
7 | restart: always
8 | volumes:
9 | - mongodbdata:/data/db
10 | environment:
11 | MONGO_INITDB_ROOT_USERNAME: root
12 | MONGO_INITDB_ROOT_PASSWORD: example
13 | networks:
14 | - backend
15 |
16 | mongo-express:
17 | image: mongo-express:1.0.0-alpha.4
18 | container_name: mongo-express
19 | restart: always
20 | ports:
21 | - 8081:8081
22 | volumes:
23 | - mongodbdata:/data/db
24 | environment:
25 | ME_CONFIG_MONGODB_ADMINUSERNAME: root
26 | ME_CONFIG_MONGODB_ADMINPASSWORD: example
27 | ME_CONFIG_MONGODB_URL: mongodb://root:example@mongodb:27017/
28 | networks:
29 | - backend
30 |
31 |
32 | streamlit:
33 | user: "${UID}:${GID}"
34 | build: .
35 | restart: always
36 | volumes:
37 | - "./../:/app/"
38 | - /home/k/k202156/.netrc:/.netrc
39 | - /home/k/k202156/.cdsapirc:/.cdsapirc
40 | ports:
41 | - "8501:8501"
42 | - "5000:5000"
43 | - "80:80"
44 | # command: bash -c "streamlit run dashboard.py"
45 | command: bash -c "python prefect-deploy.py && streamlit run dashboard.py --server.port=80 && mlflow ui --backend-store-uri sqlite:///mlruns.db -p 5000"
46 | env_file:
47 | - .env
48 | environment:
49 | PREFECT_API_URL: http://orion:4200/api
50 | depends_on:
51 | - mongodb
52 | networks:
53 | - backend
54 |
55 | orion:
56 | image: prefecthq/prefect:2.6.8-python3.11
57 | restart: always
58 | ports:
59 | - "4200:4200"
60 | volumes:
61 | - prefect:/root/.prefect
62 | entrypoint: ["prefect", "orion", "start"]
63 | environment:
64 | PREFECT_ORION_API_HOST: 0.0.0.0
65 | PREFECT_LOGGING_SERVER_LEVEL: WARNING
66 | PREFECT_API_URL: http://localhost:4200/api
67 | #PREFECT_ORION_DATABASE_CONNECTION_URL: sqlite+aiosqlite:////root/.prefect/orion.db
68 |
69 | depends_on:
70 | - mongodb
71 | networks:
72 | - backend
73 |
74 |
75 | prefect-agent:
76 | user: "${UID}:${GID}"
77 | restart: always
78 | build: .
79 | entrypoint: ["prefect", "agent", "start", "-q", "demo"]
80 | volumes:
81 | - "./../:/app/"
82 | - ${HOME}/.netrc:/.netrc
83 | - ${HOME}/.cdsapirc:/.cdsapirc
84 | environment:
85 | PREFECT_API_URL: http://orion:4200/api
86 | PREFECT_LOGGING_LEVEL: DEBUG
87 | env_file:
88 | - .env
89 | depends_on:
90 | - orion
91 | networks:
92 | - backend
93 |
94 |
95 | networks:
96 | backend:
97 | driver: bridge
98 |
99 | volumes:
100 | mongodbdata:
101 | driver: local
102 | prefect:
103 |
104 |
105 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Web Interface for Wind Speed Prediction
2 |
3 | ### About
4 |
5 | The objective of this repository is to deploy a pre-trained *CyGNSSnet* to predict global ocean wind speed in near time. The results are shown on a web interface, which provides different illustrations of the predicted wind speed and its error compared to [ERA5 windspeed](https://www.ecmwf.int/en/forecasts/datasets/reanalysis-datasets/era5) data.
6 |
7 | *CyGNSSnet* is a neural net developed to predict wind speed from [CYGNSS](https://podaac.jpl.nasa.gov/dataset/CYGNSS_L2_V3.0)(**Cy**clone **G**lobal **N**avigation **S**atellite **S**ystem) data. The code for *CyGNSSnet* itself is not public. For more information or if you need to access it contact Caroline Arnold (arnold@dkrz.de) or the Helmholtz AI consultant team for Earth and Environment (consultant-helmholtz.ai@dkrz.de). For more information on *CyGNSSnet*, see [Asgarimehr et al, Remote Sensing of Environment (2022)](https://doi.org/10.1016/j.rse.2021.112801)
8 | ### Workflow
9 |
10 | 
11 |
12 |
13 |
14 | ### Quick start
15 |
16 | To start the deployment run ```sh set_up_infrastructure.sh```.
17 |
18 | This clones the git repository and starts the deployment using docker-compose.
19 | Make sure you have docker and docker-compose installed.
20 |
  21 | If you have already cloned the git repository, move to the directory ```docker_cygnss_deployment``` and run
22 |
23 | ```
24 | docker-compose up
25 | ```
26 |
27 | To stop the container, run following command:
28 | ```
29 | docker-compose -f ./docker-compose.yml down --remove-orphans
30 | ```
31 |
32 | Note: In order to run it you need access to the external submodule containing the CyGNSSnet.
33 |
34 | The deployment is scheduled using prefect. It is executed every day and downloads the CyGNSS data for the current date minus 10 days. Then the predictions are calculated, stored in a mongodb database and displayed on a streamlit dashboard.
35 |
36 | To access the streamlit dashboard: http://localhost:8501
37 |
38 | To access the mongodb database: http://localhost:8081
39 |
  40 | To access the prefect ui: http://localhost:4200 (the mlflow ui, when started, is at http://localhost:5000)
41 |
42 |
43 | ### Repository Structure
44 |
45 | ```
46 | API.py: download CyGNSS data
47 | Preprocessing.py: download ERA5 data and preprocess data
48 | dashboard.py: streamlit dashboard
49 | plots.py: helper functions to create the plots for the streamlit dashboard
50 | prefect-deploy.py: Deployment scheduled for every day
51 | externals/: folder with CyGNSSnet code
52 | notebooks/: folder with some notebooks that were created during the development
53 | docker_cygnss_deployment/: folder with docker files to start deployment
54 | ```
55 |
56 | ## Data source
57 |
58 | - CYGNSS. CYGNSS Level 2 Science Data Record Version 3.1. Ver. 3.1. PO.DAAC, CA, USA. accessed 2022/2023 at 10.5067/CYGNS-L2X31
59 | - Copernicus Climate Change Service (C3S) (2017): ERA5: Fifth generation of ECMWF atmospheric reanalyses of the global climate . Copernicus Climate Change Service Climate Data Store (CDS), 2022/2023. https://cds.climate.copernicus.eu/cdsapp#!/home
60 |
--------------------------------------------------------------------------------
/dashboard.py:
--------------------------------------------------------------------------------
1 | #import libraries
2 | import streamlit as st
3 | import pandas as pd
4 | import numpy as np
5 | import requests
6 | from sklearn.ensemble import RandomForestClassifier
7 | import json
8 | import datetime
9 | from datetime import timedelta
10 |
11 | import streamlit as st
12 | from pymongo import MongoClient, errors
13 | from PIL import Image
14 | import requests
15 | from io import BytesIO
16 |
17 |
18 |
def user_input_features():
    """Render the sidebar widgets and return the user's selections.

    Returns:
        (date_, option) - the chosen date and which page to display.
    """
    option = st.sidebar.selectbox(
        'What would you like to see?', ('Results', 'About us'))
    # latest selectable date is 12 days back: predictions lag the raw data feed
    latest_available = datetime.date.today() - timedelta(days=12)
    date_ = st.sidebar.date_input(
        "For which date you want to see the results",
        latest_available,
        min_value=datetime.date(2021, 1, 1),
        max_value=latest_available,
    )
    return date_, option
26 |
# Initialize connection.
# Uses st.experimental_singleton to only run once.
@st.experimental_singleton
def init_connection():
    """Create the MongoDB client, cached across Streamlit reruns.

    The connection string can be overridden via the MONGODB_URL environment
    variable; the default matches the docker-compose credentials.
    NOTE(review): the default hard-codes root/example credentials for the
    local docker-compose stack - do not reuse them in production.
    """
    import os  # local import keeps this block self-contained
    url = os.environ.get('MONGODB_URL', 'mongodb://root:example@mongodb:27017/')
    return MongoClient(url)
33 |
34 |
@st.experimental_memo(ttl=600)
def get_data(date_):
    """Fetch all prediction documents for the given date string.

    Results are cached by Streamlit for 10 minutes (ttl=600).
    """
    db = client.cygnss
    query = {"event_date": {"$eq": date_}}
    cursor = db.cygnss_collection.find(query)
    # materialize the cursor so st.experimental_memo can cache the result
    return list(cursor)
43 |
44 |
45 | date_, option = user_input_features()
46 |
47 |
48 | # Pull data from the collection.
49 | # Uses st.experimental_memo to only rerun when the query changes or after 10 min.
50 | # Initializing connection
51 | client = init_connection()
52 |
53 | date_ = date_.strftime("%Y-%m-%d")
54 |
55 | # drop database if exists, just to not clutter it with multiple values of data
56 | # client.drop_database('cygnss')
57 | items = get_data(date_)
58 |
59 | if option == 'About us':
60 |
61 |
62 | st.write("""
63 | # About US""")
64 |
65 | st.write("The objective of this website is to use a pre-trained CyGNSSnet \
66 | to predict global ocean wind speed in near time. The results are shown on a web interface, \
67 | which provides different illustrations of the predicted wind speed and its error compared to ERA5 windspeed data.\
68 | CyGNSSnet is a neural net developed to predict wind speed from CYGNSS(Cyclone Global Navigation Satellite System) data.\
69 | The code for CyGNSSnet itself is not public. For more information or if you need to access it contact Caroline Arnold (arnold@dkrz.de)\
70 | or the Helmholtz AI consultant team for Earth and Environment (consultant-helmholtz.ai@dkrz.de). For more information on CyGNSSnet,\
71 | see Asgarimehr et al, Remote Sensing of Environment (2022)")
72 |
73 | if option == 'Results':
74 |
75 |
76 | # Display results.
77 | if len(items) == 0:
78 | st.write(f" Data does not exist for this date. Choose a different date please!")
79 |
80 | else:
81 | # Creating UI
82 | # st.subheader('User Input parameters')
83 |
84 | st.write("""
85 | # Results """)
86 |
87 | # app heading
88 | st.write("""
89 | # Ocean Wind Speed""")
90 |
91 | st.write('Date:', date_)
92 |
93 |
94 | y_bins = ["up to 4m/s", "up to 8m/s", "up to 12m/s",
95 | "up to 16m/s", "up to 20m/s", "up to 100m/s"]
96 | for item in items: # @harsh can this be more than 1 item?
97 | st.write(f"Total RMSE is: {item['rmse']:.3f} m/s ")
98 | d = {'Windspeed': y_bins, 'RMSE': item['bin_rmse'], 'Bias': item['bin_bias'],
99 | 'Counts': [int(i) for i in item['bin_counts']]}
100 | df = pd.DataFrame(data=d)
101 | # hide first column (index) of the table
102 | hide_table_row_index = """
103 |
107 | """
108 | st.markdown(hide_table_row_index, unsafe_allow_html=True)
109 | st.table(data=df)
110 |
111 | for item in items:
112 | #response = requests.get(item['image_url'])
113 | # Image.open(BytesIO(response.content))
114 | scatter = Image.open(item['scatterplot_path'])
115 | st.markdown(f"## Scatterplot: ERA5 wind speed - model prediction")
116 | st.image(scatter, caption="Scatterplot")
117 |
118 | histo = Image.open(item['histogram_path'])
119 | st.markdown(f"## Histogram: ERA5 wind speed and predicted wind speed")
120 | st.image(histo, caption="Histogram")
121 |
122 | #era_avg = Image.open(item['era_average_path'])
123 | # st.markdown(f"## ERA 5 Average")
124 | #st.image(era_avg, caption="ERA5 average")
125 |
126 | #rmse_avg = Image.open(item['rmse_average_path'])
127 | # st.markdown(f"## RMSE Average")
128 | #st.image(rmse_avg, caption="RMSE average")
129 |
130 | today_longavg = Image.open(item['today_longrunavg_path'])
131 | st.markdown(f"## RMSE - Today and Longrun Average")
132 | st.image(today_longavg, caption="RMSE - Today and Longrun Average")
133 |
134 | today_long_bias = Image.open(item['today_long_bias_path'])
135 | st.markdown(f"## BIAS - Today and Longrun Average")
136 | st.image(today_long_bias, caption="Bias - Today and Longrun Average")
137 |
138 | sample_counts = Image.open(item['sample_counts_path'])
139 | st.markdown(f"## Sample Counts")
140 | st.image(sample_counts, caption="Sample Counts")
141 |
142 | rmse_bins_era = Image.open(item['rmse_bins_era_path'])
143 | st.markdown(f"## RMSE for different Windspeed Bins")
144 | st.image(rmse_bins_era, caption="RMSE for different Windspeed Bins")
145 |
146 | bias_bins_era = Image.open(item['bias_bins_era_path'])
147 | st.markdown(f"## Bias for different Windspeed Bins")
148 | st.image(bias_bins_era, caption="Bias for different Windspeed Bins")
149 |
150 |
--------------------------------------------------------------------------------
/Preprocessing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Preprocessing CyGNSS data
5 |
6 | import os
7 | import sys
8 | from datetime import datetime, date, timedelta
9 | import argparse
10 |
11 | sys.path.append('externals/gfz_cygnss/')
12 | from gfz_202003.preprocessing import preprocess as prep
13 | #sys.path.append('externals/gfz_cygnss/gfz_202003')
14 | #from preprocessing import preprocess as prep
15 |
16 | import numpy as np
17 | import xarray as xr
18 | import hashlib
19 |
def pre_processing(year, month, day, dev_data_dir='/app/dev_data', raw_data_root='/app/raw_data', annotated_raw_data_root='/app/annotated_raw_data'):
    '''
    Preprocessing routines for CyGNSSnet

    (1) Annotate CyGNSS raw data with windspeed labels from ERA5
    (2) Filter and generate hdf5 file

    Folder structure:

    * raw_data
    * annotated_raw_data
    * dev_data : filtered, one file test_data.h5

    Parameters:
    year, month, day - preprocess the data downloaded for that day (integers)
    dev_data_dir - directory to store the filtered data for that day
    raw_data_root - where to find the downloaded raw data
    annotated_raw_data_root - where to store the annotated raw data

    Returns:
    None - prep.generate_input_data writes the filtered hdf5 file into
    dev_data_dir (the previous docstring claimed a path was returned)
    '''

    # data is organised in <root>/<year>/<day-of-year> subdirectories
    raw_data_sub = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").strftime("%Y/%j")

    raw_data_dir = os.path.join(raw_data_root, raw_data_sub)
    annotated_raw_data_dir = os.path.join(annotated_raw_data_root, raw_data_sub)
    era5_data = os.path.join(raw_data_dir, 'ERA5_windspeed.nc')

    # exist_ok=True already makes these calls idempotent
    os.makedirs(annotated_raw_data_dir, exist_ok=True)
    os.makedirs(dev_data_dir, exist_ok=True)

    for cygnss_file in os.listdir(raw_data_dir):
        if cygnss_file.startswith('cyg') and cygnss_file.endswith('.nc'):
            print("annotating", cygnss_file)

            pcf = os.path.join(raw_data_dir, cygnss_file)
            # the .md5 file doubles as a marker that this file was already annotated
            phf = os.path.join(annotated_raw_data_dir, cygnss_file.replace('.nc', '.md5'))

            print("create hash", phf)

            if os.path.exists(phf):
                print("-- hash exists, skip")
                continue

            annotate_dataset(pcf, era5_data, save_dataset=True)

            # hash the raw input so later runs can detect it was processed
            hmd5 = hash_large_file(pcf)
            with open(phf, 'w') as hf:
                hf.write(hmd5)

    dday = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").strftime("%j")  # day of year, needed below

    args = argparse.Namespace(raw_data_dir=annotated_raw_data_root,
                              output_dir=dev_data_dir,
                              v_map=['brcs', 'eff_scatter', 'raw_counts', 'power_analog'],
                              n_valid_days=0,
                              n_test_days=1,
                              n_processes=1,
                              only_merge=False,
                              use_land_data=False,
                              is_ml_ops=True,
                              version='v3.1',
                              day=dday,
                              year=year,
                              reduce_mode='')

    prep.generate_input_data(args)
94 |
def hash_large_file(file):
    '''
    Read a large file in chunks and compute the MD5 checksum

    The file is read in 8 KiB chunks so arbitrarily large files can be
    hashed with constant memory. (The previous version also printed the
    digest - removed, since the caller receives it as the return value.)

    Parameters:
    file - path to the file to be hashed

    Returns:
    MD5 hex digest string of the file contents
    '''
    file_hash = hashlib.md5()
    with open(file, 'rb') as f:
        # walrus loop: f.read returns b'' at EOF, which ends the loop
        while chunk := f.read(8192):
            file_hash.update(chunk)

    return file_hash.hexdigest()
112 |
def annotate_dataset(cygnss_file, era5_file, save_dataset=False):
    '''
    Annotate a given CyGNSS dataset with ERA5 windspeed labels and save to disk

    The ERA5 grid is padded to mimic periodic boundary conditions in
    longitude before interpolating onto the CyGNSS specular-point track.

    Annotate additional ERA5 parameters (GPM_precipitation)

    TODO: hash

    Parameters:
    cygnss_file : path to CyGNSS dataset (netCDF)
    era5_file : path to corresponding ERA5 dataset (netCDF)
    save_dataset : if True, save dataset to disk in annotated_raw_data_dir (default: False)

    Returns:
    Annotated CyGNSS dataset (xarray.Dataset)
    '''

    # necessary because lazy loading prohibits overwriting the netcdf files at the end of this section
    with xr.open_dataset(cygnss_file) as data:
        cygnss_ds = data.load()

    with xr.open_dataset(era5_file) as data:
        era5_ds = data.load()

    # ERA5 longitude needs to be shifted by 180 for compatibility with the
    # CyGNSS convention -- NOTE(review): assumes the downloaded ERA5 file uses
    # a -180..180 longitude axis; verify against the download request
    era5_ds = era5_ds.assign_coords(longitude=era5_ds.coords['longitude'] + 180)

    # pad to the right (> 360 deg lon)
    era5_r = era5_ds.where(era5_ds.longitude < 10, drop=True)
    # pad to the left (< 0 deg lon)
    era5_l = era5_ds.where(era5_ds.longitude > 350, drop=True)
    # shift coordinate outside bounding box so the merged grid wraps around
    era5_r = era5_r.assign_coords(longitude=era5_r.coords['longitude'] + 360)
    era5_l = era5_l.assign_coords(longitude=era5_l.coords['longitude'] - 360)

    padded_ds = xr.merge([era5_l, era5_ds, era5_r])

    # nearest-neighbour lookup of ERA5 values at each CyGNSS sample's
    # specular-point position and timestamp
    interp_ds = padded_ds.interp(longitude=cygnss_ds.sp_lon, latitude=cygnss_ds.sp_lat, time=cygnss_ds.ddm_timestamp_utc, method='nearest')

    cygnss_ds['ERA5_u10'] = interp_ds['u10']
    cygnss_ds['ERA5_v10'] = interp_ds['v10']
    cygnss_ds['GPM_precipitation'] = interp_ds['tp']

    # mark the copied variables as interpolated in their metadata
    tmp_attrs = cygnss_ds['ERA5_u10'].attrs
    tmp_attrs['long_name'] = cygnss_ds['ERA5_u10'].long_name + ' (interpolated)'
    cygnss_ds['ERA5_u10'].attrs = tmp_attrs

    tmp_attrs = cygnss_ds['ERA5_v10'].attrs
    tmp_attrs['long_name'] = cygnss_ds['ERA5_v10'].long_name + ' (interpolated)'
    cygnss_ds['ERA5_v10'].attrs = tmp_attrs

    # drop the ERA5 coordinates that interp() attached to the CyGNSS dataset
    cygnss_ds = cygnss_ds.drop_vars(['longitude', 'latitude', 'time'])

    # dummy values only for preprocessing routine (downstream code expects
    # these variables to exist but does not use their values here)
    cygnss_ds['ERA5_mdts'] = -9999
    cygnss_ds['ERA5_mdww'] = -9999
    cygnss_ds['ERA5_swh'] = -9999
    cygnss_ds['ERA5_shts'] = -9999
    cygnss_ds['ERA5_shww'] = -9999
    cygnss_ds['ERA5_p140121'] = -9999
    cygnss_ds['ERA5_p140124'] = -9999
    cygnss_ds['ERA5_p140127'] = -9999

    # keep only samples with quality_flags == 4 -- NOTE(review): confirm this
    # flag value selects the intended "good" samples in the CyGNSS product
    cygnss_ds = cygnss_ds.where(cygnss_ds['quality_flags'] == 4, drop=True)

    if save_dataset:
        cygnss_ds.to_netcdf(cygnss_file.replace('raw_data', 'annotated_raw_data'))

    return cygnss_ds
185 |
if __name__=='__main__':
    # pre_processing requires year/month/day; the previous call passed no
    # arguments and always raised a TypeError. Read the date from argv.
    if len(sys.argv) != 4:
        sys.exit(f"Usage: {sys.argv[0]} <year> <month> <day>")
    pre_processing(int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]))
188 |
--------------------------------------------------------------------------------
/plots.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import pandas as pd
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | from matplotlib import lines, colors, ticker
7 | import seaborn as sns
8 | import cartopy.crs as ccrs
9 | from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
10 | from mpl_toolkits.axes_grid1 import AxesGrid
11 | import itertools
12 | plt.switch_backend('agg')
13 |
14 | deg = 1 # grid resolution (publication: 1)
15 |
16 | grid_lon = np.arange(-180, 181, deg)
17 | grid_lat = np.arange(-90, 91, deg)
18 |
def average_to_grid2(lon, lat, var, resolution=1, fill_value=-1):
    '''
    Grid a time-dependent variable in lon/lat and average over all counts

    lon - time series of lon coordinate (1D) (0...360)
    lat - time series of lat coordinate (1D)
    var - time series of variable (1D)
    resolution - target grid resolution (default: 1 deg)
    fill_value - a value that can be used for filling (i.e. that does not show up in var)

    Returns:
    2D gridded arrays for lat, lon, count-averaged var (NaN where no samples fell)
    '''

    assert len(lon) == len(lat)
    assert len(lon) == len(var)

    grid_lon = np.arange(0, 360+resolution, resolution)
    grid_lat = np.arange(-90, 90+resolution, resolution)[::-1] # top left is +lat

    # np.digitize returns indices in 1..len(bins); samples sitting exactly on the
    # last edge (lon == 360, lat == -90) would index out of bounds, so clip.
    ix_lon = np.clip(np.digitize(lon, grid_lon), 0, len(grid_lon) - 1)
    ix_lat = np.clip(np.digitize(lat, grid_lat), 0, len(grid_lat) - 1)

    xx, yy = np.meshgrid(grid_lon, grid_lat, indexing='ij')
    gridded_var = np.full(xx.shape, float(fill_value))

    ij = itertools.product(np.unique(ix_lon), np.unique(ix_lat))

    for i,j in ij:
        cond = (ix_lon==i) & (ix_lat==j)
        # guard: the product of unique indices contains combinations with no
        # samples; np.mean on an empty slice would warn and produce NaN
        if np.any(cond):
            gridded_var[i,j] = np.mean(var[cond])

    # mark empty cells as NaN for plotting
    gridded_var[gridded_var==fill_value] = np.nan

    return xx, yy, gridded_var
56 |
def make_scatterplot(y_true, y_pred, date_):
    '''
    Hexbin density plot of predicted vs. ERA5 wind speed.

    Saves the figure to /app/plots/scatter_{date_}.png.

    y_true - ERA5 reference wind speeds (1D, m/s)
    y_pred - predicted wind speeds (1D, m/s)
    date_  - date label used in the output file name
    '''
    ymin = 2.5
    ymax = 25.0

    fig=plt.figure()
    ax=fig.add_subplot(111)

    img=ax.hexbin(y_true, y_pred, cmap='viridis', norm=colors.LogNorm(vmin=1, vmax=25000), mincnt=1)
    clb=plt.colorbar(img)
    clb.set_ticks([1, 10, 100, 1000, 10000])
    clb.set_ticklabels([r'$1$', r'$10$', r'$10^2$', r'$10^3$', r'$10^4$'])
    clb.set_label('Samples in bin')
    clb.ax.tick_params()

    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Predicted wind speed (m/s)')

    # 1:1 reference line
    ax.plot(np.linspace(0, 30), np.linspace(0, 30), 'w:')

    ax.set_ylim(ymin, ymax)
    ax.set_xlim(ymin, ymax)

    ax.set_xticks([5, 10, 15, 20, 25])
    ax.set_xticklabels([5, 10, 15, 20, 25])
    ax.set_yticks([5, 10, 15, 20, 25])
    ax.set_yticklabels([5, 10, 15, 20, 25])

    fig.tight_layout()
    plt.savefig(f'/app/plots/scatter_{date_}.png')
    # savefig does not release the figure; with the agg backend figures would
    # otherwise accumulate across repeated pipeline runs
    plt.close(fig)
86 |
def make_histogram(y_true, y_pred, date_):
    '''
    Overlaid histograms of ERA5 and predicted wind speed.

    Saves the figure to /app/plots/histo_{date_}.png.
    '''
    fig=plt.figure()
    ax=fig.add_subplot(111)

    sns.histplot(y_true, ax=ax, color='C7', label='ERA5 wind speed (m/s)')
    sns.histplot(y_pred, ax=ax, color='C2', label='Predicted wind speed (m/s)')

    ax.legend(fontsize=12)

    ax.set_xticks([5, 10, 15, 20, 25])
    ax.set_xticklabels([5, 10, 15, 20, 25])
    ax.set_xlabel('ERA5 wind speed (m/s)')

    plt.savefig(f'/app/plots/histo_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
101 |
def era_average(y_true, sp_lon, sp_lat, date_):
    '''
    World map of the grid-cell-averaged ERA5 wind speed.

    Saves the figure to /app/plots/era_average_{date_}.png.

    NOTE(review): the data is gridded on 0..360 longitudes by average_to_grid2
    but plotted against the module-level grid_lon (-180..180) under a
    PlateCarree(180) projection -- confirm the implied 180 deg shift is intended.
    '''
    xx, yy, gridded_y_true = average_to_grid2(sp_lon[:], sp_lat[:], y_true[:], resolution=deg)
    proj = ccrs.PlateCarree(180)

    fig, ax = plt.subplots(1, 1, figsize=(6,4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))
    cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_y_true[:].T, levels=60, transform=proj, antialiased=False, cmap='magma')
    ax.coastlines()
    gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')
    gl.top_labels = False
    gl.right_labels= False
    clb = plt.colorbar(cmap, ax=ax, orientation='horizontal', shrink=1, label='Average ERA5 wind speed (m/s)')

    clb.set_ticks(np.arange(2.5, 18, 2.5))
    clb.ax.tick_params(labelsize=8)

    gl.xlabel_style = {'size': 8, 'color': 'black'}
    gl.ylabel_style = {'size': 8, 'color': 'black'}

    plt.savefig(f'/app/plots/era_average_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
121 |
def rmse_average(y_true, y_pred, sp_lon, sp_lat, date_=None):
    '''
    World map of the grid-cell-averaged absolute prediction error.

    y_true, y_pred - reference and predicted wind speeds (1D, m/s)
    sp_lon, sp_lat - specular point coordinates (1D)
    date_ - optional date label; when given, the figure is saved to
            /app/plots/rmse_average_{date_}.png. (Previously this function,
            unlike every sibling plot function, never wrote the figure to
            disk, although prefect-deploy.py stores that path in MongoDB.)
    '''
    xx, yy, gridded_rmse = average_to_grid2(sp_lon[:], sp_lat[:], np.abs(y_pred[:] - y_true[:]), resolution=deg)
    proj = ccrs.PlateCarree(180)
    fig, ax = plt.subplots(1, 1, figsize=(6,4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))
    cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_rmse[:].T, levels=60, transform=proj, antialiased=False, cmap='viridis')
    ax.coastlines()
    gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')
    gl.top_labels = False
    gl.right_labels= False
    clb = plt.colorbar(cmap, ax=ax, orientation='horizontal', shrink=1, label='Average RMSE (m/s)')

    clb.set_ticks(np.arange(0, np.nanmax(gridded_rmse)+1, 1.0))
    clb.ax.tick_params(labelsize=8)

    gl.xlabel_style = {'size': 8, 'color': 'black'}
    gl.ylabel_style = {'size': 8, 'color': 'black'}

    if date_ is not None:
        plt.savefig(f'/app/plots/rmse_average_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
138 |
139 |
def today_longrunavg(df_mockup, y_bins, date_):
    '''
    Bar plot comparing today's per-bin RMSE with the long-running average.

    Saves the figure to /app/plots/today_longrunavg_{date_}.png.

    df_mockup - table with 'bins', 'rmse' and 'time' columns (see rmse_over_time)
    y_bins - wind speed bin edges used for the x tick labels
    '''
    fig=plt.figure(figsize=(10,4))
    ax=fig.add_subplot(111)

    sns.barplot(data=df_mockup, x='bins', y='rmse', hue='time', ax=ax)
    ax.legend()

    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('RMSE (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/today_longrunavg_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
155 |
def today_longrunavg_bias(df_mockup, y_bins, date_):
    '''
    Bar plot comparing today's per-bin bias with the long-running average.

    Saves the figure to /app/plots/today_long_bias_{date_}.png.
    '''
    fig=plt.figure(figsize=(10,4))
    ax=fig.add_subplot(111)

    sns.barplot(data=df_mockup, x='bins', y='bias', hue='time', ax=ax)
    ax.legend()

    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Bias (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/today_long_bias_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
171 |
def sample_counts(df_rmse, y_bins, date_):
    '''
    Bar plot of the number of samples per wind speed bin.

    Saves the figure to /app/plots/sample_counts_{date_}.png.
    '''
    fig=plt.figure(figsize=(10,4))
    ax=fig.add_subplot(111)
    sns.barplot(data=df_rmse, x='bins', y='counts', ax=ax)
    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Sample counts')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/sample_counts_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
184 |
def rmse_bins_era(df_rmse, y_bins, date_):
    '''
    Bar plot of today's RMSE per wind speed bin.

    Saves the figure to /app/plots/rmse_bins_era_{date_}.png.
    '''
    fig=plt.figure(figsize=(10,4))
    ax=fig.add_subplot(111)
    sns.barplot(data=df_rmse, x='bins', y='rmse', ax=ax)
    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('RMSE (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/rmse_bins_era_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
197 |
def bias_bins_era(df_rmse, y_bins, date_):
    '''
    Bar plot of today's bias per wind speed bin.

    Saves the figure to /app/plots/bias_bins_era_{date_}.png.
    '''
    fig=plt.figure(figsize=(10,4))
    ax=fig.add_subplot(111)
    sns.barplot(data=df_rmse, x='bins', y='bias', ax=ax)
    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Bias (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/bias_bins_era_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
210 |
--------------------------------------------------------------------------------
/API.py:
--------------------------------------------------------------------------------
1 | import xarray as xr
2 | import numpy as np
3 | import os
4 | import sys
5 | from datetime import date, timedelta, datetime
6 |
7 | from subscriber import podaac_access as pa
8 | import cdsapi
9 | from urllib.error import HTTPError
10 | from urllib.request import urlretrieve
11 | import logging
12 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
13 |
def download_raw_data(year, month, day, raw_data_root='/app/raw_data'):
    '''
    Download raw data using API

    * CyGNSS data
    * ERA5 data

    For compliance with the CyGNSSnet preprocessing routines, the data is stored in

    > {raw_data_root}/{year}/{day-of-year}

    Parameters:
    year, month, day - download data from the full day specified
    raw_data_root - root of path to store the data
    '''

    day_start = datetime(year, month, day)
    day_end = day_start + timedelta(1)

    # CyGNSSnet expects data under <root>/<year>/<day-of-year>
    raw_data_dir = os.path.join(raw_data_root, day_start.strftime("%Y/%j"))

    print('Downloading data in this directory: ', raw_data_dir)

    start_date = day_start.strftime("%Y-%m-%dT%H:%M:%SZ")
    end_date = day_end.strftime("%Y-%m-%dT%H:%M:%SZ")

    print(f'--start-date {start_date}')
    print(f'--end-date {end_date}')

    # PODAAC data
    adapted_podaac_downloader(start_date, end_date, raw_data_dir)

    # ERA5 data
    era5_downloader(year, month, day, raw_data_dir)
47 |
48 |
def era5_downloader(year, month, day, raw_data_dir):
    '''
    ERA5 data downloader from Copernicus (CDS API)

    We need to download all the time steps of the current day, as well as the
    first two time steps of the following day. The two files are merged into
    ERA5_windspeed.nc so the day boundary at midnight is covered.

    Parameters:
    year, month, day - download data from the full day specified
    raw_data_dir - directory to store the data
    '''

    print("Start ERA5 download")
    target_data = os.path.join(raw_data_dir, 'ERA5_windspeed.nc')  # merged output file
    era5_data = os.path.join(raw_data_dir, 'ERA5_today.nc')
    tomorrow_era5_data = os.path.join(raw_data_dir, 'ERA5_tomorrow.nc')
    cds = cdsapi.Client()

    # Retrieve today's data: hourly 10m u/v wind and total precipitation,
    # restricted to the latitude band 50N..50S
    cds.retrieve(
        'reanalysis-era5-single-levels',
        {
            'product_type': 'reanalysis',
            'format': 'netcdf',
            'variable': [
                '10m_u_component_of_wind', '10m_v_component_of_wind',
                'total_precipitation',
            ],
            'year': year,
            'month': month,
            'day': day,
            'time': [
                '00:00', '01:00', '02:00',
                '03:00', '04:00', '05:00',
                '06:00', '07:00', '08:00',
                '09:00', '10:00', '11:00',
                '12:00', '13:00', '14:00',
                '15:00', '16:00', '17:00',
                '18:00', '19:00', '20:00',
                '21:00', '22:00', '23:00'
            ],
            'area': [
                50, -180, -50, 180,
            ],
        },
        era5_data)

    # Retrieve tomorrow's data (first two hours only)
    tomorrow = datetime(year, month, day) + timedelta(1)

    cds.retrieve(
        'reanalysis-era5-single-levels',
        {
            'product_type': 'reanalysis',
            'format': 'netcdf',
            'variable': [
                '10m_u_component_of_wind', '10m_v_component_of_wind',
                'total_precipitation',
            ],
            'year': tomorrow.year,
            'month': tomorrow.month,
            'day': tomorrow.day,
            'time': [
                '00:00', '01:00'
            ],
            'area': [
                50, -180, -50, 180,
            ],
        },
        tomorrow_era5_data)

    # Merge today's and tomorrow's data into the target file
    with xr.open_dataset(era5_data) as f1, xr.open_dataset(tomorrow_era5_data) as f2:
        era5_ds = xr.merge([f1.load(), f2.load()])
        era5_ds.to_netcdf(target_data)

    print('SUCCESS: Retrieved ERA5 data')
126 |
127 |
def adapted_podaac_downloader(start_date, end_date, data_path):
    '''
    PODAAC data downloader adapted for CyGNSSnet

    Adapted from the run routine in
    https://github.com/podaac/data-subscriber/blob/main/subscriber/podaac_data_downloader.py

    Searches CMR for CYGNSS_L1_V3.1 granules in the given time range and
    downloads the data and metadata files into data_path. Files that already
    exist locally with a matching checksum are skipped.

    Parameters:
    start_date - download start date in ISO format
    end_date - download end date in ISO format
    data_path - path to store the data
    '''

    # Default values
    page_size = 2000
    edl = pa.edl
    token_url = pa.token_url

    pa.setup_earthdata_login_auth(edl)
    token = pa.get_token(token_url)
    print('Completed PODAAC authentification')

    provider = 'POCLOUD'
    short_name = 'CYGNSS_L1_V3.1'
    extensions = None

    download_limit = None

    verbose = True
    force = False

    if not os.path.isdir(data_path):
        print("NOTE: Making new data directory at " + data_path + "(This is the first run.)")
        os.makedirs(data_path, exist_ok=True)

    temporal_range = pa.get_temporal_range(start_date, end_date,
                                           datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))  # noqa E501
    params = [
        ('page_size', page_size),
        ('sort_key', "-start_date"),
        ('provider', provider),
        ('ShortName', short_name),
        ('temporal', temporal_range),
    ]
    print("Temporal Range: " + temporal_range)

    # TODO bbox: optionally append ('bounding_box', ...) to params

    # If 401 is raised, refresh token and try one more time
    try:
        results = pa.get_search_results(params, verbose)
    except HTTPError as e:
        if e.code == 401:
            token = pa.refresh_token(token, 'podaac-subscriber')
            # params is a list of tuples; the original did
            # `params['token'] = token`, which raises TypeError on a list
            params.append(('token', token))
            results = pa.get_search_results(params, verbose)
        else:
            raise e

    if verbose:
        print(str(results['hits']) + " granules found for " + short_name)  # noqa E501

    # per-granule lists of data URLs (excluding OPeNDAP) and metadata URLs
    downloads_data = [[u['URL'] for u in r['umm']['RelatedUrls'] if
                       u['Type'] == "GET DATA" and ('Subtype' not in u or u['Subtype'] != "OPENDAP DATA")] for r in
                      results['items']]
    downloads_metadata = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "EXTENDED METADATA"] for r in
                          results['items']]
    checksums = pa.extract_checksums(results)

    # flatten the per-granule URL lists (data first, then metadata)
    downloads = [url for sublist in downloads_data + downloads_metadata for url in sublist]

    if len(downloads) >= page_size:
        logging.warning("Only the most recent " + str(
            page_size) + " granules will be downloaded; try adjusting your search criteria (suggestion: reduce time period or spatial region of search) to ensure you retrieve all granules.")

    # filter list based on extension
    if not extensions:
        extensions = pa.extensions
    filtered_downloads = []
    for f in downloads:
        for extension in extensions:
            if f.lower().endswith(extension):
                filtered_downloads.append(f)
                break  # avoid queueing the same file twice if several extensions match

    downloads = filtered_downloads

    print("Found " + str(len(downloads)) + " total files to download")
    if verbose:
        print("Downloading files with extensions: " + str(extensions))

    # Download the files to the data directory in a loop, skipping files that
    # are already present with a matching checksum.
    success_cnt = failure_cnt = skip_cnt = 0
    for f in downloads:
        try:
            output_path = os.path.join(data_path, os.path.basename(f))

            # decide if we should actually download this file (e.g. we may already have the latest version)
            if os.path.exists(output_path) and not force and pa.checksum_does_match(output_path, checksums):
                print(str(datetime.now()) + " SKIPPED: " + f)
                skip_cnt += 1
                continue

            urlretrieve(f, output_path)
            print(str(datetime.now()) + " SUCCESS: " + f)
            success_cnt += 1

            # if limit is set and we're at or over it, stop downloading
            if download_limit and success_cnt >= download_limit:
                break

        except Exception:
            logging.warning(str(datetime.now()) + " FAILURE: " + f, exc_info=True)
            failure_cnt += 1

    print("Downloaded Files: " + str(success_cnt))
    print("Failed Files: " + str(failure_cnt))
    print("Skipped Files: " + str(skip_cnt))
    pa.delete_token(token_url, token)
    print("END\n\n")
264 |
if __name__=='__main__':
    # fetch the day 10 days back (presumably to allow for upstream data
    # publication latency -- confirm with the data providers)
    target_day = date.today() - timedelta(days=10)
    download_raw_data(target_day.year, target_day.month, target_day.day)
268 |
--------------------------------------------------------------------------------
/notebooks/DailyAnalysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "6189727d-4f56-49fc-b2f0-b642097206b3",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import h5py\n",
11 | "from matplotlib import pyplot as plt\n",
12 | "from matplotlib import lines, colors, ticker\n",
13 | "import seaborn as sns\n",
14 | "import numpy as np\n",
15 | "import pandas as pd\n",
16 | "\n",
17 | "from sklearn.metrics import mean_squared_error\n",
18 | "\n",
19 | "import sys\n",
20 | "sys.path.append('../externals/gfz_cygnss/')\n",
21 | "import gfz_202003.utils.mathematics as mat"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "id": "dcf836dd-cf50-43af-bd72-34f151b9b006",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "f_pred = h5py.File('/work/ka1176/caroline/gitlab/cygnss-deployment/prediction/current_predictions.h5', 'r')"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "id": "ac9c112a-931d-46e4-9e54-bf8fa164ed9a",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "f_pred.keys()"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "id": "44e45c03-df95-4b80-868d-ebc01a1d6642",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "y_true = f_pred['y_true'][:]\n",
52 | "y_pred = f_pred['y_pred'][:]\n",
53 | "sp_lon = f_pred['sp_lon'][:]\n",
54 | "sp_lat = f_pred['sp_lat'][:]"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "id": "479629bd-1a80-46fb-af33-e806db8be955",
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "rmse = mean_squared_error(y_true, y_pred, squared=False)\n",
65 | "\n",
66 | "print(f'Overall root mean square error (RMSE): {rmse:.4f} m/s')"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "id": "88ef8364-a0b7-4c92-a9b2-2d1485a4c54d",
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "y_bins = [4, 8, 12, 16, 20, 100]\n",
77 | "y_ix = np.digitize(y_true, y_bins, right=False)\n",
78 | "\n",
79 | "all_rmse = np.zeros(len(y_bins))\n",
80 | "all_bias = np.zeros(len(y_bins))\n",
81 | "all_counts = np.zeros(len(y_bins))"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "id": "794d4be3-e785-4c9a-ac2b-bd5eb7ad795e",
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "for i, yy in enumerate(y_bins):\n",
92 | " if np.any(y_ix==i):\n",
93 | " rmse = mean_squared_error(y_true[y_ix==i], y_pred[y_ix==i], squared=False)\n",
94 | " all_rmse[i] = rmse\n",
95 | " all_bias[i] = np.mean(y_pred[y_ix==i] - y_true[y_ix==i])\n",
96 | " all_counts[i] = np.sum(y_ix==i)\n",
97 | " print(f'RMSE in bin {i} (up to {yy} m/s): {rmse:.4f} m/s')\n",
98 | " else:\n",
99 | " all_rmse[i] = None\n",
100 | " all_bias[i] = None\n",
101 | " all_counts[i] = 0\n",
102 | " print(f\"--- No samples in bin {i} (up to {yy} m/s)\")\n",
103 | " \n",
104 | "df_rmse = pd.DataFrame(dict(rmse=all_rmse, bias=all_bias, bins=y_bins, counts=all_counts))"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "id": "718f101f-8c42-4ade-9d5c-e0650490ca9b",
111 | "metadata": {},
112 | "outputs": [],
113 | "source": []
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "id": "9512aff8-2d0f-48a7-944d-8fa9e0c9cbf0",
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "sns.set_style('whitegrid')\n",
123 | "sns.set_context('talk')"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "id": "f7281124-4ae0-4075-a0f4-5cc526144e36",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "fig=plt.figure()\n",
134 | "ax=fig.add_subplot(111)\n",
135 | "\n",
136 | "sns.histplot(y_true, ax=ax, color='C7', label='ERA5 wind speed (m/s)')\n",
137 | "sns.histplot(y_pred, ax=ax, color='C2', label='Predicted wind speed (m/s)')\n",
138 | "\n",
139 | "ax.legend(fontsize=12)\n",
140 | "\n",
141 | "ax.set_xticks([5, 10, 15, 20, 25])\n",
142 | "ax.set_xticklabels([5, 10, 15, 20, 25])\n",
143 | "ax.set_xlabel('ERA5 wind speed (m/s)')\n",
144 | "\n",
145 | "plt.show()"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "id": "bdf3083d-4881-4552-9174-766235fef0a6",
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "ymin = 2.5\n",
156 | "ymax = 25.0\n",
157 | "\n",
158 | "fig=plt.figure()\n",
159 | "ax=fig.add_subplot(111)\n",
160 | "\n",
161 | "img=ax.hexbin(y_true, y_pred, cmap='viridis', norm=colors.LogNorm(vmin=1, vmax=25000), mincnt=1)\n",
162 | "clb=plt.colorbar(img)\n",
163 | "clb.set_ticks([1, 10, 100, 1000, 10000])\n",
164 | "clb.set_ticklabels([r'$1$', r'$10$', r'$10^2$', r'$10^3$', r'$10^4$'])\n",
165 | "clb.set_label('Samples in bin')\n",
166 | "clb.ax.tick_params()\n",
167 | "\n",
168 | "ax.set_xlabel('ERA5 wind speed (m/s)')\n",
169 | "ax.set_ylabel('Predicted wind speed (m/s)')\n",
170 | "\n",
171 | "ax.plot(np.linspace(0, 30), np.linspace(0, 30), 'r:')\n",
172 | "\n",
173 | "ax.set_ylim(ymin, 25)\n",
174 | "ax.set_xlim(ymin, 25)\n",
175 | "\n",
176 | "ax.set_xticks([5, 10, 15, 20, 25])\n",
177 | "ax.set_xticklabels([5, 10, 15, 20, 25])\n",
178 | "ax.set_yticks([5, 10, 15, 20, 25])\n",
179 | "ax.set_yticklabels([5, 10, 15, 20, 25])\n",
180 | "\n",
181 | "fig.tight_layout()"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "id": "0460d844-6142-497a-9710-312e2c3da617",
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "import cartopy.crs as ccrs\n",
192 | "from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter\n",
193 | "from mpl_toolkits.axes_grid1 import AxesGrid"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "id": "a2bc579a-bf63-43ba-8547-6e0173f8903c",
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "deg = 1 # grid resolution (publication: 1)\n",
204 | "\n",
205 | "xx, yy, gridded_y_true = mat.average_to_grid2(sp_lon[:], sp_lat[:], y_true[:], resolution=deg)\n",
206 | "xx, yy, gridded_y_pred = mat.average_to_grid2(sp_lon[:], sp_lat[:], y_pred[:], resolution=deg)\n",
207 | "xx, yy, gridded_rmse = mat.average_to_grid2(sp_lon[:], sp_lat[:], np.abs(y_pred[:] - y_true[:]), resolution=deg)\n",
208 | "xx, yy, gridded_bias = mat.average_to_grid2(sp_lon[:], sp_lat[:], y_pred[:] - y_true[:], resolution=deg)"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": null,
214 | "id": "db6e3e4b-1127-4013-9041-27a4f74412d4",
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "grid_lon = np.arange(-180, 181, deg)\n",
219 | "grid_lat = np.arange(-90, 91, deg)"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "id": "2844ffa6-b5f0-4f26-b11d-1c111929b59d",
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "proj = ccrs.PlateCarree(180)\n",
230 | "fig, ax = plt.subplots(1, 1, figsize=(6,4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))\n",
231 | "cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_y_true[:].T, levels=60, transform=proj, antialiased=False, cmap='magma')\n",
232 | "ax.coastlines()\n",
233 | "gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')\n",
234 | "gl.top_labels = False\n",
235 | "gl.right_labels= False\n",
236 | "clb = plt.colorbar(cmap, ax=ax, orientation='horizontal', shrink=1, label='Average ERA5 wind speed (m/s)')\n",
237 | "\n",
238 | "clb.set_ticks(np.arange(2.5, 18, 2.5))\n",
239 | "clb.ax.tick_params(labelsize=8)\n",
240 | "\n",
241 | "gl.xlabel_style = {'size': 8, 'color': 'black'}\n",
242 | "gl.ylabel_style = {'size': 8, 'color': 'black'}\n",
243 | "\n",
244 | "plt.show()"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "id": "e28af2cb-7400-460e-982e-f03f33ebf67c",
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "proj = ccrs.PlateCarree(180)\n",
255 | "fig, ax = plt.subplots(1, 1, figsize=(6,4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))\n",
256 | "cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_rmse[:].T, levels=60, transform=proj, antialiased=False, cmap='viridis')\n",
257 | "ax.coastlines()\n",
258 | "gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')\n",
259 | "gl.top_labels = False\n",
260 | "gl.right_labels= False\n",
261 | "clb = plt.colorbar(cmap, ax=ax, orientation='horizontal', shrink=1, label='Average RMSE (m/s)')\n",
262 | "\n",
263 | "clb.set_ticks(np.arange(0, np.nanmax(gridded_rmse)+1, 1.0))\n",
264 | "clb.ax.tick_params(labelsize=8)\n",
265 | "\n",
266 | "gl.xlabel_style = {'size': 8, 'color': 'black'}\n",
267 | "gl.ylabel_style = {'size': 8, 'color': 'black'}\n",
268 | "\n",
269 | "plt.show()"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "id": "937f9d18-12fa-436a-baea-f92af46d5e87",
276 | "metadata": {},
277 | "outputs": [],
278 | "source": []
279 | }
280 | ],
281 | "metadata": {
282 | "kernelspec": {
283 | "display_name": "CyGNSS Deployment",
284 | "language": "python",
285 | "name": "cygnss-d"
286 | },
287 | "language_info": {
288 | "codemirror_mode": {
289 | "name": "ipython",
290 | "version": 3
291 | },
292 | "file_extension": ".py",
293 | "mimetype": "text/x-python",
294 | "name": "python",
295 | "nbconvert_exporter": "python",
296 | "pygments_lexer": "ipython3",
297 | "version": "3.9.13"
298 | }
299 | },
300 | "nbformat": 4,
301 | "nbformat_minor": 5
302 | }
303 |
--------------------------------------------------------------------------------
/prefect-deploy.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pydoc import cli
3 | import sys
4 | import shutil
5 | import time
6 | import pandas as pd
7 | import numpy as np
8 | import h5py
9 | import torch
10 | from torch.utils.data import DataLoader, Dataset
11 | import pytorch_lightning as pl
12 | from pytorch_lightning.callbacks.model_summary import ModelSummary
13 | from sklearn.metrics import mean_squared_error
14 | from collections import namedtuple
15 | import xarray
16 | import mlflow
17 | from prefect import flow, task
18 | import streamlit as st
19 | # TODO Fix these imports
20 | # from prefect.deployments import DeploymentSpec
21 | #from prefect.flow_runners import SubprocessFlowRunner
22 | from prefect.orion.schemas.schedules import IntervalSchedule, CronSchedule
23 | from prefect.deployments import Deployment
24 | from prefect.filesystems import RemoteFileSystem
25 | from prefect.infrastructure import DockerContainer
26 | from prefect.task_runners import SequentialTaskRunner
27 | from pymongo import MongoClient, errors
28 | from API import download_raw_data
29 | from datetime import datetime, timedelta, date
30 | sys.path.append('/app/externals/gfz_cygnss/')
31 | sys.path.append('/app/externals/gfz_cygnss/gfz_202003')
32 | sys.path.append('/app/externals/gfz_cygnss/gfz_202003/training')
33 |
34 | from cygnssnet import ImageNet, DenseNet, CyGNSSNet, CyGNSSDataset, CyGNSSDataModule
35 | from plots import make_scatterplot, make_histogram, era_average, rmse_average, today_longrunavg, today_longrunavg_bias, sample_counts, rmse_bins_era, bias_bins_era
36 | #import plots
37 | from Preprocessing import pre_processing
38 |
@task
def download_data(year, month, day):
    '''Prefect task: download the day's raw CyGNSS and ERA5 data via API.py.'''
    # Using API calls
    download_raw_data(year, month, day)
43 |
@task
def get_data(client):
    '''Prefect task: print the stored RMSE of every document in cygnss.cygnss_collection.'''
    collection = client.cygnss.cygnss_collection
    documents = list(collection.find())  # make hashable for st.experimental_memo
    for doc in documents:
        print(f"RMSE is: {doc['rmse']}")
51 |
52 |
@task
def drop_database(client):
    '''Prefect task: irreversibly drop the whole 'cygnss' MongoDB database.'''
    client.drop_database('cygnss')
56 |
@task
@st.experimental_singleton
def save_to_db(domain, port, y_pred, rmse, date_, rmse_time):
    '''
    Store the daily prediction results and plot file paths in MongoDB.

    Parameters:
    domain, port - MongoDB host and port
    y_pred - predicted wind speeds (array-like with .tolist())
    rmse - overall RMSE (numpy scalar/array)
    date_ - date label; keys the document and the plot file names
    rmse_time - table with per-bin 'rmse', 'bias' and 'counts' columns

    Connection timeouts are caught and logged so the flow can continue.
    '''
    client = None
    # use a try-except indentation to catch MongoClient() errors
    try:
        print('entering mongo db connection')

        client = MongoClient(
            host = [ str(domain) + ":" + str(port) ],
            serverSelectionTimeoutMS = 3000, # 3 second timeout
            username = "root",
            password = "example",
        )

        # uncomment if you want to clear out the data
        #client.drop_database('cygnss')

        # print the version of MongoDB server if connection successful
        print("server version:", client.server_info()["version"])
        data = {
            "rmse": rmse.tolist(),
            "bin_rmse": rmse_time["rmse"].tolist(),
            "bin_bias": rmse_time["bias"].tolist(),
            "bin_counts": rmse_time["counts"].tolist(),
            "event_date": date_,
            "scatterplot_path": f"/app/plots/scatter_{date_}.png",
            "histogram_path": f"/app/plots/histo_{date_}.png",
            "era_average_path": f"/app/plots/era_average_{date_}.png",
            "rmse_average_path": f"/app/plots/rmse_average_{date_}.png",
            "today_longrunavg_path": f"/app/plots/today_longrunavg_{date_}.png",
            "today_long_bias_path": f"/app/plots/today_long_bias_{date_}.png",
            "sample_counts_path": f"/app/plots/sample_counts_{date_}.png",
            "rmse_bins_era_path": f"/app/plots/rmse_bins_era_{date_}.png",
            "bias_bins_era_path": f"/app/plots/bias_bins_era_{date_}.png",
            "y_pred": y_pred.tolist()
        }

        # keep the collection handle and the insert result separate (the
        # original rebound `cygnss_collection` to the InsertManyResult)
        result = client["cygnss"].cygnss_collection.insert_many([data])

        print(f"Inserted documents with ids: {result.inserted_ids}")

    except errors.ServerSelectionTimeoutError as err:
        # catch pymongo.errors.ServerSelectionTimeoutError
        print(err)
    finally:
        # always release the connection; the original leaked one client per call
        if client is not None:
            client.close()
108 |
109 |
@task
def get_hyper_params(model_path, model, data_path):
    '''
    Load hyperparameters from a trained CyGNSSnet checkpoint.

    Parameters:
    model_path - directory containing the checkpoint file
    model - checkpoint file name
    data_path - evaluation data path; overrides the value stored in the checkpoint

    Returns:
    args - namedtuple exposing the (patched) hyperparameters as attributes
    col_idx_lat, col_idx_lon - column indices of 'sp_lat'/'sp_lon' in v_par_eval
    '''
    # Note for future: for fixed model write h_params in config file
    # map to CPU so a GPU-trained checkpoint loads on a CPU-only worker
    checkpoint = torch.load(os.path.join(model_path, model),
                            map_location=torch.device("cpu"))
    # point the model at today's data and keep the dataloader single-threaded
    checkpoint['hyper_parameters']["data"] = data_path
    checkpoint['hyper_parameters']["num_workers"] = 1
    col_idx_lat = checkpoint["hyper_parameters"]["v_par_eval"].index('sp_lat')
    col_idx_lon = checkpoint["hyper_parameters"]["v_par_eval"].index('sp_lon')
    # expose the hyperparameter dict via attribute access (args.model, args.data, ...)
    args = namedtuple("ObjectName", checkpoint['hyper_parameters'].keys())\
        (*checkpoint['hyper_parameters'].values())
    return args, col_idx_lat, col_idx_lon
122 |
@task
def get_backbone(args, input_shapes):
    '''
    Construct the CyGNSSNet backbone selected by args.model.

    Raises:
    ValueError - if args.model is neither 'cnn' nor 'dense' (previously an
                 unknown value fell through and raised UnboundLocalError
                 on the return statement)
    '''
    if args.model=='cnn':
        backbone = ImageNet(args, input_shapes)
    elif args.model=='dense':
        backbone = DenseNet(args, input_shapes)
    else:
        raise ValueError(f"Unknown model type: {args.model!r} (expected 'cnn' or 'dense')")
    return backbone
130 |
@task
def make_predictions(test_loader, model):
    '''
    Prefect task: evaluate the model on the test loader and return the
    predictions as a flat numpy array.
    '''
    runner = pl.Trainer(enable_progress_bar=False)
    runner.test(model=model, dataloaders=test_loader)  # logs test metrics
    batch_preds = runner.predict(model=model, dataloaders=[test_loader])
    return torch.cat(batch_preds).detach().cpu().numpy().squeeze()
138 |
@task
def rmse_bins(y_true, y_pred, y_bins):
    '''
    Prefect task: compute RMSE, bias and sample count per wind speed bin.

    Returns a DataFrame with columns rmse, bias, bins, counts (NaN rmse/bias
    for empty bins).
    '''
    # Find the indices for the windspeed bins - below 12 m/s, below 16 m/s, above 16 m/s
    bin_ix = np.digitize(y_true, y_bins, right=False)

    n_bins = len(y_bins)
    rmse_vals = np.zeros(n_bins)
    bias_vals = np.zeros(n_bins)
    count_vals = np.zeros(n_bins)

    for b in range(n_bins):
        mask = bin_ix == b
        if mask.any():
            rmse_vals[b] = mean_squared_error(y_true[mask], y_pred[mask], squared=False)
            bias_vals[b] = np.mean(y_pred[mask] - y_true[mask])
            count_vals[b] = mask.sum()
        else:
            rmse_vals[b] = None
            bias_vals[b] = None
            count_vals[b] = 0

    return pd.DataFrame(dict(rmse=rmse_vals, bias=bias_vals, bins=y_bins, counts=count_vals))
160 |
161 | @task
def rmse_over_time(y_bins, df_rmse):
    """Combine today's per-bin scores with a mocked long-running average.

    Tags df_rmse (mutated in place) with time="today", fabricates a second
    frame by jittering rmse/bias with uniform noise in [-0.5, 0.5) and
    scaling counts by 1000, and returns both stacked into one DataFrame.
    """
    # mock up data that represents the long running average rmse
    df_rmse["time"] = "today"

    n_bins = len(y_bins)
    long_run = pd.DataFrame(dict(bins=y_bins,
                                 rmse=df_rmse["rmse"] + np.random.rand(n_bins) - 0.5,
                                 bias=df_rmse["bias"] + np.random.rand(n_bins) - 0.5,
                                 counts=df_rmse["counts"] * 1000))
    long_run["time"] = "long-running average"

    return pd.concat([df_rmse, long_run], ignore_index=True)
174 |
175 | @task
def make_plots(y, y_pred, date_, df_mockup, df_rmse, y_bins):
    """Render all daily monitoring figures for the given date.

    Each helper is called for its side effect only (presumably saving a
    figure under the plots directory created by the flow — confirm in
    plots.py); nothing is returned.
    """
    # Prediction-vs-truth diagnostics.
    for truth_plot in (make_scatterplot, make_histogram):
        truth_plot(y, y_pred, date_)
    #era_average(y, sp_lon, sp_lat, date_)
    #rmse_average(y, y_pred, sp_lon, sp_lat, date_)
    # Today vs. long-running-average comparisons.
    for comparison_plot in (today_longrunavg, today_longrunavg_bias):
        comparison_plot(df_mockup, y_bins, date_)
    # Per-bin statistics (the *_era suffix suggests comparison against ERA
    # reference data — verify in plots.py).
    for bin_plot in (sample_counts, rmse_bins_era, bias_bins_era):
        bin_plot(df_rmse, y_bins, date_)
186 |
187 | @task
def remove(data_dirs=("/app/raw_data", "/app/annotated_raw_data", "/app/dev_data")):
    """Delete the intermediate data directories produced during a pipeline run.

    Args:
        data_dirs: directories to delete recursively. Defaults to the three
                   container paths used by this flow, so existing callers
                   (remove()) are unaffected.

    Raises:
        FileNotFoundError/OSError: if a directory is missing or cannot be
        removed — ignore_errors=False keeps cleanup failures visible.
    """
    for data_dir in data_dirs:
        shutil.rmtree(data_dir, ignore_errors=False, onerror=None)
192 |
193 | @flow
def main():
    """Daily CyGNSS inference flow.

    Downloads one day of raw CyGNSS data (12 days behind today), preprocesses
    it into an HDF5 test set, runs the pretrained CyGNSSNet model on CPU,
    logs the overall RMSE to MLflow, renders monitoring plots, stores the
    results in MongoDB, and finally deletes the intermediate data.
    """
    # TODO: Set these settings for prefect, to make paths relative instead of global
    # prefect config set PREFECT_LOCAL_STORAGE_PATH="/your/custom/path"
    # prefect config set PREFECT_HOME="/your/custom/path"

    # create directory for plots, if it does not exist
    if not os.path.isdir('/app/plots'):
        os.makedirs('/app/plots', exist_ok=True)

    # write a file in app directory to check its write permission and where files are stored
    with open("/app/app_write_test.txt", "w") as file:
        file.write("app_write_test")
        file.write(os.getcwd())
        file.write(os.path.dirname(__file__))
        print(file.name)

    # Define the date and pass it to the individual tasks.
    # NOTE(review): an earlier comment said "10th day" but the code uses a
    # 12-day lag — presumably to allow for data-delivery latency; confirm.
    download_date = date.today() - timedelta(days=12)
    date_ = download_date.strftime("%Y-%m-%d")

    # Download one full day of raw data for the computed lag date
    download_data(year=download_date.year, month=download_date.month, day=download_date.day)

    # annotate data
    # create filtered hdf5 from preprocessing
    data_path = '/app/dev_data/'
    pre_processing(download_date.year, download_date.month, download_date.day, data_path)

    model_path = '/app/externals/gfz_cygnss/trained_models/'
    model = 'ygambdos_yykDM.ckpt'
    # NOTE(review): h5_file is opened but never read or closed in this flow —
    # verify whether the handle is needed (rdcc_nbytes=0 disables chunk caching).
    h5_file = h5py.File(os.path.join(data_path, 'test_data.h5'), 'r', rdcc_nbytes=0)

    mlflow.set_tracking_uri("sqlite:///mlruns.db") # TODO: change this to other db
    mlflow.set_experiment("cygnss")


    # get hyperparameters from the checkpoint (.submit() runs it as a Prefect task;
    # .result() blocks until the task finishes)
    args, col_idx_lat, col_idx_lon = get_hyper_params.submit(model_path, model, data_path).result()

    cdm = CyGNSSDataModule(args)
    cdm.setup(stage='test')
    input_shapes = cdm.get_input_shapes(stage='test')
    backbone = get_backbone.submit(args, input_shapes).result()

    # load model weights on CPU and switch to inference mode
    cygnss_model = CyGNSSNet.load_from_checkpoint(os.path.join(model_path, model),
                         map_location=torch.device('cpu'),
                         args=args,
                         backbone=backbone)
    cygnss_model.eval()

    test_loader = cdm.test_dataloader()
    # make predictions
    y_pred = make_predictions(test_loader, cygnss_model)

    # get true labels
    dataset = CyGNSSDataset('test', args)
    y = dataset.y

    # calculate per-bin RMSE for today and a mocked long-running average
    y_bins = [4, 8, 12, 16, 20, 100]
    df_rmse = rmse_bins.submit(y, y_pred, y_bins).result()
    df_mockup = rmse_over_time.submit(y_bins, df_rmse).result()
    # log the day's overall RMSE to MLflow
    with mlflow.start_run():
        rmse = mean_squared_error(y, y_pred, squared=False)
        mlflow.log_metric('rmse', rmse)

    # make plots
    # NOTE(review): sp_lat/sp_lon are currently unused — they feed the map
    # plots commented out inside make_plots; remove or re-enable together.
    sp_lat = test_loader.dataset.v_par_eval[:, col_idx_lat]
    sp_lon = test_loader.dataset.v_par_eval[:, col_idx_lon]
    make_plots(y, y_pred, date_, df_mockup, df_rmse, y_bins)
    DOMAIN = 'mongodb'
    PORT = 27017

    # Save results to the mongo database
    save_to_db(domain=DOMAIN, port=PORT, y_pred=y_pred, \
               rmse=rmse, date_=date_, rmse_time=df_rmse)

    # delete downloaded and annotated files
    remove()
274 |
if __name__ == "__main__":
    # Register the flow with Prefect: run nightly at 03:00 Berlin time on
    # the "demo" work queue. apply() only registers the deployment; it does
    # not execute the flow here.
    nightly_schedule = CronSchedule(cron='0 3 * * *', timezone='Europe/Berlin')
    deployment = Deployment.build_from_flow(
        flow=main,
        name="cygnss",
        work_queue_name="demo",
        schedule=nightly_schedule,
    )
    deployment.apply()
    # main()
285 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: cygnss-d
2 | channels:
3 | - pytorch
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=main
8 | - _openmp_mutex=5.1=1_gnu
9 | - absl-py=1.2.0=pyhd8ed1ab_0
10 | - aiohttp=3.8.1=py39hb9d737c_1
11 | - aiosignal=1.2.0=pyhd8ed1ab_0
12 | - async-timeout=4.0.2=pyhd8ed1ab_0
13 | - attrs=22.1.0=pyh71513ae_1
14 | - blas=1.0=mkl
15 | - blinker=1.4=py_1
16 | - bottleneck=1.3.5=py39h7deecbd_0
17 | - brotli=1.0.9=h5eee18b_7
18 | - brotli-bin=1.0.9=h5eee18b_7
19 | - brotlipy=0.7.0=py39h27cfd23_1003
20 | - bzip2=1.0.8=h7b6447c_0
21 | - c-ares=1.18.1=h7f8727e_0
22 | - ca-certificates=2022.9.24=ha878542_0
23 | - cachetools=5.2.0=pyhd8ed1ab_0
24 | - cartopy=0.18.0=py39h0d9ca2b_1
25 | - certifi=2022.6.15.1=pyhd8ed1ab_0
26 | - cffi=1.15.1=py39h74dc2b5_0
27 | - cftime=1.5.1.1=py39hce1f21e_0
28 | - colorama=0.4.5=pyhd8ed1ab_0
29 | - cryptography=37.0.1=py39h9ce1e76_0
30 | - cudatoolkit=11.3.1=h2bc3f7f_2
31 | - curl=7.84.0=h5eee18b_0
32 | - cycler=0.11.0=pyhd3eb1b0_0
33 | - dbus=1.13.18=hb2f20db_0
34 | - expat=2.4.4=h295c915_0
35 | - ffmpeg=4.3=hf484d3e_0
36 | - fftw=3.3.9=h27cfd23_1
37 | - fontconfig=2.13.1=h6c09931_0
38 | - fonttools=4.25.0=pyhd3eb1b0_0
39 | - freetype=2.11.0=h70c0345_0
40 | - fsspec=2022.11.0=pyhd8ed1ab_0
41 | - future=0.18.2=py39h06a4308_1
42 | - geos=3.8.0=he6710b0_0
43 | - giflib=5.2.1=h7b6447c_0
44 | - glib=2.69.1=h4ff587b_1
45 | - gmp=6.2.1=h295c915_3
46 | - gnutls=3.6.15=he1e5248_0
47 | - google-auth=2.11.0=pyh6c4a22f_0
48 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
49 | - gst-plugins-base=1.14.0=h8213a91_2
50 | - gstreamer=1.14.0=h28cd5cc_2
51 | - h5py=3.7.0=py39h737f45e_0
52 | - hdf4=4.2.13=h3ca952b_2
53 | - hdf5=1.10.6=h3ffc7dd_1
54 | - icu=58.2=he6710b0_3
55 | - idna=3.3=pyhd3eb1b0_0
56 | - intel-openmp=2021.4.0=h06a4308_3561
57 | - jpeg=9e=h7f8727e_0
58 | - kiwisolver=1.4.2=py39h295c915_0
59 | - krb5=1.19.2=hac12032_0
60 | - lame=3.100=h7b6447c_0
61 | - lcms2=2.12=h3be6417_0
62 | - ld_impl_linux-64=2.38=h1181459_1
63 | - lerc=3.0=h295c915_0
64 | - libbrotlicommon=1.0.9=h5eee18b_7
65 | - libbrotlidec=1.0.9=h5eee18b_7
66 | - libbrotlienc=1.0.9=h5eee18b_7
67 | - libclang=10.0.1=default_hb85057a_2
68 | - libcurl=7.84.0=h91b91d3_0
69 | - libdeflate=1.8=h7f8727e_5
70 | - libedit=3.1.20210910=h7f8727e_0
71 | - libev=4.33=h7f8727e_1
72 | - libevent=2.1.12=h8f2d780_0
73 | - libffi=3.3=he6710b0_2
74 | - libgcc-ng=11.2.0=h1234567_1
75 | - libgfortran-ng=11.2.0=h00389a5_1
76 | - libgfortran5=11.2.0=h1234567_1
77 | - libgomp=11.2.0=h1234567_1
78 | - libiconv=1.16=h7f8727e_2
79 | - libidn2=2.3.2=h7f8727e_0
80 | - libllvm10=10.0.1=hbcb73fb_5
81 | - libnetcdf=4.8.1=h42ceab0_1
82 | - libnghttp2=1.46.0=hce63b2e_0
83 | - libpng=1.6.37=hbc83047_0
84 | - libpq=12.9=h16c4e8d_3
85 | - libprotobuf=3.15.8=h780b84a_1
86 | - libssh2=1.10.0=h8f2d780_0
87 | - libstdcxx-ng=11.2.0=h1234567_1
88 | - libtasn1=4.16.0=h27cfd23_0
89 | - libtiff=4.4.0=hecacb30_0
90 | - libunistring=0.9.10=h27cfd23_0
91 | - libuuid=1.0.3=h7f8727e_2
92 | - libwebp=1.2.2=h55f646e_0
93 | - libwebp-base=1.2.2=h7f8727e_0
94 | - libxcb=1.15=h7f8727e_0
95 | - libxkbcommon=1.0.1=hfa300c1_0
96 | - libxml2=2.9.14=h74e7548_0
97 | - libxslt=1.1.35=h4e12654_0
98 | - libzip=1.8.0=h5cef20c_0
99 | - lz4-c=1.9.3=h295c915_1
100 | - markdown=3.4.1=pyhd8ed1ab_0
101 | - markupsafe=2.1.1=py39hb9d737c_1
102 | - matplotlib=3.5.2=py39h06a4308_0
103 | - matplotlib-base=3.5.2=py39hf590b9c_0
104 | - mkl=2021.4.0=h06a4308_640
105 | - mkl-service=2.4.0=py39h7f8727e_0
106 | - mkl_fft=1.3.1=py39hd3c417c_0
107 | - mkl_random=1.2.2=py39h51133e4_0
108 | - multidict=6.0.2=py39hb9d737c_1
109 | - munkres=1.1.4=py_0
110 | - ncurses=6.3=h5eee18b_3
111 | - netcdf4=1.5.7=py39ha0f2276_1
112 | - nettle=3.7.3=hbbd107a_1
113 | - ninja=1.10.2=h06a4308_5
114 | - ninja-base=1.10.2=hd09550d_5
115 | - nspr=4.33=h295c915_0
116 | - nss=3.74=h0370c37_0
117 | - numexpr=2.8.3=py39h807cd23_0
118 | - numpy=1.22.3=py39he7a7128_0
119 | - numpy-base=1.22.3=py39hf524024_0
120 | - oauthlib=3.2.1=pyhd8ed1ab_0
121 | - openh264=2.1.1=h4ff587b_0
122 | - openssl=1.1.1s=h7f8727e_0
123 | - packaging=21.3=pyhd3eb1b0_0
124 | - pandas=1.4.3=py39h6a678d5_0
125 | - pcre=8.45=h295c915_0
126 | - pillow=9.2.0=py39hace64e9_1
127 | - pip=22.1.2=py39h06a4308_0
128 | - ply=3.11=py39h06a4308_0
129 | - proj=6.2.1=hc80f0dc_0
130 | - pyasn1=0.4.8=py_0
131 | - pycparser=2.21=pyhd3eb1b0_0
132 | - pyjwt=2.4.0=pyhd8ed1ab_0
133 | - pyopenssl=22.0.0=pyhd3eb1b0_0
134 | - pyparsing=3.0.9=py39h06a4308_0
135 | - pyqt=5.15.7=py39h6a678d5_1
136 | - pyqt5-sip=12.11.0=py39h6a678d5_1
137 | - pyshp=2.3.1=pyhd8ed1ab_0
138 | - pysocks=1.7.1=py39h06a4308_0
139 | - python=3.9.13=haa1d7c7_1
140 | - python-dateutil=2.8.2=pyhd3eb1b0_0
141 | - python_abi=3.9=2_cp39
142 | - pytorch=1.12.1=py3.9_cuda11.3_cudnn8.3.2_0
143 | - pytorch-mutex=1.0=cuda
144 | - pytz=2022.1=py39h06a4308_0
145 | - pyu2f=0.1.5=pyhd8ed1ab_0
146 | - pyyaml=6.0=py39hb9d737c_4
147 | - qt-main=5.15.2=h327a75a_7
148 | - qt-webengine=5.15.9=hd2b0992_4
149 | - qtwebkit=5.212=h4eab89a_4
150 | - readline=8.1.2=h7f8727e_1
151 | - requests=2.28.1=py39h06a4308_0
152 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0
153 | - rsa=4.9=pyhd8ed1ab_0
154 | - scipy=1.7.3=py39h6c91a56_2
155 | - seaborn=0.11.2=pyhd3eb1b0_0
156 | - shapely=1.8.4=py39h81ba7c5_0
157 | - sip=6.6.2=py39h6a678d5_0
158 | - six=1.16.0=pyhd3eb1b0_1
159 | - sqlite=3.39.2=h5082296_0
160 | - tensorboard=2.10.0=pyhd8ed1ab_2
161 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
162 | - tk=8.6.12=h1ccaba5_0
163 | - toml=0.10.2=pyhd3eb1b0_0
164 | - torchaudio=0.12.1=py39_cu113
165 | - torchmetrics=0.9.3=pyhd8ed1ab_0
166 | - torchvision=0.13.1=py39_cu113
167 | - tornado=6.1=py39hb9d737c_3
168 | - tqdm=4.64.1=pyhd8ed1ab_0
169 | - typing-extensions=4.3.0=py39h06a4308_0
170 | - typing_extensions=4.3.0=py39h06a4308_0
171 | - werkzeug=2.2.2=pyhd8ed1ab_0
172 | - wheel=0.37.1=pyhd3eb1b0_0
173 | - xarray=0.20.1=pyhd3eb1b0_1
174 | - xz=5.2.5=h7f8727e_1
175 | - yaml=0.2.5=h7f98852_2
176 | - zipp=3.8.1=pyhd8ed1ab_0
177 | - zlib=1.2.12=h5eee18b_3
178 | - zstd=1.5.2=ha4553b6_0
179 | - pip:
180 | - aiobotocore==2.4.0
181 | - aiofiles==22.1.0
182 | - aiohttp-cors==0.7.0
183 | - aioitertools==0.11.0
184 | - aiosqlite==0.17.0
185 | - alembic==1.8.1
186 | - altair==4.2.0
187 | - anyio==3.6.1
188 | - appdirs==1.4.4
189 | - argon2-cffi==21.3.0
190 | - argon2-cffi-bindings==21.2.0
191 | - asgi-lifespan==1.0.1
192 | - astor==0.8.1
193 | - asttokens==2.0.8
194 | - asyncpg==0.26.0
195 | - azure-core==1.25.1
196 | - azure-storage-blob==12.13.1
197 | - backcall==0.2.0
198 | - beautifulsoup4==4.11.1
199 | - bleach==5.0.1
200 | - blessed==1.19.1
201 | - boto3==1.24.75
202 | - botocore==1.27.59
203 | - cdsapi==0.5.1
204 | - charset-normalizer==2.1.1
205 | - click==8.0.4
206 | - cloudpickle==2.2.0
207 | - colorful==0.5.4
208 | - commonmark==0.9.1
209 | - contextlib2==21.6.0
210 | - coolname==1.1.0
211 | - croniter==1.3.7
212 | - dask==2022.9.1
213 | - databricks-cli==0.17.3
214 | - debugpy==1.6.3
215 | - decorator==5.1.1
216 | - defusedxml==0.7.1
217 | - distlib==0.3.6
218 | - distributed==2022.9.1
219 | - docker==5.0.3
220 | - entrypoints==0.4
221 | - executing==1.0.0
222 | - fastapi==0.85.0
223 | - fastjsonschema==2.16.1
224 | - filelock==3.8.0
225 | - flask==2.2.2
226 | - frozenlist==1.3.1
227 | - gitdb==4.0.9
228 | - gitpython==3.1.27
229 | - global-land-mask==1.0.0
230 | - google-api-core==2.10.1
231 | - google-cloud-core==2.3.2
232 | - google-cloud-storage==2.5.0
233 | - google-crc32c==1.5.0
234 | - google-resumable-media==2.3.3
235 | - googleapis-common-protos==1.56.4
236 | - gpustat==1.0.0
237 | - greenlet==1.1.3
238 | - griffe==0.21.0
239 | - grpcio==1.43.0
240 | - gunicorn==20.1.0
241 | - h11==0.12.0
242 | - heapdict==1.0.1
243 | - httpcore==0.15.0
244 | - httpx==0.23.0
245 | - hyperopt==0.1.2
246 | - importlib-metadata==4.12.0
247 | - intake==0.6.6
248 | - ipykernel==6.15.2
249 | - ipython==8.5.0
250 | - ipython-genutils==0.2.0
251 | - ipywidgets==8.0.2
252 | - isodate==0.6.1
253 | - itsdangerous==2.1.2
254 | - jedi==0.18.1
255 | - jinja2==3.1.2
256 | - jmespath==1.0.1
257 | - joblib==1.2.0
258 | - json-tricks==3.15.5
259 | - jsonpatch==1.32
260 | - jsonpointer==2.3
261 | - jsonschema==4.16.0
262 | - jupyter==1.0.0
263 | - jupyter-client==7.3.5
264 | - jupyter-console==6.4.4
265 | - jupyter-core==4.11.1
266 | - jupyterlab-pygments==0.2.2
267 | - jupyterlab-widgets==3.0.3
268 | - kubernetes==24.2.0
269 | - llvmlite==0.39.1
270 | - locket==1.0.0
271 | - lxml==4.9.1
272 | - mako==1.2.2
273 | - matplotlib-inline==0.1.6
274 | - mistune==2.0.4
275 | - mlflow==2.0.1
276 | - msgpack==1.0.4
277 | - msrest==0.7.1
278 | - nbclient==0.6.8
279 | - nbconvert==7.0.0
280 | - nbformat==5.4.0
281 | - nest-asyncio==1.5.5
282 | - networkx==2.8.6
283 | - nexusformat==0.7.7
284 | - nni==2.9
285 | - notebook==6.4.12
286 | - numba==0.56.4
287 | - nvidia-ml-py==11.495.46
288 | - opencensus==0.11.0
289 | - opencensus-context==0.1.3
290 | - orjson==3.8.0
291 | - pandocfilters==1.5.0
292 | - parso==0.8.3
293 | - partd==1.3.0
294 | - pathspec==0.10.1
295 | - pendulum==2.1.2
296 | - pexpect==4.8.0
297 | - pickleshare==0.7.5
298 | - platformdirs==2.5.2
299 | - podaac-data-subscriber==1.12.0
300 | - pooch==1.6.0
301 | - prefect==2.4.0
302 | - prettytable==3.4.1
303 | - prometheus-client==0.13.1
304 | - prometheus-flask-exporter==0.20.3
305 | - prompt-toolkit==3.0.31
306 | - protobuf==3.20.2
307 | - psutil==5.9.2
308 | - ptyprocess==0.7.0
309 | - pure-eval==0.2.2
310 | - py-spy==0.3.14
311 | - pyarrow==9.0.0
312 | - pyasn1-modules==0.2.8
313 | - pydantic==1.10.2
314 | - pydeck==0.8.0b3
315 | - pydeprecate==0.3.1
316 | - pygments==2.13.0
317 | - pymongo==4.2.0
318 | - pympler==1.0.1
319 | - pyrsistent==0.18.1
320 | - python-slugify==6.1.2
321 | - pythonwebhdfs==0.2.3
322 | - pytorch-lightning==1.5.10
323 | - pytz-deprecation-shim==0.1.0.post0
324 | - pytzdata==2020.1
325 | - pyzmq==23.2.1
326 | - qtconsole==5.3.2
327 | - qtpy==2.2.0
328 | - querystring-parser==1.2.4
329 | - ray==2.0.0
330 | - readchar==4.0.3
331 | - responses==0.21.0
332 | - rfc3986==1.5.0
333 | - rich==12.5.1
334 | - s3fs==2022.11.0
335 | - s3transfer==0.6.0
336 | - schema==0.7.5
337 | - scikit-learn==1.1.2
338 | - semver==2.13.0
339 | - send2trash==1.8.0
340 | - setuptools==59.5.0
341 | - shap==0.41.0
342 | - simplejson==3.17.6
343 | - sklearn==0.0
344 | - slack-sdk==3.18.3
345 | - slicer==0.0.7
346 | - smart-open==6.2.0
347 | - smmap==5.0.0
348 | - sniffio==1.3.0
349 | - sortedcontainers==2.4.0
350 | - soupsieve==2.3.2.post1
351 | - sqlalchemy==1.4.41
352 | - sqlparse==0.4.2
353 | - stack-data==0.5.0
354 | - starlette==0.20.4
355 | - streamlit==1.12.2
356 | - tabulate==0.8.10
357 | - tblib==1.7.0
358 | - tenacity==8.0.1
359 | - tensorboard-data-server==0.6.1
360 | - terminado==0.15.0
361 | - text-unidecode==1.3
362 | - threadpoolctl==3.1.0
363 | - tinycss2==1.1.1
364 | - toolz==0.12.0
365 | - traitlets==5.4.0
366 | - typeguard==2.13.3
367 | - typer==0.6.1
368 | - tzdata==2022.2
369 | - tzlocal==4.2
370 | - urllib3==1.26.12
371 | - uvicorn==0.18.3
372 | - validators==0.20.0
373 | - virtualenv==20.16.5
374 | - watchdog==2.1.9
375 | - wcwidth==0.2.5
376 | - webencodings==0.5.1
377 | - websocket-client==1.4.1
378 | - websockets==10.3
379 | - widgetsnbextension==4.0.3
380 | - wrapt==1.14.1
381 | - yarl==1.8.1
382 | - zict==2.2.0
383 | prefix: /home/harsh/anaconda3/envs/cygnss-d
384 |
--------------------------------------------------------------------------------
/notebooks/Preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "261e2e39-ae3e-4b92-8dc5-b163f61eea25",
6 | "metadata": {},
7 | "source": [
8 | "# Preprocessing CyGNSS data\n",
9 | "\n",
10 | "Data is downloaded from NASA EarthCloud as described in the `APIs` notebook. For the expected format for CyGNSSnet, additional preprocessing steps are necessary."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 10,
16 | "id": "084a2e3e-9f9f-4844-9e28-c60e30314494",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import os\n",
21 | "import sys\n",
22 | "sys.path.append('../externals/gfz_cygnss/')"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 11,
28 | "id": "06128178",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "# !pip install tenacity"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 12,
38 | "id": "c0bbb084-5e0b-41a9-a337-684f832d6f85",
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "ename": "TypeError",
43 | "evalue": " is not a generic class",
44 | "output_type": "error",
45 | "traceback": [
46 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
47 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
48 | "Input \u001b[0;32mIn [12]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgfz_202003\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocessing\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m preprocess \u001b[38;5;28;01mas\u001b[39;00m prep\n",
49 | "File \u001b[0;32m~/Downloads/DKRZ/MLOps/2020-03-gfz-remote-sensing/gfz_202003/preprocessing/preprocess.py:9\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrandom\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01margparse\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mxr\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdatetime\u001b[39;00m\n",
50 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/__init__.py:1\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m testing, tutorial\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbackends\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 3\u001b[0m load_dataarray,\n\u001b[1;32m 4\u001b[0m load_dataset,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 8\u001b[0m save_mfdataset,\n\u001b[1;32m 9\u001b[0m )\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbackends\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrasterio_\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m open_rasterio\n",
51 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/testing.py:9\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m duck_array_ops, formatting, utils\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataarray\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataArray\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n",
52 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/core/duck_array_ops.py:26\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m take, tensordot, transpose, unravel_index \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m where \u001b[38;5;28;01mas\u001b[39;00m _where\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dask_array_compat, dask_array_ops, dtypes, npcompat, nputils\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnputils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m nanfirst, nanlast\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpycompat\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m cupy_array_type, dask_array_type, is_duck_dask_array\n",
53 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/core/npcompat.py:72\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtyping\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_dtype_like\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _DTypeLikeNested, _ShapeLike, _SupportsDType\n\u001b[1;32m 51\u001b[0m \u001b[38;5;66;03m# Xarray requires a Mapping[Hashable, dtype] in many places which\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;66;03m# conflics with numpys own DTypeLike (with dtypes for fields).\u001b[39;00m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# https://numpy.org/devdocs/reference/typing.html#numpy.typing.DTypeLike\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;66;03m# This is a copy of this DTypeLike that allows only non-Mapping dtypes.\u001b[39;00m\n\u001b[1;32m 55\u001b[0m DTypeLikeSave \u001b[38;5;241m=\u001b[39m Union[\n\u001b[1;32m 56\u001b[0m np\u001b[38;5;241m.\u001b[39mdtype,\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# default data type (float64)\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 59\u001b[0m \u001b[38;5;66;03m# array-scalar types and generic types\u001b[39;00m\n\u001b[1;32m 60\u001b[0m Type[Any],\n\u001b[1;32m 61\u001b[0m \u001b[38;5;66;03m# character codes, type strings or comma-separated fields, e.g., 'float64'\u001b[39;00m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m 63\u001b[0m \u001b[38;5;66;03m# (flexible_dtype, itemsize)\u001b[39;00m\n\u001b[1;32m 64\u001b[0m Tuple[_DTypeLikeNested, \u001b[38;5;28mint\u001b[39m],\n\u001b[1;32m 65\u001b[0m \u001b[38;5;66;03m# (fixed_dtype, shape)\u001b[39;00m\n\u001b[1;32m 66\u001b[0m Tuple[_DTypeLikeNested, _ShapeLike],\n\u001b[1;32m 67\u001b[0m \u001b[38;5;66;03m# (base_dtype, new_dtype)\u001b[39;00m\n\u001b[1;32m 
68\u001b[0m Tuple[_DTypeLikeNested, _DTypeLikeNested],\n\u001b[1;32m 69\u001b[0m \u001b[38;5;66;03m# because numpy does the same?\u001b[39;00m\n\u001b[1;32m 70\u001b[0m List[Any],\n\u001b[1;32m 71\u001b[0m \u001b[38;5;66;03m# anything with a dtype attribute\u001b[39;00m\n\u001b[0;32m---> 72\u001b[0m \u001b[43m_SupportsDType\u001b[49m\u001b[43m[\u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m]\u001b[49m,\n\u001b[1;32m 73\u001b[0m ]\n\u001b[1;32m 74\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[1;32m 75\u001b[0m \u001b[38;5;66;03m# fall back for numpy < 1.20, ArrayLike adapted from numpy.typing._array_like\u001b[39;00m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Protocol\n",
54 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/typing.py:261\u001b[0m, in \u001b[0;36m_tp_cache..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 259\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m \u001b[38;5;66;03m# All real errors (not unhashable args) are raised below.\u001b[39;00m\n\u001b[0;32m--> 261\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
55 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/typing.py:897\u001b[0m, in \u001b[0;36mGeneric.__class_getitem__\u001b[0;34m(cls, params)\u001b[0m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 894\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mParameters to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m[...] must all be unique\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 895\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 896\u001b[0m \u001b[38;5;66;03m# Subscripting a regular Generic subclass.\u001b[39;00m\n\u001b[0;32m--> 897\u001b[0m \u001b[43m_check_generic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 898\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _GenericAlias(\u001b[38;5;28mcls\u001b[39m, params)\n",
56 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/typing_extensions.py:95\u001b[0m, in \u001b[0;36m_check_generic\u001b[0;34m(cls, parameters, elen)\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m elen \u001b[38;5;129;01mis\u001b[39;00m _marker:\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mcls\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__parameters__\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__parameters__:\n\u001b[0;32m---> 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m is not a generic class\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 96\u001b[0m elen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__parameters__)\n\u001b[1;32m 97\u001b[0m alen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(parameters)\n",
57 | "\u001b[0;31mTypeError\u001b[0m: is not a generic class"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "from gfz_202003.preprocessing import preprocess as prep"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "id": "39eac3de-096b-4b73-8491-232d3e0667b0",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "import numpy as np\n",
73 | "import h5py\n",
74 | "from matplotlib import pyplot as plt\n",
75 | "import seaborn as sns\n",
76 | "\n",
77 | "import datetime\n",
78 | "import xarray as xr\n",
79 | "\n",
80 | "import argparse"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 7,
86 | "id": "3cad0fa3-0ba5-4b35-ba86-29f7bee68e71",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "import cdsapi"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "id": "7c8c7e2a-ea30-499e-a259-73aae365be5d",
96 | "metadata": {},
97 | "source": [
98 | "## Download raw CyGNSS data\n",
99 | "\n",
100 | "The CyGNSSnet preprocessing routine expects the raw data files ordered as \n",
101 | "\n",
102 | "> `$raw_data_dir///cyg*.nc`\n",
103 | "\n",
104 | "Data is always downloaded for one full day for all spacecraft, generating 8 `netcdf` files per day of observations. Below is a routine to specify a date range, followed by downloading the corresponding data and storing it in the appropriate subfolders."
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 6,
110 | "id": "efcfbe84-843d-4550-b22f-fbfaad434694",
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "raw_data_root = '/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/raw_data'\n",
115 | "dev_data_root = '/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/dev_data'"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "id": "2fdc36c1-d1e1-4bab-8a04-a91f8759637f",
121 | "metadata": {},
122 | "source": [
123 |     "Select a test day and prepare the input parameters for the provided download script"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 7,
129 | "id": "f47387cf-999d-44a5-9c15-8cf2c7886e07",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "year = 2021\n",
134 | "month = 3\n",
135 | "day = 17"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "id": "871eddb6-6022-4273-8dba-93c911f78598",
141 | "metadata": {},
142 | "source": [
143 |     "The download target directory is in the expected format `year/day-of-year`"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 8,
149 | "id": "84e34efb-f067-4964-89cb-f6ccb556e681",
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "name": "stdout",
154 | "output_type": "stream",
155 | "text": [
156 | "/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/raw_data/2021/168\n"
157 | ]
158 | }
159 | ],
160 | "source": [
161 | "raw_data_sub = datetime.datetime.strptime(f\"{year}-{month}-{day}\", \"%Y-%m-%d\").strftime(\"%Y/%j\")\n",
162 | "\n",
163 | "raw_data_dir = os.path.join(raw_data_root, raw_data_sub)\n",
164 | "\n",
165 | "print(raw_data_dir)"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "id": "cad54be9-afc6-43b9-a841-2bfefddc81f5",
171 | "metadata": {},
172 | "source": [
173 | "Start and end date of download range in the required format. The end date is midnight the next day, this way only the requested day's data is downloaded."
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 9,
179 | "id": "e7da91cd-1c9b-479d-91a2-3598b58765ac",
180 | "metadata": {},
181 | "outputs": [
182 | {
183 | "name": "stdout",
184 | "output_type": "stream",
185 | "text": [
186 | "--start-date 2021-06-17T00:00:00Z\n",
187 | "--end-date 2021-06-18T00:00:00Z\n"
188 | ]
189 | }
190 | ],
191 | "source": [
192 | "start_date = datetime.datetime(year, month, day).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
193 |     "end_date = (datetime.datetime(year, month, day) + datetime.timedelta(days=1)).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
194 | "\n",
195 | "print(f'--start-date {start_date}')\n",
196 | "print(f'--end-date {end_date}')"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "id": "15055733-3e11-4bd5-9402-e721be9aba0c",
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "dday = datetime.datetime.strptime(f\"{year}-{month}-{day}\", \"%Y-%m-%d\").strftime(\"%j\") # need that later\n",
207 | "dday"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "id": "99c1420e-e2c3-4c68-a53e-0b98e94d3a45",
214 | "metadata": {},
215 | "outputs": [
216 | {
217 | "name": "stdout",
218 | "output_type": "stream",
219 | "text": [
220 | "env: PYTHONPATH=/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/data-subscriber\n",
221 | "[2022-09-12 16:00:57,433] {podaac_data_downloader.py:243} INFO - Found 7 total files to download\n",
222 | "[2022-09-12 16:00:59,062] {podaac_access.py:446} WARNING - Computed checksum f11baba7acac4b5b14b3891e83f715c8 does not match expected checksum 10e4ef36d29f030ea7e524f8924389fc\n",
223 | "[2022-09-12 16:01:46,860] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:01:46.860919 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg06.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
224 | "[2022-09-12 16:01:48,483] {podaac_access.py:446} WARNING - Computed checksum 9b3100d23550d03cb85056609ecddd5b does not match expected checksum a8851840f3a4bbdc8499ea2f17d5119b\n",
225 | "[2022-09-12 16:02:39,804] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:02:39.804552 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg08.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
226 | "[2022-09-12 16:02:41,684] {podaac_access.py:446} WARNING - Computed checksum fdaaa0486c6932b1a62c087edaecd64f does not match expected checksum a08d25babf87b328b96a850bfacbcc53\n",
227 | "[2022-09-12 16:03:31,252] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:03:31.252143 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg02.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
228 | "[2022-09-12 16:03:33,101] {podaac_access.py:446} WARNING - Computed checksum 881d6ad8374fea406dc72b27775e124f does not match expected checksum 7eef541250b6f137d8ace0e99e12eaf2\n",
229 | "[2022-09-12 16:04:15,389] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:04:15.389899 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg03.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
230 | "[2022-09-12 16:04:17,154] {podaac_access.py:446} WARNING - Computed checksum cf78c6b618423cf8410b43eeddfb5c63 does not match expected checksum 25dd31a5b59b5444a509ead3a359a8a5\n",
231 | "[2022-09-12 16:05:04,669] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:05:04.669819 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg04.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
232 | "[2022-09-12 16:05:06,367] {podaac_access.py:446} WARNING - Computed checksum 3dc2ce38484b3438c18d5491d6a68984 does not match expected checksum e7ae44462212498cab741a6dbd4624e8\n",
233 | "[2022-09-12 16:06:03,144] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:06:03.144241 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg07.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
234 | "[2022-09-12 16:06:04,762] {podaac_access.py:446} WARNING - Computed checksum 8cc2e314df20dec61110dc4290da3cc1 does not match expected checksum 32fddfe78b55e4ee302cf37fa7d0bf9b\n",
235 | "[2022-09-12 16:07:04,082] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:07:04.082807 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg01.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
236 | "[2022-09-12 16:07:04,082] {podaac_data_downloader.py:287} INFO - Downloaded Files: 7\n",
237 | "[2022-09-12 16:07:04,082] {podaac_data_downloader.py:288} INFO - Failed Files: 0\n",
238 | "[2022-09-12 16:07:04,083] {podaac_data_downloader.py:289} INFO - Skipped Files: 0\n",
239 | "[2022-09-12 16:07:05,046] {podaac_access.py:122} INFO - CMR token successfully deleted\n",
240 | "[2022-09-12 16:07:05,047] {podaac_data_downloader.py:299} INFO - END\n",
241 | "\n",
242 | "\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "%env PYTHONPATH=/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/data-subscriber\n",
248 | "!python /home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/data-subscriber/subscriber/podaac_data_downloader.py -c CYGNSS_L1_V3.1 -d $raw_data_dir --start-date $start_date --end-date $end_date"
249 | ]
250 | },
251 | {
252 | "cell_type": "markdown",
253 | "id": "520bceeb-7e10-4802-96c7-96995aa933e2",
254 | "metadata": {},
255 | "source": [
256 | "## Download raw ERA5 data\n",
257 | "\n",
258 | "The preprocessing pipeline requires the ERA5 windspeed labels. Download the raw ERA5 data for the same timespan."
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 1,
264 | "id": "d2511baa-ade0-43bf-8e9b-953251c164fe",
265 | "metadata": {},
266 | "outputs": [
267 | {
268 | "ename": "NameError",
269 | "evalue": "name 'os' is not defined",
270 | "output_type": "error",
271 | "traceback": [
272 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
273 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
274 | "Input \u001b[0;32mIn [1]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m era5_data \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(raw_data_dir, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mERA5_windspeed.nc\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
275 | "\u001b[0;31mNameError\u001b[0m: name 'os' is not defined"
276 | ]
277 | }
278 | ],
279 | "source": [
280 | "era5_data = os.path.join(raw_data_dir, 'ERA5_windspeed.nc')"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "id": "b8e06265-ffa4-4b9c-a5de-b8c9151a9387",
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "cds = cdsapi.Client()"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 2,
296 | "id": "3c9e2d27-609c-454b-8131-10427c89ab9d",
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "ename": "NameError",
301 | "evalue": "name 'cds' is not defined",
302 | "output_type": "error",
303 | "traceback": [
304 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
305 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
306 | "Input \u001b[0;32mIn [2]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcds\u001b[49m\u001b[38;5;241m.\u001b[39mretrieve(\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mreanalysis-era5-single-levels\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 3\u001b[0m {\n\u001b[1;32m 4\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mproduct_type\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mreanalysis\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mformat\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnetcdf\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 6\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mvariable\u001b[39m\u001b[38;5;124m'\u001b[39m: [\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10m_u_component_of_wind\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10m_v_component_of_wind\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 8\u001b[0m ],\n\u001b[1;32m 9\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124myear\u001b[39m\u001b[38;5;124m'\u001b[39m: year,\n\u001b[1;32m 10\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmonth\u001b[39m\u001b[38;5;124m'\u001b[39m: month,\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mday\u001b[39m\u001b[38;5;124m'\u001b[39m: day,\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtime\u001b[39m\u001b[38;5;124m'\u001b[39m: [\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m00:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m01:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m02:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m03:00\u001b[39m\u001b[38;5;124m'\u001b[39m, 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m04:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m05:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 15\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m06:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m07:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m08:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 16\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m09:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m11:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 17\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m12:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m13:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m14:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 18\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m15:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m16:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m17:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 19\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m18:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m19:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m20:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 20\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m21:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m22:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m23:00\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 21\u001b[0m ],\n\u001b[1;32m 22\u001b[0m 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marea\u001b[39m\u001b[38;5;124m'\u001b[39m: [\n\u001b[1;32m 23\u001b[0m \u001b[38;5;241m40\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m180\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m40\u001b[39m, \u001b[38;5;241m180\u001b[39m,\n\u001b[1;32m 24\u001b[0m ],\n\u001b[1;32m 25\u001b[0m },\n\u001b[1;32m 26\u001b[0m era5_data)\n",
307 | "\u001b[0;31mNameError\u001b[0m: name 'cds' is not defined"
308 | ]
309 | }
310 | ],
311 | "source": [
312 | "cds.retrieve(\n",
313 | " 'reanalysis-era5-single-levels',\n",
314 | " {\n",
315 | " 'product_type': 'reanalysis',\n",
316 | " 'format': 'netcdf',\n",
317 | " 'variable': [\n",
318 | " '10m_u_component_of_wind', '10m_v_component_of_wind',\n",
319 | " ],\n",
320 | " 'year': year,\n",
321 | " 'month': month,\n",
322 | " 'day': day,\n",
323 | " 'time': [\n",
324 | " '00:00', '01:00', '02:00',\n",
325 | " '03:00', '04:00', '05:00',\n",
326 | " '06:00', '07:00', '08:00',\n",
327 | " '09:00', '10:00', '11:00',\n",
328 | " '12:00', '13:00', '14:00',\n",
329 | " '15:00', '16:00', '17:00',\n",
330 | " '18:00', '19:00', '20:00',\n",
331 | " '21:00', '22:00', '23:00'\n",
332 | " ],\n",
333 | " 'area': [\n",
334 | " 40, -180, -40, 180,\n",
335 | " ],\n",
336 | " },\n",
337 | " era5_data)"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 3,
343 | "id": "4df67e88-6fd4-48f9-8a2c-921d51fe1c13",
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "ename": "NameError",
348 | "evalue": "name 'xr' is not defined",
349 | "output_type": "error",
350 | "traceback": [
351 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
352 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
353 |       "Input \u001b[0;32mIn [3]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m era5_ds \u001b[38;5;241m=\u001b[39m \u001b[43mxr\u001b[49m\u001b[38;5;241m.\u001b[39mopen_dataset(era5_data)\n\u001b[1;32m 2\u001b[0m era5_ds\n",
354 | "\u001b[0;31mNameError\u001b[0m: name 'xr' is not defined"
355 | ]
356 | }
357 | ],
358 | "source": [
359 | "era5_ds = xr.open_dataset(era5_data)\n",
360 | "era5_ds"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "id": "c958a80b-5845-4a98-8372-1fdc03954a00",
366 | "metadata": {},
367 | "source": [
368 | "## Annotate raw CyGNSS data with windspeed labels"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "id": "ac2b8784-afcd-48ce-9e09-a6b245ae6132",
374 | "metadata": {},
375 | "source": [
376 | "We need to create the data variables `ERA5_u10` and `ERA5_v10` and attach them to the CyGNSS raw data."
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 22,
382 | "id": "ca049baa-b554-40f1-9269-27469c614a76",
383 | "metadata": {},
384 | "outputs": [
385 | {
386 | "data": {
387 | "text/plain": [
388 | "['cyg07.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
389 | " 'ERA5_windspeed.nc',\n",
390 | " 'cyg02.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
391 | " 'cyg04.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
392 | " 'cyg01.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
393 | " 'cyg06.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
394 | " 'CYGNSS_L1_V3.1.citation.txt',\n",
395 | " 'cyg03.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
396 | " 'cyg08.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc']"
397 | ]
398 | },
399 | "execution_count": 22,
400 | "metadata": {},
401 | "output_type": "execute_result"
402 | }
403 | ],
404 | "source": [
405 | "os.listdir(raw_data_dir)"
406 | ]
407 | },
408 | {
409 | "cell_type": "markdown",
410 | "id": "50a4f34c-1a20-4580-9721-6838de1626a7",
411 | "metadata": {},
412 | "source": [
413 | "Check units for spacetime coordinates\n",
414 | "* Longitude\n",
415 | " * ERA5: -180 ... 0 ... +180\n",
416 | " * CyGNSS: 0 ... 180 ... 360\n",
417 | "* Latitude\n",
418 | " * ERA5 & CyGNSS: -40 ... 0 ... +40\n",
419 | "* Timestamp\n",
420 | "\n",
421 | "\n",
422 | "--> Need to shift the ERA5 longitude coordinate by 180"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "id": "96ecbfa9-4a93-4821-b65c-05fc8697f8d5",
429 | "metadata": {},
430 | "outputs": [],
431 | "source": [
432 | "def annotate_dataset(cygnss_file, era5_file, save_dataset=False):\n",
433 | " '''\n",
434 | " Annotate a given CyGNSS dataset with ERA5 windspeed labels and save to disk\n",
435 | " \n",
436 | " Parameters:\n",
437 | " cygnss_file : path to CyGNSS dataset\n",
438 |     "      era5_file    : path to corresponding ERA5 dataset\n",
439 | " save_dataset : if True, save dataset to disk overwriting cygnss_file (default: False)\n",
440 | " \n",
441 | " Returns:\n",
442 | " Annotated CyGNSS dataset\n",
443 | " '''\n",
444 | " \n",
445 | " # necessary because lazy loading prohibits overwriting the netcdf files at the end of this section\n",
446 | " with xr.open_dataset(cygnss_file) as data:\n",
447 | " cygnss_ds = data.load()\n",
448 | " \n",
449 | " with xr.open_dataset(era5_file) as data:\n",
450 | " era5_ds = data.load()\n",
451 | " \n",
452 | " # needs to be shifted by 180 for compatibility with CyGNSS\n",
453 | " era5_ds = era5_ds.assign_coords(longitude=era5_ds.coords['longitude'] + 180)\n",
454 | " \n",
455 | " interp_ds = era5_ds.interp(longitude=cygnss_ds.sp_lon, latitude=cygnss_ds.sp_lat, time=cygnss_ds.ddm_timestamp_utc)\n",
456 | " \n",
457 | " cygnss_ds['ERA5_u10'] = interp_ds['u10']\n",
458 | " cygnss_ds['ERA5_v10'] = interp_ds['v10']\n",
459 | "\n",
460 | " tmp_attrs = cygnss_ds['ERA5_u10'].attrs\n",
461 | " tmp_attrs['long_name'] = cygnss_ds['ERA5_u10'].long_name + ' (interpolated)'\n",
462 | " cygnss_ds['ERA5_u10'].attrs = tmp_attrs\n",
463 | "\n",
464 | " tmp_attrs = cygnss_ds['ERA5_v10'].attrs\n",
465 | " tmp_attrs['long_name'] = cygnss_ds['ERA5_v10'].long_name + ' (interpolated)'\n",
466 | " cygnss_ds['ERA5_v10'].attrs = tmp_attrs\n",
467 | " \n",
468 | " cygnss_ds = cygnss_ds.drop_vars(['longitude', 'latitude', 'time'])\n",
469 | " \n",
470 | " # dummy values only for preprocessing routine\n",
471 | " cygnss_ds['GPM_precipitation'] = -9999\n",
472 | " cygnss_ds['ERA5_mdts'] = -9999\n",
473 | " cygnss_ds['ERA5_mdww'] = -9999\n",
474 | " cygnss_ds['ERA5_swh'] = -9999\n",
475 | " cygnss_ds['ERA5_shts'] = -9999\n",
476 | " cygnss_ds['ERA5_shww'] = -9999\n",
477 | " cygnss_ds['ERA5_p140121'] = -9999\n",
478 | " cygnss_ds['ERA5_p140124'] = -9999\n",
479 | " cygnss_ds['ERA5_p140127'] = -9999\n",
480 | " \n",
481 | " if save_dataset:\n",
482 | " cygnss_ds.to_netcdf(cygnss_file)\n",
483 | " \n",
484 | " return cygnss_ds"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "id": "3e15d7b1-9d6d-4e1a-9ba6-ea28b9943f28",
491 | "metadata": {},
492 | "outputs": [],
493 | "source": [
494 | "for cygnss_file in os.listdir(raw_data_dir):\n",
495 | " if cygnss_file.startswith('cyg') and cygnss_file.endswith('.nc'):\n",
496 | " print(cygnss_file)\n",
497 | " annotate_dataset(os.path.join(raw_data_dir, cygnss_file), era5_data, save_dataset=True)"
498 | ]
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "id": "41fa8522-6306-45f9-ad34-609a30995765",
503 | "metadata": {},
504 | "source": [
505 | "## Check raw data"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": null,
511 | "id": "d099d73c-53e5-4939-bf93-99105225a0e7",
512 | "metadata": {},
513 | "outputs": [],
514 | "source": [
515 | "from importlib import reload\n",
516 | "reload(prep)\n",
517 | "raw_ds = prep.open_mfdataset(os.path.join(raw_data_dir, cygnss_file))\n",
518 | "\n",
519 | "raw_ds"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "id": "b4b7fbce-05e4-418f-bf29-a294846947d0",
526 | "metadata": {},
527 | "outputs": [],
528 | "source": [
529 | "filtered_ds = prep.apply_quality_filter(raw_ds, is_ml_ops=True)\n",
530 | "filtered_ds"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": null,
536 | "id": "66d21d51-a157-4823-98bc-9160b19627a8",
537 | "metadata": {},
538 | "outputs": [],
539 | "source": [
540 | "os.listdir('/work/ka1176/shared_data/2020-03/raw_data/2021/014/')"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": null,
546 | "id": "c05f455c-a047-49ac-8169-6eee7f0ee38e",
547 | "metadata": {},
548 | "outputs": [],
549 | "source": [
550 | "bu = raw_ds['ddm_brcs_uncert']\n",
551 | "qf = raw_ds['quality_flags']\n",
552 | "st = raw_ds['nst_att_status']\n",
553 | "fom = raw_ds['prn_fig_of_merit']\n",
554 | "les = raw_ds['ddm_les']\n",
555 | "rxg = raw_ds['sp_rx_gain']\n",
556 | "nsca = raw_ds['nbrcs_scatter_area']\n",
557 | "lsca = raw_ds['les_scatter_area']\n",
558 | "lat = raw_ds['sp_lat']\n",
559 | "lon = raw_ds['sp_lon']\n",
560 | "ws = raw_ds['windspeed']"
561 | ]
562 | },
563 | {
564 | "cell_type": "markdown",
565 | "id": "1cb9d80b-5400-4a2c-878b-624fe05040b9",
566 | "metadata": {},
567 | "source": [
568 | "For now, use only the quality flag == 4"
569 | ]
570 | },
571 | {
572 | "cell_type": "code",
573 | "execution_count": null,
574 | "id": "289ceac1-d478-4276-868b-9f246c8222c2",
575 | "metadata": {},
576 | "outputs": [],
577 | "source": [
578 | "quality = (bu<1) & (qf == 4) & (st == 0) & (fom > 3) & (rxg > 0) & (les >= 0)"
579 | ]
580 | },
581 | {
582 | "cell_type": "code",
583 | "execution_count": null,
584 | "id": "8f472924-fcb5-4639-8609-cda8b4fb4a51",
585 | "metadata": {},
586 | "outputs": [],
587 | "source": [
588 | "np.sum((bu<1) & (st==0)).compute()"
589 | ]
590 | },
591 | {
592 | "cell_type": "markdown",
593 | "id": "fb390261-935f-4573-87f8-50b926f873a0",
594 | "metadata": {},
595 | "source": [
596 |     "## Create processed data"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": null,
602 | "id": "5e078c5c-068a-43e5-b1dc-23806c0226e5",
603 | "metadata": {},
604 | "outputs": [],
605 | "source": [
606 | "raw_ds = prep.open_mfdataset(os.path.join(raw_data_dir, 'cyg06*.nc'), channels=[0,1,2,3])"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": null,
612 | "id": "1bbd55a2-ac14-4dbb-8aa5-9c64c64ebbd1",
613 | "metadata": {},
614 | "outputs": [],
615 | "source": [
616 | "dev_data_dir = '/work/ka1176/shared_data/2022-cygnss-deployment/dev_data/'"
617 | ]
618 | },
619 | {
620 | "cell_type": "code",
621 | "execution_count": null,
622 | "id": "3fef8930-282e-4751-8678-18db21ee13f9",
623 | "metadata": {},
624 | "outputs": [],
625 | "source": [
626 | "for ff in os.listdir('/work/ka1176/shared_data/2022-cygnss-deployment/raw_data/2021/168/'):\n",
627 | " tmp = xr.open_dataset(os.path.join('/work/ka1176/shared_data/2022-cygnss-deployment/raw_data/2021/168/', ff))\n",
628 | " if not 'ERA5_u10' in tmp.keys():\n",
629 | " print(ff)"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": null,
635 | "id": "93ebd942-72d9-4344-8496-329f8a9c73c9",
636 | "metadata": {},
637 | "outputs": [],
638 | "source": [
639 | "tmp"
640 | ]
641 | },
642 | {
643 | "cell_type": "code",
644 | "execution_count": null,
645 | "id": "c24d957d-2db5-4fd4-a88d-e18a53457384",
646 | "metadata": {},
647 | "outputs": [],
648 | "source": [
649 | "reload(prep)\n",
650 | "args = argparse.Namespace(raw_data_dir='/work/ka1176/shared_data/2022-cygnss-deployment/raw_data/',\n",
651 | " output_dir=dev_data_dir,\n",
652 | " v_map=['brcs'],\n",
653 | " n_valid_days=0,\n",
654 | " n_test_days=1,\n",
655 | " n_processes=1,\n",
656 | " only_merge=False,\n",
657 | " use_land_data=False,\n",
658 | " is_ml_ops=True,\n",
659 | " version='v3.1',\n",
660 | " day=dday,\n",
661 | " year=year,\n",
662 | " reduce_mode='')\n",
663 | "\n",
664 | "prep.generate_input_data(args)"
665 | ]
666 | },
667 | {
668 | "cell_type": "markdown",
669 | "id": "7ec9da62-5cd7-4f1d-b4eb-a11367748f5d",
670 | "metadata": {},
671 | "source": [
672 | "## Check the new CyGNSS data v3.1"
673 | ]
674 | },
675 | {
676 | "cell_type": "code",
677 | "execution_count": null,
678 | "id": "f6e400c4-a661-409e-8add-19734e6e954c",
679 | "metadata": {},
680 | "outputs": [],
681 | "source": [
682 |     "# TODO: annotate the samples with date (year, month, day, etc.)"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": null,
688 | "id": "11d01ad2-7a19-458d-817d-efadea14d643",
689 | "metadata": {},
690 | "outputs": [],
691 | "source": [
692 | "!conda list env"
693 | ]
694 | },
695 | {
696 | "cell_type": "code",
697 | "execution_count": null,
698 | "id": "00e4867b-2e29-4967-a193-ef2356a151f0",
699 | "metadata": {},
700 | "outputs": [],
701 | "source": []
702 | },
703 | {
704 | "cell_type": "code",
705 | "execution_count": null,
706 | "id": "3ebc77ff-5f86-4962-a058-9fc359cf5b3f",
707 | "metadata": {},
708 | "outputs": [],
709 | "source": []
710 | }
711 | ],
712 | "metadata": {
713 | "kernelspec": {
714 | "display_name": "CyGNSS Deployment",
715 | "language": "python",
716 | "name": "cygnss-d"
717 | },
718 | "language_info": {
719 | "codemirror_mode": {
720 | "name": "ipython",
721 | "version": 3
722 | },
723 | "file_extension": ".py",
724 | "mimetype": "text/x-python",
725 | "name": "python",
726 | "nbconvert_exporter": "python",
727 | "pygments_lexer": "ipython3",
728 | "version": "3.9.13"
729 | }
730 | },
731 | "nbformat": 4,
732 | "nbformat_minor": 5
733 | }
734 |
--------------------------------------------------------------------------------
| | | |