├── docker_cygnss_deployment
├── .env
├── Dockerfile
├── requirements.txt
└── docker-compose.yml
├── app_write_test.txt
├── Workflow.png
├── .gitmodules
├── deployment
├── mongodb-configmap.yaml
├── mongodb-secret.yaml
├── prefect-agent-deployment.yaml
├── streamlit-deployment.yaml
├── prefect-orion-deployment.yaml
├── mongodb-deployment.yaml
└── mongo-express-deployment.yaml
├── set_up_infrastructure.sh
├── .gitignore
├── LICENSE.md
├── download_training_data.py
├── Usage.md
├── README.md
├── dashboard.py
├── Preprocessing.py
├── plots.py
├── API.py
├── notebooks
├── DailyAnalysis.ipynb
└── Preprocessing.ipynb
├── prefect-deploy.py
└── environment.yml
/docker_cygnss_deployment/.env:
--------------------------------------------------------------------------------
1 | UID=201207
2 | GID=201207
3 |
--------------------------------------------------------------------------------
/app_write_test.txt:
--------------------------------------------------------------------------------
1 | app_write_test/tmp/tmpxv3x4wj2prefect/tmp/tmpxv3x4wj2prefect
--------------------------------------------------------------------------------
/Workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hereon-KSN/cygnss-deployment/HEAD/Workflow.png
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "externals/gfz_cygnss"]
2 | path = externals/gfz_cygnss
3 | url = https://gitlab.dkrz.de/aim/2020-03-gfz-remote-sensing.git
4 |
--------------------------------------------------------------------------------
/deployment/mongodb-configmap.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ConfigMap
3 | metadata:
4 | name: mongodb-configmap
5 | data:
6 | database_url: mongodb://root:example@mongodb:27017/
7 |
--------------------------------------------------------------------------------
/deployment/mongodb-secret.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Secret
3 | metadata:
4 | name: mongodb-secret
5 | type: Opaque
6 | data:
   7 |   mongo-root-username: dXNlcg==
   8 |   mongo-root-password: ZXhhbXBsZQ==
9 |
--------------------------------------------------------------------------------
/set_up_infrastructure.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | git clone --recurse-submodules https://gitlab.dkrz.de/aim/cygnss-deployment
4 |
5 | cd cygnss-deployment/docker_cygnss_deployment
6 |
7 | docker-compose up --build
8 |
--------------------------------------------------------------------------------
/docker_cygnss_deployment/Dockerfile:
--------------------------------------------------------------------------------
# Single-stage image based on miniconda (the previous python:3.9 stage was
# unused: a later FROM discards everything before it, so it only slowed builds).
FROM continuumio/miniconda3
# absolute path; docker-compose mounts the project at /app
WORKDIR /app
COPY requirements.txt .
RUN pip install --upgrade pip
# cartopy and a pinned xarray come from conda-forge; the rest via pip
RUN conda install -c conda-forge cartopy
RUN conda install xarray=0.20.1
RUN pip install -r requirements.txt
9 |
--------------------------------------------------------------------------------
/docker_cygnss_deployment/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.17.0
2 | scikit-learn==1.2.1
3 | pandas==1.5.3
4 | numpy==1.23.4
5 | requests==2.28.2
6 | Pillow==9.4.0
7 | pymongo==4.3.3
8 | mlflow
9 | matplotlib==3.6.3
10 | scipy==1.10.0
11 | h5py==3.8.0
12 | netcdf4==1.6.2
13 | torch==1.13.1
14 | seaborn==0.12.2
15 | pytorch-lightning==1.5.10
16 | cdsapi==0.5.1
17 | podaac-data-subscriber==1.12.0
18 | global-land-mask==1.0.0
19 | prefect==2.6.8
20 | sqlalchemy
21 | dask==2023.1.1
  22 | # NOTE: removed 'shutils' - 'shutil' is part of the Python standard library; the PyPI 'shutils' package is unrelated
23 |
--------------------------------------------------------------------------------
/deployment/prefect-agent-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: prefect-agent-deployment
5 | spec:
6 | replicas: 1
7 | selector:
8 | matchLabels:
9 | app: prefect-agent
10 | template:
11 | metadata:
12 | labels:
13 | app: prefect-agent
14 | spec:
15 | containers:
16 | - name: prefect-agent
17 | image: streamlit:v1
18 | imagePullPolicy: IfNotPresent
19 | resources:
20 | limits:
21 | memory: "8000Mi"
22 | cpu: "1000m"
23 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | torchserve-example/mnist/model-store/mnist.mar
2 | torchserve-example/mnist/app/__pycache__/*
3 | torchserve-example/mnist/app/static/test_data/*
4 | saved_models/*
5 | data/*
6 | notebooks/.ipynb_checkpoints/*
7 | __pycache__/*
8 | cycnss_frauke.sqlite-journal
9 | cycnss_test_frauke.sqlite
10 | notebooks/lightning_logs/*
11 | utils/*
12 | lightning_logs/
13 | mlruns/*
14 | mlruns.db
15 | utils/mathematics.py
16 | utils/__pycache__/*
17 | plots/*
18 | docker_cygnss_deployment/volumes/
19 | annotated_raw_data/*
20 | raw_data/*
21 | dev_data/*
22 | prediction/*
23 | 2022-cygnss-deployment/*
24 |
--------------------------------------------------------------------------------
/deployment/streamlit-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: streamlit-deployment
5 | spec:
6 | replicas: 1
7 | selector:
8 | matchLabels:
9 | app: streamlit
10 | template:
11 | metadata:
12 | labels:
13 | app: streamlit
14 | spec:
15 | containers:
16 | - name: streamlit
17 | image: streamlit:v1
18 | imagePullPolicy: IfNotPresent
19 | #resources:
20 | # limits:
21 | # memory: "8000Mi"
22 | # cpu: "1000m"
23 | ports:
24 | - containerPort: 8501
25 |
--------------------------------------------------------------------------------
/deployment/prefect-orion-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: prefect-orion-deployment
5 | spec:
6 | replicas: 1
7 | selector:
8 | matchLabels:
9 | app: prefect-orion
10 | template:
11 | metadata:
12 | labels:
13 | app: prefect-orion
14 | spec:
15 | containers:
16 | - name: prefect-orion
17 | image: prefecthq/prefect:2.6.8-python3.11
18 | imagePullPolicy: IfNotPresent
19 | resources:
20 | limits:
21 | memory: "700Mi"
22 | cpu: "500m"
23 | ports:
24 | - containerPort: 4200
25 |
--------------------------------------------------------------------------------
/deployment/mongodb-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: mongodb-deployment
5 | spec:
6 | replicas: 1
7 | selector:
8 | matchLabels:
9 | app: mongodb
10 | template:
11 | metadata:
12 | labels:
13 | app: mongodb
14 | spec:
15 | containers:
16 | - name: mongodb
17 | image: mongo:6.0.3
18 | imagePullPolicy: IfNotPresent
19 | ports:
20 | - containerPort: 27017
21 | volumeMounts:
22 | - mountPath: /data/db
23 | name: mongodb
24 | env:
25 | - name: MONGO_INITDB_ROOT_USERNAME
26 | valueFrom:
27 | secretKeyRef:
28 | name: mongodb-secret
29 | key: mongo-root-username
30 | - name: MONGO_INITDB_ROOT_PASSWORD
31 | valueFrom:
32 | secretKeyRef:
33 | name: mongodb-secret
34 | key: mongo-root-password
35 | volumes:
36 | - name: mongodb
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022-2023 Frauke Albrecht, Caroline Arnold, Harsh Grover (DKRZ-AIM)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/deployment/mongo-express-deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: mongodb-express-deployment
5 | labels:
6 | app: mongodb-express
7 | spec:
8 | replicas: 1
9 | selector:
10 | matchLabels:
11 | app: mongodb-express
12 | template:
13 | metadata:
14 | labels:
15 | app: mongodb-express
16 | spec:
17 | containers:
18 | - name: mongodb-express
19 | image: mongo-express:1.0.0-alpha.4
20 | imagePullPolicy: IfNotPresent
21 | ports:
22 | - containerPort: 8081
23 | volumeMounts:
24 | - mountPath: /data/db
25 | name: mongodb
26 | env:
27 | - name: ME_CONFIG_MONGODB_ADMINUSERNAME
28 | valueFrom:
29 | secretKeyRef:
30 | name: mongodb-secret
31 | key: mongo-root-username
32 | - name: ME_CONFIG_MONGODB_ADMINPASSWORD
33 | valueFrom:
34 | secretKeyRef:
35 | name: mongodb-secret
36 | key: mongo-root-password
  37 |         - name: ME_CONFIG_MONGODB_SERVER
  38 |           # mongo-express expects a hostname here, not a full connection URL
  39 |           value: mongodb
42 | volumes:
43 | - name: mongodb
--------------------------------------------------------------------------------
/download_training_data.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from API import download_raw_data
3 | from datetime import datetime, timedelta, date
4 | from Preprocessing import pre_processing
5 |
def download_data(year, month, day, raw_data_root):
    '''Download raw CyGNSS data for one day using the API calls in API.py.'''
    download_raw_data(year, month, day, raw_data_root=raw_data_root)

def main(offset):
    '''
    Download and preprocess CyGNSS training data for a past day.

    Parameters:
    offset - number of days to go back from today (int or int-like string)
    '''
    # Define the date and pass it to the individual tasks
    download_date = date.today() - timedelta(days=int(offset))
    date_ = download_date.strftime("%Y-%m-%d")

    raw_data_root = '/work/ka1176/shared_data/2020-03/raw_data_v3-1'
    annotated_raw_data_root = '/work/ka1176/shared_data/2020-03/annotated_raw_data_v3-1'

    print("*"*50)
    print(" Download date", date_)
    print("*"*50)

    # Download data for the requested past day (typically today - 10 days)
    download_data(download_date.year, download_date.month, download_date.day, raw_data_root)

    # annotate data
    # create filtered hdf5 from preprocessing
    pre_processing(download_date.year, download_date.month, download_date.day, dev_data_dir='/scratch/k/k202141/',
                   raw_data_root=raw_data_root, annotated_raw_data_root=annotated_raw_data_root)

if __name__ == "__main__":
    # fail with a clear usage message instead of an IndexError when the
    # offset argument is missing
    if len(sys.argv) != 2:
        sys.exit(f"Usage: {sys.argv[0]} <offset-days>")
    main(sys.argv[1])
34 |
--------------------------------------------------------------------------------
/Usage.md:
--------------------------------------------------------------------------------
1 | # Usage
2 |
3 | ## In Script
4 |
5 | ```bash
   6 | cd ~/cygnss-deployment
7 |
8 | # download CyGNSS data
9 | python API.py
10 |
11 | # download ERA5 data and annotate CyGNSS data with wind speed labels
12 | # preprocss (filter) to create hdf5
13 | python Preprocessing.py
14 |
15 | # Inference
16 | PYTHONPATH="./externals/gfz_cygnss/":${PYTHONPATH}
17 | export PYTHONPATH
18 |
19 | python ./externals/gfz_cygnss/gfz_202003/training/cygnssnet.py --load-model-path ./externals/gfz_cygnss/trained_models/ygambdos_yykDM.ckpt --data ./dev_data --save-y-true --prediction-output-path ./prediction/current_predictions.h5
20 | ```
21 |
22 | ## In Jupyter notebook
23 |
24 | ### Kernel
25 |
26 | Create `conda` environment using
27 |
28 | ```bash
29 | conda env create --file docker/kernel-env-cuda11.yaml
30 |
31 | conda activate cygnss-d
32 |
33 | # some packages were not installed correctly
34 | conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
35 | conda install pytorch-lightning -c conda-forge
36 | pip install global-land-mask
37 | ```
38 | Create Jupyterhub kernel from this environment following https://docs.dkrz.de/doc/software%26services/jupyterhub/kernels.html
39 |
40 | ### Setup for preprocessing
41 |
42 | #### Earthdata
43 |
44 | - Retrieve user ID and create `.netrc` as described in ...
  45 | - change the permission of the file: chmod og-rwx ~/.netrc
46 |
47 | #### ERA5
48 |
49 | Retrieve user ID and API key and create `cdsapi` as described in ...
50 |
--------------------------------------------------------------------------------
/docker_cygnss_deployment/docker-compose.yml:
--------------------------------------------------------------------------------
1 | # Use root/example as user/password credentials
2 | version: '3.1'
3 | services:
4 | mongodb:
5 | image: mongo:6.0.3
6 | container_name: mongodb
7 | restart: always
8 | volumes:
9 | - mongodbdata:/data/db
10 | environment:
11 | MONGO_INITDB_ROOT_USERNAME: root
12 | MONGO_INITDB_ROOT_PASSWORD: example
13 | networks:
14 | - backend
15 |
16 | mongo-express:
17 | image: mongo-express:1.0.0-alpha.4
18 | container_name: mongo-express
19 | restart: always
20 | ports:
21 | - 8081:8081
22 | volumes:
23 | - mongodbdata:/data/db
24 | environment:
25 | ME_CONFIG_MONGODB_ADMINUSERNAME: root
26 | ME_CONFIG_MONGODB_ADMINPASSWORD: example
27 | ME_CONFIG_MONGODB_URL: mongodb://root:example@mongodb:27017/
28 | networks:
29 | - backend
30 |
31 |
32 | streamlit:
33 | user: "${UID}:${GID}"
34 | build: .
35 | restart: always
36 | volumes:
37 | - "./../:/app/"
38 | - /home/k/k202156/.netrc:/.netrc
39 | - /home/k/k202156/.cdsapirc:/.cdsapirc
40 | ports:
41 | - "8501:8501"
42 | - "5000:5000"
43 | - "80:80"
44 | # command: bash -c "streamlit run dashboard.py"
45 | command: bash -c "python prefect-deploy.py && streamlit run dashboard.py --server.port=80 && mlflow ui --backend-store-uri sqlite:///mlruns.db -p 5000"
46 | env_file:
47 | - .env
48 | environment:
49 | PREFECT_API_URL: http://orion:4200/api
50 | depends_on:
51 | - mongodb
52 | networks:
53 | - backend
54 |
55 | orion:
56 | image: prefecthq/prefect:2.6.8-python3.11
57 | restart: always
58 | ports:
59 | - "4200:4200"
60 | volumes:
61 | - prefect:/root/.prefect
62 | entrypoint: ["prefect", "orion", "start"]
63 | environment:
64 | PREFECT_ORION_API_HOST: 0.0.0.0
65 | PREFECT_LOGGING_SERVER_LEVEL: WARNING
66 | PREFECT_API_URL: http://localhost:4200/api
67 | #PREFECT_ORION_DATABASE_CONNECTION_URL: sqlite+aiosqlite:////root/.prefect/orion.db
68 |
69 | depends_on:
70 | - mongodb
71 | networks:
72 | - backend
73 |
74 |
75 | prefect-agent:
76 | user: "${UID}:${GID}"
77 | restart: always
78 | build: .
79 | entrypoint: ["prefect", "agent", "start", "-q", "demo"]
80 | volumes:
81 | - "./../:/app/"
82 | - ${HOME}/.netrc:/.netrc
83 | - ${HOME}/.cdsapirc:/.cdsapirc
84 | environment:
85 | PREFECT_API_URL: http://orion:4200/api
86 | PREFECT_LOGGING_LEVEL: DEBUG
87 | env_file:
88 | - .env
89 | depends_on:
90 | - orion
91 | networks:
92 | - backend
93 |
94 |
95 | networks:
96 | backend:
97 | driver: bridge
98 |
99 | volumes:
100 | mongodbdata:
101 | driver: local
102 | prefect:
103 |
104 |
105 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Web Interface for Wind Speed Prediction
2 |
3 | ### About
4 |
5 | The objective of this repository is to deploy a pre-trained *CyGNSSnet* to predict global ocean wind speed in near time. The results are shown on a web interface, which provides different illustrations of the predicted wind speed and its error compared to [ERA5 windspeed](https://www.ecmwf.int/en/forecasts/datasets/reanalysis-datasets/era5) data.
6 |
7 | *CyGNSSnet* is a neural net developed to predict wind speed from [CYGNSS](https://podaac.jpl.nasa.gov/dataset/CYGNSS_L2_V3.0)(**Cy**clone **G**lobal **N**avigation **S**atellite **S**ystem) data. The code for *CyGNSSnet* itself is not public. For more information or if you need to access it contact Caroline Arnold (arnold@dkrz.de) or the Helmholtz AI consultant team for Earth and Environment (consultant-helmholtz.ai@dkrz.de). For more information on *CyGNSSnet*, see [Asgarimehr et al, Remote Sensing of Environment (2022)](https://doi.org/10.1016/j.rse.2021.112801)
8 | ### Workflow
9 |
10 | 
11 |
12 |
13 |
14 | ### Quick start
15 |
16 | To start the deployment run ```sh set_up_infrastructure.sh```.
17 |
18 | This clones the git repository and starts the deployment using docker-compose.
19 | Make sure you have docker and docker-compose installed.
20 |
  21 | If you have already cloned the git repository, move to the directory ```docker_cygnss_deployment``` and run
22 |
23 | ```
24 | docker-compose up
25 | ```
26 |
27 | To stop the container, run following command:
28 | ```
29 | docker-compose -f ./docker-compose.yml down --remove-orphans
30 | ```
31 |
32 | Note: In order to run it you need access to the external submodule containing the CyGNSSnet.
33 |
34 | The deployment is scheduled using prefect. It is executed every day and downloads the CyGNSS data for the current date minus 10 days. Then the predictions are calculated, stored in a mongodb database and displayed on a streamlit dashboard.
35 |
36 | To access the streamlit dashboard: http://localhost:8501
37 |
38 | To access the mongodb database: http://localhost:8081
39 |
  40 | To access the prefect ui: http://localhost:4200 (the mlflow ui, when started, is at http://localhost:5000)
41 |
42 |
43 | ### Repository Structure
44 |
45 | ```
46 | API.py: download CyGNSS data
47 | Preprocessing.py: download ERA5 data and preprocess data
48 | dashboard.py: streamlit dashboard
49 | plots.py: helper functions to create the plots for the streamlit dashboard
50 | prefect-deploy.py: Deployment scheduled for every day
51 | externals/: folder with CyGNSSnet code
52 | notebooks/: folder with some notebooks that were created during the development
53 | docker_cygnss_deployment/: folder with docker files to start deployment
54 | ```
55 |
56 | ## Data source
57 |
58 | - CYGNSS. CYGNSS Level 2 Science Data Record Version 3.1. Ver. 3.1. PO.DAAC, CA, USA. accessed 2022/2023 at 10.5067/CYGNS-L2X31
59 | - Copernicus Climate Change Service (C3S) (2017): ERA5: Fifth generation of ECMWF atmospheric reanalyses of the global climate . Copernicus Climate Change Service Climate Data Store (CDS), 2022/2023. https://cds.climate.copernicus.eu/cdsapp#!/home
60 |
--------------------------------------------------------------------------------
/dashboard.py:
--------------------------------------------------------------------------------
1 | #import libraries
2 | import streamlit as st
3 | import pandas as pd
4 | import numpy as np
5 | import requests
6 | from sklearn.ensemble import RandomForestClassifier
7 | import json
8 | import datetime
9 | from datetime import timedelta
10 |
11 | import streamlit as st
12 | from pymongo import MongoClient, errors
13 | from PIL import Image
14 | import requests
15 | from io import BytesIO
16 |
17 |
18 |
def user_input_features():
    """Render the sidebar widgets and return the user's selections.

    Returns:
        (date_, option) - the chosen date and which page to display.
    """
    option = st.sidebar.selectbox(
        'What would you like to see?', ('Results', 'About us'))
    # latest selectable date is 12 days back: predictions lag the raw data feed
    latest_available = datetime.date.today() - timedelta(days=12)
    date_ = st.sidebar.date_input(
        "For which date you want to see the results",
        latest_available,
        min_value=datetime.date(2021, 1, 1),
        max_value=latest_available,
    )
    return date_, option
26 |
# Initialize connection.
# Uses st.experimental_singleton to only run once.
@st.experimental_singleton
def init_connection():
    """Create the MongoDB client, cached across Streamlit reruns.

    The connection string can be overridden via the MONGODB_URL environment
    variable; the default matches the docker-compose credentials.
    NOTE(review): the default hard-codes root/example credentials for the
    local docker-compose stack - do not reuse them in production.
    """
    import os  # local import keeps this block self-contained
    url = os.environ.get('MONGODB_URL', 'mongodb://root:example@mongodb:27017/')
    return MongoClient(url)
33 |
34 |
@st.experimental_memo(ttl=600)
def get_data(date_):
    """Fetch all prediction documents for the given date string.

    Results are cached by Streamlit for 10 minutes (ttl=600).
    """
    db = client.cygnss
    query = {"event_date": {"$eq": date_}}
    cursor = db.cygnss_collection.find(query)
    # materialize the cursor so st.experimental_memo can cache the result
    return list(cursor)
43 |
44 |
45 | date_, option = user_input_features()
46 |
47 |
48 | # Pull data from the collection.
49 | # Uses st.experimental_memo to only rerun when the query changes or after 10 min.
50 | # Initializing connection
51 | client = init_connection()
52 |
53 | date_ = date_.strftime("%Y-%m-%d")
54 |
55 | # drop database if exists, just to not clutter it with multiple values of data
56 | # client.drop_database('cygnss')
57 | items = get_data(date_)
58 |
59 | if option == 'About us':
60 |
61 |
62 | st.write("""
63 | # About US""")
64 |
65 | st.write("The objective of this website is to use a pre-trained CyGNSSnet \
66 | to predict global ocean wind speed in near time. The results are shown on a web interface, \
67 | which provides different illustrations of the predicted wind speed and its error compared to ERA5 windspeed data.\
68 | CyGNSSnet is a neural net developed to predict wind speed from CYGNSS(Cyclone Global Navigation Satellite System) data.\
69 | The code for CyGNSSnet itself is not public. For more information or if you need to access it contact Caroline Arnold (arnold@dkrz.de)\
70 | or the Helmholtz AI consultant team for Earth and Environment (consultant-helmholtz.ai@dkrz.de). For more information on CyGNSSnet,\
71 | see Asgarimehr et al, Remote Sensing of Environment (2022)")
72 |
73 | if option == 'Results':
74 |
75 |
76 | # Display results.
77 | if len(items) == 0:
78 | st.write(f" Data does not exist for this date. Choose a different date please!")
79 |
80 | else:
81 | # Creating UI
82 | # st.subheader('User Input parameters')
83 |
84 | st.write("""
85 | # Results """)
86 |
87 | # app heading
88 | st.write("""
89 | # Ocean Wind Speed""")
90 |
91 | st.write('Date:', date_)
92 |
93 |
94 | y_bins = ["up to 4m/s", "up to 8m/s", "up to 12m/s",
95 | "up to 16m/s", "up to 20m/s", "up to 100m/s"]
96 | for item in items: # @harsh can this be more than 1 item?
97 | st.write(f"Total RMSE is: {item['rmse']:.3f} m/s ")
98 | d = {'Windspeed': y_bins, 'RMSE': item['bin_rmse'], 'Bias': item['bin_bias'],
99 | 'Counts': [int(i) for i in item['bin_counts']]}
100 | df = pd.DataFrame(data=d)
101 | # hide first column (index) of the table
102 | hide_table_row_index = """
103 |
107 | """
108 | st.markdown(hide_table_row_index, unsafe_allow_html=True)
109 | st.table(data=df)
110 |
111 | for item in items:
112 | #response = requests.get(item['image_url'])
113 | # Image.open(BytesIO(response.content))
114 | scatter = Image.open(item['scatterplot_path'])
115 | st.markdown(f"## Scatterplot: ERA5 wind speed - model prediction")
116 | st.image(scatter, caption="Scatterplot")
117 |
118 | histo = Image.open(item['histogram_path'])
119 | st.markdown(f"## Histogram: ERA5 wind speed and predicted wind speed")
120 | st.image(histo, caption="Histogram")
121 |
122 | #era_avg = Image.open(item['era_average_path'])
123 | # st.markdown(f"## ERA 5 Average")
124 | #st.image(era_avg, caption="ERA5 average")
125 |
126 | #rmse_avg = Image.open(item['rmse_average_path'])
127 | # st.markdown(f"## RMSE Average")
128 | #st.image(rmse_avg, caption="RMSE average")
129 |
130 | today_longavg = Image.open(item['today_longrunavg_path'])
131 | st.markdown(f"## RMSE - Today and Longrun Average")
132 | st.image(today_longavg, caption="RMSE - Today and Longrun Average")
133 |
134 | today_long_bias = Image.open(item['today_long_bias_path'])
135 | st.markdown(f"## BIAS - Today and Longrun Average")
136 | st.image(today_long_bias, caption="Bias - Today and Longrun Average")
137 |
138 | sample_counts = Image.open(item['sample_counts_path'])
139 | st.markdown(f"## Sample Counts")
140 | st.image(sample_counts, caption="Sample Counts")
141 |
142 | rmse_bins_era = Image.open(item['rmse_bins_era_path'])
143 | st.markdown(f"## RMSE for different Windspeed Bins")
144 | st.image(rmse_bins_era, caption="RMSE for different Windspeed Bins")
145 |
146 | bias_bins_era = Image.open(item['bias_bins_era_path'])
147 | st.markdown(f"## Bias for different Windspeed Bins")
148 | st.image(bias_bins_era, caption="Bias for different Windspeed Bins")
149 |
150 |
--------------------------------------------------------------------------------
/Preprocessing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Preprocessing CyGNSS data
5 |
6 | import os
7 | import sys
8 | from datetime import datetime, date, timedelta
9 | import argparse
10 |
11 | sys.path.append('externals/gfz_cygnss/')
12 | from gfz_202003.preprocessing import preprocess as prep
13 | #sys.path.append('externals/gfz_cygnss/gfz_202003')
14 | #from preprocessing import preprocess as prep
15 |
16 | import numpy as np
17 | import xarray as xr
18 | import hashlib
19 |
def pre_processing(year, month, day, dev_data_dir='/app/dev_data', raw_data_root='/app/raw_data', annotated_raw_data_root='/app/annotated_raw_data'):
    '''
    Preprocessing routines for CyGNSSnet

    (1) Annotate CyGNSS raw data with windspeed labels from ERA5
    (2) Filter and generate hdf5 file

    Folder structure:

    * raw_data
    * annotated_raw_data
    * dev_data : filtered, one file test_data.h5

    Parameters:
    year, month, day - preprocess the data downloaded for that day (integers)
    dev_data_dir - directory to store the filtered data for that day
    raw_data_root - where to find the downloaded raw data
    annotated_raw_data_root - where to store the annotated raw data

    Returns:
    None - prep.generate_input_data writes the filtered hdf5 file into
    dev_data_dir (the previous docstring claimed a path was returned)
    '''

    # data is organised in <root>/<year>/<day-of-year> subdirectories
    raw_data_sub = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").strftime("%Y/%j")

    raw_data_dir = os.path.join(raw_data_root, raw_data_sub)
    annotated_raw_data_dir = os.path.join(annotated_raw_data_root, raw_data_sub)
    era5_data = os.path.join(raw_data_dir, 'ERA5_windspeed.nc')

    # exist_ok=True already makes these calls idempotent
    os.makedirs(annotated_raw_data_dir, exist_ok=True)
    os.makedirs(dev_data_dir, exist_ok=True)

    for cygnss_file in os.listdir(raw_data_dir):
        if cygnss_file.startswith('cyg') and cygnss_file.endswith('.nc'):
            print("annotating", cygnss_file)

            pcf = os.path.join(raw_data_dir, cygnss_file)
            # the .md5 file doubles as a marker that this file was already annotated
            phf = os.path.join(annotated_raw_data_dir, cygnss_file.replace('.nc', '.md5'))

            print("create hash", phf)

            if os.path.exists(phf):
                print("-- hash exists, skip")
                continue

            annotate_dataset(pcf, era5_data, save_dataset=True)

            # hash the raw input so later runs can detect it was processed
            hmd5 = hash_large_file(pcf)
            with open(phf, 'w') as hf:
                hf.write(hmd5)

    dday = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").strftime("%j")  # day of year, needed below

    args = argparse.Namespace(raw_data_dir=annotated_raw_data_root,
                              output_dir=dev_data_dir,
                              v_map=['brcs', 'eff_scatter', 'raw_counts', 'power_analog'],
                              n_valid_days=0,
                              n_test_days=1,
                              n_processes=1,
                              only_merge=False,
                              use_land_data=False,
                              is_ml_ops=True,
                              version='v3.1',
                              day=dday,
                              year=year,
                              reduce_mode='')

    prep.generate_input_data(args)
94 |
def hash_large_file(file):
    '''
    Read a large file in chunks and compute the MD5 checksum

    The file is read in 8 KiB chunks so arbitrarily large files can be
    hashed with constant memory. (The previous version also printed the
    digest - removed, since the caller receives it as the return value.)

    Parameters:
    file - path to the file to be hashed

    Returns:
    MD5 hex digest string of the file contents
    '''
    file_hash = hashlib.md5()
    with open(file, 'rb') as f:
        # walrus loop: f.read returns b'' at EOF, which ends the loop
        while chunk := f.read(8192):
            file_hash.update(chunk)

    return file_hash.hexdigest()
112 |
def annotate_dataset(cygnss_file, era5_file, save_dataset=False):
    '''
    Annotate a given CyGNSS dataset with ERA5 windspeed labels and save to disk

    The ERA5 grid is padded to mimic periodic boundary conditions in
    longitude before interpolating onto the CyGNSS specular-point track.

    Annotate additional ERA5 parameters (GPM_precipitation)

    TODO: hash

    Parameters:
    cygnss_file : path to CyGNSS dataset (netCDF)
    era5_file : path to corresponding ERA5 dataset (netCDF)
    save_dataset : if True, save dataset to disk in annotated_raw_data_dir (default: False)

    Returns:
    Annotated CyGNSS dataset (xarray.Dataset)
    '''

    # necessary because lazy loading prohibits overwriting the netcdf files at the end of this section
    with xr.open_dataset(cygnss_file) as data:
        cygnss_ds = data.load()

    with xr.open_dataset(era5_file) as data:
        era5_ds = data.load()

    # ERA5 longitude needs to be shifted by 180 for compatibility with the
    # CyGNSS convention -- NOTE(review): assumes the downloaded ERA5 file uses
    # a -180..180 longitude axis; verify against the download request
    era5_ds = era5_ds.assign_coords(longitude=era5_ds.coords['longitude'] + 180)

    # pad to the right (> 360 deg lon)
    era5_r = era5_ds.where(era5_ds.longitude < 10, drop=True)
    # pad to the left (< 0 deg lon)
    era5_l = era5_ds.where(era5_ds.longitude > 350, drop=True)
    # shift coordinate outside bounding box so the merged grid wraps around
    era5_r = era5_r.assign_coords(longitude=era5_r.coords['longitude'] + 360)
    era5_l = era5_l.assign_coords(longitude=era5_l.coords['longitude'] - 360)

    padded_ds = xr.merge([era5_l, era5_ds, era5_r])

    # nearest-neighbour lookup of ERA5 values at each CyGNSS sample's
    # specular-point position and timestamp
    interp_ds = padded_ds.interp(longitude=cygnss_ds.sp_lon, latitude=cygnss_ds.sp_lat, time=cygnss_ds.ddm_timestamp_utc, method='nearest')

    cygnss_ds['ERA5_u10'] = interp_ds['u10']
    cygnss_ds['ERA5_v10'] = interp_ds['v10']
    cygnss_ds['GPM_precipitation'] = interp_ds['tp']

    # mark the copied variables as interpolated in their metadata
    tmp_attrs = cygnss_ds['ERA5_u10'].attrs
    tmp_attrs['long_name'] = cygnss_ds['ERA5_u10'].long_name + ' (interpolated)'
    cygnss_ds['ERA5_u10'].attrs = tmp_attrs

    tmp_attrs = cygnss_ds['ERA5_v10'].attrs
    tmp_attrs['long_name'] = cygnss_ds['ERA5_v10'].long_name + ' (interpolated)'
    cygnss_ds['ERA5_v10'].attrs = tmp_attrs

    # drop the ERA5 coordinates that interp() attached to the CyGNSS dataset
    cygnss_ds = cygnss_ds.drop_vars(['longitude', 'latitude', 'time'])

    # dummy values only for preprocessing routine (downstream code expects
    # these variables to exist but does not use their values here)
    cygnss_ds['ERA5_mdts'] = -9999
    cygnss_ds['ERA5_mdww'] = -9999
    cygnss_ds['ERA5_swh'] = -9999
    cygnss_ds['ERA5_shts'] = -9999
    cygnss_ds['ERA5_shww'] = -9999
    cygnss_ds['ERA5_p140121'] = -9999
    cygnss_ds['ERA5_p140124'] = -9999
    cygnss_ds['ERA5_p140127'] = -9999

    # keep only samples with quality_flags == 4 -- NOTE(review): confirm this
    # flag value selects the intended "good" samples in the CyGNSS product
    cygnss_ds = cygnss_ds.where(cygnss_ds['quality_flags'] == 4, drop=True)

    if save_dataset:
        cygnss_ds.to_netcdf(cygnss_file.replace('raw_data', 'annotated_raw_data'))

    return cygnss_ds
185 |
if __name__=='__main__':
    # pre_processing requires year/month/day; the previous call passed no
    # arguments and always raised a TypeError. Read the date from argv.
    if len(sys.argv) != 4:
        sys.exit(f"Usage: {sys.argv[0]} <year> <month> <day>")
    pre_processing(int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]))
188 |
--------------------------------------------------------------------------------
/plots.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import pandas as pd
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | from matplotlib import lines, colors, ticker
7 | import seaborn as sns
8 | import cartopy.crs as ccrs
9 | from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
10 | from mpl_toolkits.axes_grid1 import AxesGrid
11 | import itertools
12 | plt.switch_backend('agg')
13 |
14 | deg = 1 # grid resolution (publication: 1)
15 |
16 | grid_lon = np.arange(-180, 181, deg)
17 | grid_lat = np.arange(-90, 91, deg)
18 |
def average_to_grid2(lon, lat, var, resolution=1, fill_value=-1):
    '''
    Grid a time-dependent variable in lon/lat and average over all counts

    lon - time series of lon coordinate (1D) (0...360)
    lat - time series of lat coordinate (1D)
    var - time series of variable (1D)
    resolution - target grid resolution (default: 1 deg)
    fill_value - a value that can be used for filling (i.e. that does not show up in var)

    Returns:
    2D gridded arrays for lat, lon, count-averaged var (NaN where no samples fell)
    '''

    assert len(lon) == len(lat)
    assert len(lon) == len(var)

    grid_lon = np.arange(0, 360+resolution, resolution)
    grid_lat = np.arange(-90, 90+resolution, resolution)[::-1] # top left is +lat

    # np.digitize returns indices in 1..len(bins); samples sitting exactly on the
    # last edge (lon == 360, lat == -90) would index out of bounds, so clip.
    ix_lon = np.clip(np.digitize(lon, grid_lon), 0, len(grid_lon) - 1)
    ix_lat = np.clip(np.digitize(lat, grid_lat), 0, len(grid_lat) - 1)

    xx, yy = np.meshgrid(grid_lon, grid_lat, indexing='ij')
    gridded_var = np.full(xx.shape, float(fill_value))

    ij = itertools.product(np.unique(ix_lon), np.unique(ix_lat))

    for i,j in ij:
        cond = (ix_lon==i) & (ix_lat==j)
        # guard: the product of unique indices contains combinations with no
        # samples; np.mean on an empty slice would warn and produce NaN
        if np.any(cond):
            gridded_var[i,j] = np.mean(var[cond])

    # mark empty cells as NaN for plotting
    gridded_var[gridded_var==fill_value] = np.nan

    return xx, yy, gridded_var
56 |
def make_scatterplot(y_true, y_pred, date_):
    '''
    Hexbin density plot of predicted vs. ERA5 wind speed.

    Saves the figure to /app/plots/scatter_{date_}.png.

    y_true - ERA5 reference wind speeds (1D, m/s)
    y_pred - predicted wind speeds (1D, m/s)
    date_  - date label used in the output file name
    '''
    ymin = 2.5
    ymax = 25.0

    fig=plt.figure()
    ax=fig.add_subplot(111)

    img=ax.hexbin(y_true, y_pred, cmap='viridis', norm=colors.LogNorm(vmin=1, vmax=25000), mincnt=1)
    clb=plt.colorbar(img)
    clb.set_ticks([1, 10, 100, 1000, 10000])
    clb.set_ticklabels([r'$1$', r'$10$', r'$10^2$', r'$10^3$', r'$10^4$'])
    clb.set_label('Samples in bin')
    clb.ax.tick_params()

    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Predicted wind speed (m/s)')

    # 1:1 reference line
    ax.plot(np.linspace(0, 30), np.linspace(0, 30), 'w:')

    ax.set_ylim(ymin, ymax)
    ax.set_xlim(ymin, ymax)

    ax.set_xticks([5, 10, 15, 20, 25])
    ax.set_xticklabels([5, 10, 15, 20, 25])
    ax.set_yticks([5, 10, 15, 20, 25])
    ax.set_yticklabels([5, 10, 15, 20, 25])

    fig.tight_layout()
    plt.savefig(f'/app/plots/scatter_{date_}.png')
    # savefig does not release the figure; with the agg backend figures would
    # otherwise accumulate across repeated pipeline runs
    plt.close(fig)
86 |
def make_histogram(y_true, y_pred, date_):
    '''
    Overlaid histograms of ERA5 and predicted wind speed.

    Saves the figure to /app/plots/histo_{date_}.png.
    '''
    fig=plt.figure()
    ax=fig.add_subplot(111)

    sns.histplot(y_true, ax=ax, color='C7', label='ERA5 wind speed (m/s)')
    sns.histplot(y_pred, ax=ax, color='C2', label='Predicted wind speed (m/s)')

    ax.legend(fontsize=12)

    ax.set_xticks([5, 10, 15, 20, 25])
    ax.set_xticklabels([5, 10, 15, 20, 25])
    ax.set_xlabel('ERA5 wind speed (m/s)')

    plt.savefig(f'/app/plots/histo_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
101 |
def era_average(y_true, sp_lon, sp_lat, date_):
    '''
    World map of the grid-cell-averaged ERA5 wind speed.

    Saves the figure to /app/plots/era_average_{date_}.png.

    NOTE(review): the data is gridded on 0..360 longitudes by average_to_grid2
    but plotted against the module-level grid_lon (-180..180) under a
    PlateCarree(180) projection -- confirm the implied 180 deg shift is intended.
    '''
    xx, yy, gridded_y_true = average_to_grid2(sp_lon[:], sp_lat[:], y_true[:], resolution=deg)
    proj = ccrs.PlateCarree(180)

    fig, ax = plt.subplots(1, 1, figsize=(6,4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))
    cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_y_true[:].T, levels=60, transform=proj, antialiased=False, cmap='magma')
    ax.coastlines()
    gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')
    gl.top_labels = False
    gl.right_labels= False
    clb = plt.colorbar(cmap, ax=ax, orientation='horizontal', shrink=1, label='Average ERA5 wind speed (m/s)')

    clb.set_ticks(np.arange(2.5, 18, 2.5))
    clb.ax.tick_params(labelsize=8)

    gl.xlabel_style = {'size': 8, 'color': 'black'}
    gl.ylabel_style = {'size': 8, 'color': 'black'}

    plt.savefig(f'/app/plots/era_average_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
121 |
def rmse_average(y_true, y_pred, sp_lon, sp_lat, date_=None):
    '''
    World map of the grid-cell-averaged absolute prediction error.

    y_true, y_pred - reference and predicted wind speeds (1D, m/s)
    sp_lon, sp_lat - specular point coordinates (1D)
    date_ - optional date label; when given, the figure is saved to
            /app/plots/rmse_average_{date_}.png. (Previously this function,
            unlike every sibling plot function, never wrote the figure to
            disk, although prefect-deploy.py stores that path in MongoDB.)
    '''
    xx, yy, gridded_rmse = average_to_grid2(sp_lon[:], sp_lat[:], np.abs(y_pred[:] - y_true[:]), resolution=deg)
    proj = ccrs.PlateCarree(180)
    fig, ax = plt.subplots(1, 1, figsize=(6,4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))
    cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_rmse[:].T, levels=60, transform=proj, antialiased=False, cmap='viridis')
    ax.coastlines()
    gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')
    gl.top_labels = False
    gl.right_labels= False
    clb = plt.colorbar(cmap, ax=ax, orientation='horizontal', shrink=1, label='Average RMSE (m/s)')

    clb.set_ticks(np.arange(0, np.nanmax(gridded_rmse)+1, 1.0))
    clb.ax.tick_params(labelsize=8)

    gl.xlabel_style = {'size': 8, 'color': 'black'}
    gl.ylabel_style = {'size': 8, 'color': 'black'}

    if date_ is not None:
        plt.savefig(f'/app/plots/rmse_average_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
138 |
139 |
def today_longrunavg(df_mockup, y_bins, date_):
    '''
    Bar plot comparing today's per-bin RMSE with the long-running average.

    Saves the figure to /app/plots/today_longrunavg_{date_}.png.

    df_mockup - table with 'bins', 'rmse' and 'time' columns (see rmse_over_time)
    y_bins - wind speed bin edges used for the x tick labels
    '''
    fig=plt.figure(figsize=(10,4))
    ax=fig.add_subplot(111)

    sns.barplot(data=df_mockup, x='bins', y='rmse', hue='time', ax=ax)
    ax.legend()

    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('RMSE (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/today_longrunavg_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
155 |
def today_longrunavg_bias(df_mockup, y_bins, date_):
    '''
    Bar plot comparing today's per-bin bias with the long-running average.

    Saves the figure to /app/plots/today_long_bias_{date_}.png.
    '''
    fig=plt.figure(figsize=(10,4))
    ax=fig.add_subplot(111)

    sns.barplot(data=df_mockup, x='bins', y='bias', hue='time', ax=ax)
    ax.legend()

    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Bias (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/today_long_bias_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
171 |
def sample_counts(df_rmse, y_bins, date_):
    '''
    Bar plot of the number of samples per wind speed bin.

    Saves the figure to /app/plots/sample_counts_{date_}.png.
    '''
    fig=plt.figure(figsize=(10,4))
    ax=fig.add_subplot(111)
    sns.barplot(data=df_rmse, x='bins', y='counts', ax=ax)
    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Sample counts')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/sample_counts_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
184 |
def rmse_bins_era(df_rmse, y_bins, date_):
    '''
    Bar plot of today's RMSE per wind speed bin.

    Saves the figure to /app/plots/rmse_bins_era_{date_}.png.
    '''
    fig=plt.figure(figsize=(10,4))
    ax=fig.add_subplot(111)
    sns.barplot(data=df_rmse, x='bins', y='rmse', ax=ax)
    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('RMSE (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/rmse_bins_era_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
197 |
def bias_bins_era(df_rmse, y_bins, date_):
    '''
    Bar plot of today's bias per wind speed bin.

    Saves the figure to /app/plots/bias_bins_era_{date_}.png.
    '''
    fig=plt.figure(figsize=(10,4))
    ax=fig.add_subplot(111)
    sns.barplot(data=df_rmse, x='bins', y='bias', ax=ax)
    ax.set_xlabel('ERA5 wind speed (m/s)')
    ax.set_ylabel('Bias (m/s)')

    ax.set_xticks(range(len(y_bins)))
    ax.set_xticklabels([f'< {yy} m/s' for yy in y_bins])

    plt.savefig(f'/app/plots/bias_bins_era_{date_}.png')
    plt.close(fig)  # prevent figure accumulation across runs
210 |
--------------------------------------------------------------------------------
/API.py:
--------------------------------------------------------------------------------
1 | import xarray as xr
2 | import numpy as np
3 | import os
4 | import sys
5 | from datetime import date, timedelta, datetime
6 |
7 | from subscriber import podaac_access as pa
8 | import cdsapi
9 | from urllib.error import HTTPError
10 | from urllib.request import urlretrieve
11 | import logging
12 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
13 |
def download_raw_data(year, month, day, raw_data_root='/app/raw_data'):
    '''
    Download raw data using API

    * CyGNSS data
    * ERA5 data

    For compliance with the CyGNSSnet preprocessing routines, the data is stored in

    > {raw_data_root}/{year}/{day-of-year}

    Parameters:
    year, month, day - download data from the full day specified
    raw_data_root - root of path to store the data
    '''

    day_start = datetime(year, month, day)
    day_end = day_start + timedelta(1)

    # CyGNSSnet expects data under <root>/<year>/<day-of-year>
    raw_data_dir = os.path.join(raw_data_root, day_start.strftime("%Y/%j"))

    print('Downloading data in this directory: ', raw_data_dir)

    start_date = day_start.strftime("%Y-%m-%dT%H:%M:%SZ")
    end_date = day_end.strftime("%Y-%m-%dT%H:%M:%SZ")

    print(f'--start-date {start_date}')
    print(f'--end-date {end_date}')

    # PODAAC data
    adapted_podaac_downloader(start_date, end_date, raw_data_dir)

    # ERA5 data
    era5_downloader(year, month, day, raw_data_dir)
47 |
48 |
def era5_downloader(year, month, day, raw_data_dir):
    '''
    ERA5 data downloader from Copernicus (CDS API)

    We need to download all the time steps of the current day, as well as the
    first two time steps of the following day. The two files are merged into
    ERA5_windspeed.nc so the day boundary at midnight is covered.

    Parameters:
    year, month, day - download data from the full day specified
    raw_data_dir - directory to store the data
    '''

    print("Start ERA5 download")
    target_data = os.path.join(raw_data_dir, 'ERA5_windspeed.nc')  # merged output file
    era5_data = os.path.join(raw_data_dir, 'ERA5_today.nc')
    tomorrow_era5_data = os.path.join(raw_data_dir, 'ERA5_tomorrow.nc')
    cds = cdsapi.Client()

    # Retrieve today's data: hourly 10m u/v wind and total precipitation,
    # restricted to the latitude band 50N..50S
    cds.retrieve(
        'reanalysis-era5-single-levels',
        {
            'product_type': 'reanalysis',
            'format': 'netcdf',
            'variable': [
                '10m_u_component_of_wind', '10m_v_component_of_wind',
                'total_precipitation',
            ],
            'year': year,
            'month': month,
            'day': day,
            'time': [
                '00:00', '01:00', '02:00',
                '03:00', '04:00', '05:00',
                '06:00', '07:00', '08:00',
                '09:00', '10:00', '11:00',
                '12:00', '13:00', '14:00',
                '15:00', '16:00', '17:00',
                '18:00', '19:00', '20:00',
                '21:00', '22:00', '23:00'
            ],
            'area': [
                50, -180, -50, 180,
            ],
        },
        era5_data)

    # Retrieve tomorrow's data (first two hours only)
    tomorrow = datetime(year, month, day) + timedelta(1)

    cds.retrieve(
        'reanalysis-era5-single-levels',
        {
            'product_type': 'reanalysis',
            'format': 'netcdf',
            'variable': [
                '10m_u_component_of_wind', '10m_v_component_of_wind',
                'total_precipitation',
            ],
            'year': tomorrow.year,
            'month': tomorrow.month,
            'day': tomorrow.day,
            'time': [
                '00:00', '01:00'
            ],
            'area': [
                50, -180, -50, 180,
            ],
        },
        tomorrow_era5_data)

    # Merge today's and tomorrow's data into the target file
    with xr.open_dataset(era5_data) as f1, xr.open_dataset(tomorrow_era5_data) as f2:
        era5_ds = xr.merge([f1.load(), f2.load()])
        era5_ds.to_netcdf(target_data)

    print('SUCCESS: Retrieved ERA5 data')
126 |
127 |
def adapted_podaac_downloader(start_date, end_date, data_path):
    '''
    PODAAC data downloader adapted for CyGNSSnet

    Adapted from the run routine in
    https://github.com/podaac/data-subscriber/blob/main/subscriber/podaac_data_downloader.py

    Searches CMR for CYGNSS_L1_V3.1 granules in the given time range and
    downloads the data and metadata files into data_path. Files that already
    exist locally with a matching checksum are skipped.

    Parameters:
    start_date - download start date in ISO format
    end_date - download end date in ISO format
    data_path - path to store the data
    '''

    # Default values
    page_size = 2000
    edl = pa.edl
    token_url = pa.token_url

    pa.setup_earthdata_login_auth(edl)
    token = pa.get_token(token_url)
    print('Completed PODAAC authentification')

    provider = 'POCLOUD'
    short_name = 'CYGNSS_L1_V3.1'
    extensions = None

    download_limit = None

    verbose = True
    force = False

    if not os.path.isdir(data_path):
        print("NOTE: Making new data directory at " + data_path + "(This is the first run.)")
        os.makedirs(data_path, exist_ok=True)

    temporal_range = pa.get_temporal_range(start_date, end_date,
                                           datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))  # noqa E501
    params = [
        ('page_size', page_size),
        ('sort_key', "-start_date"),
        ('provider', provider),
        ('ShortName', short_name),
        ('temporal', temporal_range),
    ]
    print("Temporal Range: " + temporal_range)

    # TODO bbox: optionally append ('bounding_box', ...) to params

    # If 401 is raised, refresh token and try one more time
    try:
        results = pa.get_search_results(params, verbose)
    except HTTPError as e:
        if e.code == 401:
            token = pa.refresh_token(token, 'podaac-subscriber')
            # params is a list of tuples; the original did
            # `params['token'] = token`, which raises TypeError on a list
            params.append(('token', token))
            results = pa.get_search_results(params, verbose)
        else:
            raise e

    if verbose:
        print(str(results['hits']) + " granules found for " + short_name)  # noqa E501

    # per-granule lists of data URLs (excluding OPeNDAP) and metadata URLs
    downloads_data = [[u['URL'] for u in r['umm']['RelatedUrls'] if
                       u['Type'] == "GET DATA" and ('Subtype' not in u or u['Subtype'] != "OPENDAP DATA")] for r in
                      results['items']]
    downloads_metadata = [[u['URL'] for u in r['umm']['RelatedUrls'] if u['Type'] == "EXTENDED METADATA"] for r in
                          results['items']]
    checksums = pa.extract_checksums(results)

    # flatten the per-granule URL lists (data first, then metadata)
    downloads = [url for sublist in downloads_data + downloads_metadata for url in sublist]

    if len(downloads) >= page_size:
        logging.warning("Only the most recent " + str(
            page_size) + " granules will be downloaded; try adjusting your search criteria (suggestion: reduce time period or spatial region of search) to ensure you retrieve all granules.")

    # filter list based on extension
    if not extensions:
        extensions = pa.extensions
    filtered_downloads = []
    for f in downloads:
        for extension in extensions:
            if f.lower().endswith(extension):
                filtered_downloads.append(f)
                break  # avoid queueing the same file twice if several extensions match

    downloads = filtered_downloads

    print("Found " + str(len(downloads)) + " total files to download")
    if verbose:
        print("Downloading files with extensions: " + str(extensions))

    # Download the files to the data directory in a loop, skipping files that
    # are already present with a matching checksum.
    success_cnt = failure_cnt = skip_cnt = 0
    for f in downloads:
        try:
            output_path = os.path.join(data_path, os.path.basename(f))

            # decide if we should actually download this file (e.g. we may already have the latest version)
            if os.path.exists(output_path) and not force and pa.checksum_does_match(output_path, checksums):
                print(str(datetime.now()) + " SKIPPED: " + f)
                skip_cnt += 1
                continue

            urlretrieve(f, output_path)
            print(str(datetime.now()) + " SUCCESS: " + f)
            success_cnt += 1

            # if limit is set and we're at or over it, stop downloading
            if download_limit and success_cnt >= download_limit:
                break

        except Exception:
            logging.warning(str(datetime.now()) + " FAILURE: " + f, exc_info=True)
            failure_cnt += 1

    print("Downloaded Files: " + str(success_cnt))
    print("Failed Files: " + str(failure_cnt))
    print("Skipped Files: " + str(skip_cnt))
    pa.delete_token(token_url, token)
    print("END\n\n")
264 |
if __name__=='__main__':
    # fetch the day 10 days back (presumably to allow for upstream data
    # publication latency -- confirm with the data providers)
    target_day = date.today() - timedelta(days=10)
    download_raw_data(target_day.year, target_day.month, target_day.day)
268 |
--------------------------------------------------------------------------------
/notebooks/DailyAnalysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "6189727d-4f56-49fc-b2f0-b642097206b3",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import h5py\n",
11 | "from matplotlib import pyplot as plt\n",
12 | "from matplotlib import lines, colors, ticker\n",
13 | "import seaborn as sns\n",
14 | "import numpy as np\n",
15 | "import pandas as pd\n",
16 | "\n",
17 | "from sklearn.metrics import mean_squared_error\n",
18 | "\n",
19 | "import sys\n",
20 | "sys.path.append('../externals/gfz_cygnss/')\n",
21 | "import gfz_202003.utils.mathematics as mat"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "id": "dcf836dd-cf50-43af-bd72-34f151b9b006",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "f_pred = h5py.File('/work/ka1176/caroline/gitlab/cygnss-deployment/prediction/current_predictions.h5', 'r')"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "id": "ac9c112a-931d-46e4-9e54-bf8fa164ed9a",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "f_pred.keys()"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "id": "44e45c03-df95-4b80-868d-ebc01a1d6642",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "y_true = f_pred['y_true'][:]\n",
52 | "y_pred = f_pred['y_pred'][:]\n",
53 | "sp_lon = f_pred['sp_lon'][:]\n",
54 | "sp_lat = f_pred['sp_lat'][:]"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "id": "479629bd-1a80-46fb-af33-e806db8be955",
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "rmse = mean_squared_error(y_true, y_pred, squared=False)\n",
65 | "\n",
66 | "print(f'Overall root mean square error (RMSE): {rmse:.4f} m/s')"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "id": "88ef8364-a0b7-4c92-a9b2-2d1485a4c54d",
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "y_bins = [4, 8, 12, 16, 20, 100]\n",
77 | "y_ix = np.digitize(y_true, y_bins, right=False)\n",
78 | "\n",
79 | "all_rmse = np.zeros(len(y_bins))\n",
80 | "all_bias = np.zeros(len(y_bins))\n",
81 | "all_counts = np.zeros(len(y_bins))"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "id": "794d4be3-e785-4c9a-ac2b-bd5eb7ad795e",
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "for i, yy in enumerate(y_bins):\n",
92 | " if np.any(y_ix==i):\n",
93 | " rmse = mean_squared_error(y_true[y_ix==i], y_pred[y_ix==i], squared=False)\n",
94 | " all_rmse[i] = rmse\n",
95 | " all_bias[i] = np.mean(y_pred[y_ix==i] - y_true[y_ix==i])\n",
96 | " all_counts[i] = np.sum(y_ix==i)\n",
97 | " print(f'RMSE in bin {i} (up to {yy} m/s): {rmse:.4f} m/s')\n",
98 | " else:\n",
99 | " all_rmse[i] = None\n",
100 | " all_bias[i] = None\n",
101 | " all_counts[i] = 0\n",
102 | " print(f\"--- No samples in bin {i} (up to {yy} m/s)\")\n",
103 | " \n",
104 | "df_rmse = pd.DataFrame(dict(rmse=all_rmse, bias=all_bias, bins=y_bins, counts=all_counts))"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "id": "718f101f-8c42-4ade-9d5c-e0650490ca9b",
111 | "metadata": {},
112 | "outputs": [],
113 | "source": []
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "id": "9512aff8-2d0f-48a7-944d-8fa9e0c9cbf0",
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "sns.set_style('whitegrid')\n",
123 | "sns.set_context('talk')"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "id": "f7281124-4ae0-4075-a0f4-5cc526144e36",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "fig=plt.figure()\n",
134 | "ax=fig.add_subplot(111)\n",
135 | "\n",
136 | "sns.histplot(y_true, ax=ax, color='C7', label='ERA5 wind speed (m/s)')\n",
137 | "sns.histplot(y_pred, ax=ax, color='C2', label='Predicted wind speed (m/s)')\n",
138 | "\n",
139 | "ax.legend(fontsize=12)\n",
140 | "\n",
141 | "ax.set_xticks([5, 10, 15, 20, 25])\n",
142 | "ax.set_xticklabels([5, 10, 15, 20, 25])\n",
143 | "ax.set_xlabel('ERA5 wind speed (m/s)')\n",
144 | "\n",
145 | "plt.show()"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "id": "bdf3083d-4881-4552-9174-766235fef0a6",
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "ymin = 2.5\n",
156 | "ymax = 25.0\n",
157 | "\n",
158 | "fig=plt.figure()\n",
159 | "ax=fig.add_subplot(111)\n",
160 | "\n",
161 | "img=ax.hexbin(y_true, y_pred, cmap='viridis', norm=colors.LogNorm(vmin=1, vmax=25000), mincnt=1)\n",
162 | "clb=plt.colorbar(img)\n",
163 | "clb.set_ticks([1, 10, 100, 1000, 10000])\n",
164 | "clb.set_ticklabels([r'$1$', r'$10$', r'$10^2$', r'$10^3$', r'$10^4$'])\n",
165 | "clb.set_label('Samples in bin')\n",
166 | "clb.ax.tick_params()\n",
167 | "\n",
168 | "ax.set_xlabel('ERA5 wind speed (m/s)')\n",
169 | "ax.set_ylabel('Predicted wind speed (m/s)')\n",
170 | "\n",
171 | "ax.plot(np.linspace(0, 30), np.linspace(0, 30), 'r:')\n",
172 | "\n",
173 | "ax.set_ylim(ymin, 25)\n",
174 | "ax.set_xlim(ymin, 25)\n",
175 | "\n",
176 | "ax.set_xticks([5, 10, 15, 20, 25])\n",
177 | "ax.set_xticklabels([5, 10, 15, 20, 25])\n",
178 | "ax.set_yticks([5, 10, 15, 20, 25])\n",
179 | "ax.set_yticklabels([5, 10, 15, 20, 25])\n",
180 | "\n",
181 | "fig.tight_layout()"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "id": "0460d844-6142-497a-9710-312e2c3da617",
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "import cartopy.crs as ccrs\n",
192 | "from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter\n",
193 | "from mpl_toolkits.axes_grid1 import AxesGrid"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "id": "a2bc579a-bf63-43ba-8547-6e0173f8903c",
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "deg = 1 # grid resolution (publication: 1)\n",
204 | "\n",
205 | "xx, yy, gridded_y_true = mat.average_to_grid2(sp_lon[:], sp_lat[:], y_true[:], resolution=deg)\n",
206 | "xx, yy, gridded_y_pred = mat.average_to_grid2(sp_lon[:], sp_lat[:], y_pred[:], resolution=deg)\n",
207 | "xx, yy, gridded_rmse = mat.average_to_grid2(sp_lon[:], sp_lat[:], np.abs(y_pred[:] - y_true[:]), resolution=deg)\n",
208 | "xx, yy, gridded_bias = mat.average_to_grid2(sp_lon[:], sp_lat[:], y_pred[:] - y_true[:], resolution=deg)"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": null,
214 | "id": "db6e3e4b-1127-4013-9041-27a4f74412d4",
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "grid_lon = np.arange(-180, 181, deg)\n",
219 | "grid_lat = np.arange(-90, 91, deg)"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "id": "2844ffa6-b5f0-4f26-b11d-1c111929b59d",
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "proj = ccrs.PlateCarree(180)\n",
230 | "fig, ax = plt.subplots(1, 1, figsize=(6,4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))\n",
231 | "cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_y_true[:].T, levels=60, transform=proj, antialiased=False, cmap='magma')\n",
232 | "ax.coastlines()\n",
233 | "gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')\n",
234 | "gl.top_labels = False\n",
235 | "gl.right_labels= False\n",
236 | "clb = plt.colorbar(cmap, ax=ax, orientation='horizontal', shrink=1, label='Average ERA5 wind speed (m/s)')\n",
237 | "\n",
238 | "clb.set_ticks(np.arange(2.5, 18, 2.5))\n",
239 | "clb.ax.tick_params(labelsize=8)\n",
240 | "\n",
241 | "gl.xlabel_style = {'size': 8, 'color': 'black'}\n",
242 | "gl.ylabel_style = {'size': 8, 'color': 'black'}\n",
243 | "\n",
244 | "plt.show()"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "id": "e28af2cb-7400-460e-982e-f03f33ebf67c",
251 | "metadata": {},
252 | "outputs": [],
253 | "source": [
254 | "proj = ccrs.PlateCarree(180)\n",
255 | "fig, ax = plt.subplots(1, 1, figsize=(6,4), gridspec_kw=dict(hspace=0.05, wspace=0.1), subplot_kw=dict(projection=proj))\n",
256 | "cmap = ax.contourf(grid_lon[:], grid_lat[::-1][:], gridded_rmse[:].T, levels=60, transform=proj, antialiased=False, cmap='viridis')\n",
257 | "ax.coastlines()\n",
258 | "gl = ax.gridlines(crs=proj, draw_labels=True, linewidth=0, color='gray', alpha=0.5, linestyle=':')\n",
259 | "gl.top_labels = False\n",
260 | "gl.right_labels= False\n",
261 | "clb = plt.colorbar(cmap, ax=ax, orientation='horizontal', shrink=1, label='Average RMSE (m/s)')\n",
262 | "\n",
263 | "clb.set_ticks(np.arange(0, np.nanmax(gridded_rmse)+1, 1.0))\n",
264 | "clb.ax.tick_params(labelsize=8)\n",
265 | "\n",
266 | "gl.xlabel_style = {'size': 8, 'color': 'black'}\n",
267 | "gl.ylabel_style = {'size': 8, 'color': 'black'}\n",
268 | "\n",
269 | "plt.show()"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "id": "937f9d18-12fa-436a-baea-f92af46d5e87",
276 | "metadata": {},
277 | "outputs": [],
278 | "source": []
279 | }
280 | ],
281 | "metadata": {
282 | "kernelspec": {
283 | "display_name": "CyGNSS Deployment",
284 | "language": "python",
285 | "name": "cygnss-d"
286 | },
287 | "language_info": {
288 | "codemirror_mode": {
289 | "name": "ipython",
290 | "version": 3
291 | },
292 | "file_extension": ".py",
293 | "mimetype": "text/x-python",
294 | "name": "python",
295 | "nbconvert_exporter": "python",
296 | "pygments_lexer": "ipython3",
297 | "version": "3.9.13"
298 | }
299 | },
300 | "nbformat": 4,
301 | "nbformat_minor": 5
302 | }
303 |
--------------------------------------------------------------------------------
/prefect-deploy.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pydoc import cli
3 | import sys
4 | import shutil
5 | import time
6 | import pandas as pd
7 | import numpy as np
8 | import h5py
9 | import torch
10 | from torch.utils.data import DataLoader, Dataset
11 | import pytorch_lightning as pl
12 | from pytorch_lightning.callbacks.model_summary import ModelSummary
13 | from sklearn.metrics import mean_squared_error
14 | from collections import namedtuple
15 | import xarray
16 | import mlflow
17 | from prefect import flow, task
18 | import streamlit as st
19 | # TODO Fix these imports
20 | # from prefect.deployments import DeploymentSpec
21 | #from prefect.flow_runners import SubprocessFlowRunner
22 | from prefect.orion.schemas.schedules import IntervalSchedule, CronSchedule
23 | from prefect.deployments import Deployment
24 | from prefect.filesystems import RemoteFileSystem
25 | from prefect.infrastructure import DockerContainer
26 | from prefect.task_runners import SequentialTaskRunner
27 | from pymongo import MongoClient, errors
28 | from API import download_raw_data
29 | from datetime import datetime, timedelta, date
30 | sys.path.append('/app/externals/gfz_cygnss/')
31 | sys.path.append('/app/externals/gfz_cygnss/gfz_202003')
32 | sys.path.append('/app/externals/gfz_cygnss/gfz_202003/training')
33 |
34 | from cygnssnet import ImageNet, DenseNet, CyGNSSNet, CyGNSSDataset, CyGNSSDataModule
35 | from plots import make_scatterplot, make_histogram, era_average, rmse_average, today_longrunavg, today_longrunavg_bias, sample_counts, rmse_bins_era, bias_bins_era
36 | #import plots
37 | from Preprocessing import pre_processing
38 |
@task
def download_data(year, month, day):
    '''Prefect task: download the day's raw CyGNSS and ERA5 data via API.py.'''
    # Using API calls
    download_raw_data(year, month, day)
43 |
@task
def get_data(client):
    '''Prefect task: print the stored RMSE of every document in cygnss.cygnss_collection.'''
    collection = client.cygnss.cygnss_collection
    documents = list(collection.find())  # make hashable for st.experimental_memo
    for doc in documents:
        print(f"RMSE is: {doc['rmse']}")
51 |
52 |
@task
def drop_database(client):
    '''Prefect task: irreversibly drop the whole 'cygnss' MongoDB database.'''
    client.drop_database('cygnss')
56 |
@task
@st.experimental_singleton
def save_to_db(domain, port, y_pred, rmse, date_, rmse_time):
    '''
    Store the daily prediction results and plot file paths in MongoDB.

    Parameters:
    domain, port - MongoDB host and port
    y_pred - predicted wind speeds (array-like with .tolist())
    rmse - overall RMSE (numpy scalar/array)
    date_ - date label; keys the document and the plot file names
    rmse_time - table with per-bin 'rmse', 'bias' and 'counts' columns

    Connection timeouts are caught and logged so the flow can continue.
    '''
    client = None
    # use a try-except indentation to catch MongoClient() errors
    try:
        print('entering mongo db connection')

        client = MongoClient(
            host = [ str(domain) + ":" + str(port) ],
            serverSelectionTimeoutMS = 3000, # 3 second timeout
            username = "root",
            password = "example",
        )

        # uncomment if you want to clear out the data
        #client.drop_database('cygnss')

        # print the version of MongoDB server if connection successful
        print("server version:", client.server_info()["version"])
        data = {
            "rmse": rmse.tolist(),
            "bin_rmse": rmse_time["rmse"].tolist(),
            "bin_bias": rmse_time["bias"].tolist(),
            "bin_counts": rmse_time["counts"].tolist(),
            "event_date": date_,
            "scatterplot_path": f"/app/plots/scatter_{date_}.png",
            "histogram_path": f"/app/plots/histo_{date_}.png",
            "era_average_path": f"/app/plots/era_average_{date_}.png",
            "rmse_average_path": f"/app/plots/rmse_average_{date_}.png",
            "today_longrunavg_path": f"/app/plots/today_longrunavg_{date_}.png",
            "today_long_bias_path": f"/app/plots/today_long_bias_{date_}.png",
            "sample_counts_path": f"/app/plots/sample_counts_{date_}.png",
            "rmse_bins_era_path": f"/app/plots/rmse_bins_era_{date_}.png",
            "bias_bins_era_path": f"/app/plots/bias_bins_era_{date_}.png",
            "y_pred": y_pred.tolist()
        }

        # keep the collection handle and the insert result separate (the
        # original rebound `cygnss_collection` to the InsertManyResult)
        result = client["cygnss"].cygnss_collection.insert_many([data])

        print(f"Inserted documents with ids: {result.inserted_ids}")

    except errors.ServerSelectionTimeoutError as err:
        # catch pymongo.errors.ServerSelectionTimeoutError
        print(err)
    finally:
        # always release the connection; the original leaked one client per call
        if client is not None:
            client.close()
108 |
109 |
@task
def get_hyper_params(model_path, model, data_path):
    '''
    Load hyperparameters from a trained CyGNSSnet checkpoint.

    Parameters:
    model_path - directory containing the checkpoint file
    model - checkpoint file name
    data_path - evaluation data path; overrides the value stored in the checkpoint

    Returns:
    args - namedtuple exposing the (patched) hyperparameters as attributes
    col_idx_lat, col_idx_lon - column indices of 'sp_lat'/'sp_lon' in v_par_eval
    '''
    # Note for future: for fixed model write h_params in config file
    # map to CPU so a GPU-trained checkpoint loads on a CPU-only worker
    checkpoint = torch.load(os.path.join(model_path, model),
                            map_location=torch.device("cpu"))
    # point the model at today's data and keep the dataloader single-threaded
    checkpoint['hyper_parameters']["data"] = data_path
    checkpoint['hyper_parameters']["num_workers"] = 1
    col_idx_lat = checkpoint["hyper_parameters"]["v_par_eval"].index('sp_lat')
    col_idx_lon = checkpoint["hyper_parameters"]["v_par_eval"].index('sp_lon')
    # expose the hyperparameter dict via attribute access (args.model, args.data, ...)
    args = namedtuple("ObjectName", checkpoint['hyper_parameters'].keys())\
        (*checkpoint['hyper_parameters'].values())
    return args, col_idx_lat, col_idx_lon
122 |
@task
def get_backbone(args, input_shapes):
    '''
    Construct the CyGNSSNet backbone selected by args.model.

    Raises:
    ValueError - if args.model is neither 'cnn' nor 'dense' (previously an
                 unknown value fell through and raised UnboundLocalError
                 on the return statement)
    '''
    if args.model=='cnn':
        backbone = ImageNet(args, input_shapes)
    elif args.model=='dense':
        backbone = DenseNet(args, input_shapes)
    else:
        raise ValueError(f"Unknown model type: {args.model!r} (expected 'cnn' or 'dense')")
    return backbone
130 |
@task
def make_predictions(test_loader, model):
    '''
    Prefect task: evaluate the model on the test loader and return the
    predictions as a flat numpy array.
    '''
    runner = pl.Trainer(enable_progress_bar=False)
    runner.test(model=model, dataloaders=test_loader)  # logs test metrics
    batch_preds = runner.predict(model=model, dataloaders=[test_loader])
    return torch.cat(batch_preds).detach().cpu().numpy().squeeze()
138 |
@task
def rmse_bins(y_true, y_pred, y_bins):
    '''
    Prefect task: compute RMSE, bias and sample count per wind speed bin.

    Returns a DataFrame with columns rmse, bias, bins, counts (NaN rmse/bias
    for empty bins).
    '''
    # Find the indices for the windspeed bins - below 12 m/s, below 16 m/s, above 16 m/s
    bin_ix = np.digitize(y_true, y_bins, right=False)

    n_bins = len(y_bins)
    rmse_vals = np.zeros(n_bins)
    bias_vals = np.zeros(n_bins)
    count_vals = np.zeros(n_bins)

    for b in range(n_bins):
        mask = bin_ix == b
        if mask.any():
            rmse_vals[b] = mean_squared_error(y_true[mask], y_pred[mask], squared=False)
            bias_vals[b] = np.mean(y_pred[mask] - y_true[mask])
            count_vals[b] = mask.sum()
        else:
            rmse_vals[b] = None
            bias_vals[b] = None
            count_vals[b] = 0

    return pd.DataFrame(dict(rmse=rmse_vals, bias=bias_vals, bins=y_bins, counts=count_vals))
160 |
161 | @task
def rmse_over_time(y_bins, df_rmse):
    """Combine today's per-bin scores with a mocked long-running average.

    Tags df_rmse (mutated in place) with time="today", fabricates a second
    frame by jittering rmse/bias with uniform noise in [-0.5, 0.5) and
    scaling counts by 1000, and returns both stacked into one DataFrame.
    """
    # mock up data that represents the long running average rmse
    df_rmse["time"] = "today"

    n_bins = len(y_bins)
    long_run = pd.DataFrame(dict(bins=y_bins,
                                 rmse=df_rmse["rmse"] + np.random.rand(n_bins) - 0.5,
                                 bias=df_rmse["bias"] + np.random.rand(n_bins) - 0.5,
                                 counts=df_rmse["counts"] * 1000))
    long_run["time"] = "long-running average"

    return pd.concat([df_rmse, long_run], ignore_index=True)
174 |
175 | @task
def make_plots(y, y_pred, date_, df_mockup, df_rmse, y_bins):
    """Render all daily monitoring figures for the given date.

    Each helper is called for its side effect only (presumably saving a
    figure under the plots directory created by the flow — confirm in
    plots.py); nothing is returned.
    """
    # Prediction-vs-truth diagnostics.
    for truth_plot in (make_scatterplot, make_histogram):
        truth_plot(y, y_pred, date_)
    #era_average(y, sp_lon, sp_lat, date_)
    #rmse_average(y, y_pred, sp_lon, sp_lat, date_)
    # Today vs. long-running-average comparisons.
    for comparison_plot in (today_longrunavg, today_longrunavg_bias):
        comparison_plot(df_mockup, y_bins, date_)
    # Per-bin statistics (the *_era suffix suggests comparison against ERA
    # reference data — verify in plots.py).
    for bin_plot in (sample_counts, rmse_bins_era, bias_bins_era):
        bin_plot(df_rmse, y_bins, date_)
186 |
187 | @task
def remove(data_dirs=("/app/raw_data", "/app/annotated_raw_data", "/app/dev_data")):
    """Delete the intermediate data directories produced during a pipeline run.

    Args:
        data_dirs: directories to delete recursively. Defaults to the three
                   container paths used by this flow, so existing callers
                   (remove()) are unaffected.

    Raises:
        FileNotFoundError/OSError: if a directory is missing or cannot be
        removed — ignore_errors=False keeps cleanup failures visible.
    """
    for data_dir in data_dirs:
        shutil.rmtree(data_dir, ignore_errors=False, onerror=None)
192 |
193 | @flow
def main():
    """Daily CyGNSS inference flow.

    Downloads one day of raw CyGNSS data (12 days behind today), preprocesses
    it into an HDF5 test set, runs the pretrained CyGNSSNet model on CPU,
    logs the overall RMSE to MLflow, renders monitoring plots, stores the
    results in MongoDB, and finally deletes the intermediate data.
    """
    # TODO: Set these settings for prefect, to make paths relative instead of global
    # prefect config set PREFECT_LOCAL_STORAGE_PATH="/your/custom/path"
    # prefect config set PREFECT_HOME="/your/custom/path"

    # create directory for plots, if it does not exist
    if not os.path.isdir('/app/plots'):
        os.makedirs('/app/plots', exist_ok=True)

    # write a file in app directory to check its write permission and where files are stored
    with open("/app/app_write_test.txt", "w") as file:
        file.write("app_write_test")
        file.write(os.getcwd())
        file.write(os.path.dirname(__file__))
        print(file.name)

    # Define the date and pass it to the individual tasks.
    # NOTE(review): an earlier comment said "10th day" but the code uses a
    # 12-day lag — presumably to allow for data-delivery latency; confirm.
    download_date = date.today() - timedelta(days=12)
    date_ = download_date.strftime("%Y-%m-%d")

    # Download one full day of raw data for the computed lag date
    download_data(year=download_date.year, month=download_date.month, day=download_date.day)

    # annotate data
    # create filtered hdf5 from preprocessing
    data_path = '/app/dev_data/'
    pre_processing(download_date.year, download_date.month, download_date.day, data_path)

    model_path = '/app/externals/gfz_cygnss/trained_models/'
    model = 'ygambdos_yykDM.ckpt'
    # NOTE(review): h5_file is opened but never read or closed in this flow —
    # verify whether the handle is needed (rdcc_nbytes=0 disables chunk caching).
    h5_file = h5py.File(os.path.join(data_path, 'test_data.h5'), 'r', rdcc_nbytes=0)

    mlflow.set_tracking_uri("sqlite:///mlruns.db") # TODO: change this to other db
    mlflow.set_experiment("cygnss")


    # get hyperparameters from the checkpoint (.submit() runs it as a Prefect task;
    # .result() blocks until the task finishes)
    args, col_idx_lat, col_idx_lon = get_hyper_params.submit(model_path, model, data_path).result()

    cdm = CyGNSSDataModule(args)
    cdm.setup(stage='test')
    input_shapes = cdm.get_input_shapes(stage='test')
    backbone = get_backbone.submit(args, input_shapes).result()

    # load model weights on CPU and switch to inference mode
    cygnss_model = CyGNSSNet.load_from_checkpoint(os.path.join(model_path, model),
                         map_location=torch.device('cpu'),
                         args=args,
                         backbone=backbone)
    cygnss_model.eval()

    test_loader = cdm.test_dataloader()
    # make predictions
    y_pred = make_predictions(test_loader, cygnss_model)

    # get true labels
    dataset = CyGNSSDataset('test', args)
    y = dataset.y

    # calculate per-bin RMSE for today and a mocked long-running average
    y_bins = [4, 8, 12, 16, 20, 100]
    df_rmse = rmse_bins.submit(y, y_pred, y_bins).result()
    df_mockup = rmse_over_time.submit(y_bins, df_rmse).result()
    # log the day's overall RMSE to MLflow
    with mlflow.start_run():
        rmse = mean_squared_error(y, y_pred, squared=False)
        mlflow.log_metric('rmse', rmse)

    # make plots
    # NOTE(review): sp_lat/sp_lon are currently unused — they feed the map
    # plots commented out inside make_plots; remove or re-enable together.
    sp_lat = test_loader.dataset.v_par_eval[:, col_idx_lat]
    sp_lon = test_loader.dataset.v_par_eval[:, col_idx_lon]
    make_plots(y, y_pred, date_, df_mockup, df_rmse, y_bins)
    DOMAIN = 'mongodb'
    PORT = 27017

    # Save results to the mongo database
    save_to_db(domain=DOMAIN, port=PORT, y_pred=y_pred, \
               rmse=rmse, date_=date_, rmse_time=df_rmse)

    # delete downloaded and annotated files
    remove()
274 |
if __name__ == "__main__":
    # Register the flow with Prefect: run nightly at 03:00 Berlin time on
    # the "demo" work queue. apply() only registers the deployment; it does
    # not execute the flow here.
    nightly_schedule = CronSchedule(cron='0 3 * * *', timezone='Europe/Berlin')
    deployment = Deployment.build_from_flow(
        flow=main,
        name="cygnss",
        work_queue_name="demo",
        schedule=nightly_schedule,
    )
    deployment.apply()
    # main()
285 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: cygnss-d
2 | channels:
3 | - pytorch
4 | - conda-forge
5 | - defaults
6 | dependencies:
7 | - _libgcc_mutex=0.1=main
8 | - _openmp_mutex=5.1=1_gnu
9 | - absl-py=1.2.0=pyhd8ed1ab_0
10 | - aiohttp=3.8.1=py39hb9d737c_1
11 | - aiosignal=1.2.0=pyhd8ed1ab_0
12 | - async-timeout=4.0.2=pyhd8ed1ab_0
13 | - attrs=22.1.0=pyh71513ae_1
14 | - blas=1.0=mkl
15 | - blinker=1.4=py_1
16 | - bottleneck=1.3.5=py39h7deecbd_0
17 | - brotli=1.0.9=h5eee18b_7
18 | - brotli-bin=1.0.9=h5eee18b_7
19 | - brotlipy=0.7.0=py39h27cfd23_1003
20 | - bzip2=1.0.8=h7b6447c_0
21 | - c-ares=1.18.1=h7f8727e_0
22 | - ca-certificates=2022.9.24=ha878542_0
23 | - cachetools=5.2.0=pyhd8ed1ab_0
24 | - cartopy=0.18.0=py39h0d9ca2b_1
25 | - certifi=2022.6.15.1=pyhd8ed1ab_0
26 | - cffi=1.15.1=py39h74dc2b5_0
27 | - cftime=1.5.1.1=py39hce1f21e_0
28 | - colorama=0.4.5=pyhd8ed1ab_0
29 | - cryptography=37.0.1=py39h9ce1e76_0
30 | - cudatoolkit=11.3.1=h2bc3f7f_2
31 | - curl=7.84.0=h5eee18b_0
32 | - cycler=0.11.0=pyhd3eb1b0_0
33 | - dbus=1.13.18=hb2f20db_0
34 | - expat=2.4.4=h295c915_0
35 | - ffmpeg=4.3=hf484d3e_0
36 | - fftw=3.3.9=h27cfd23_1
37 | - fontconfig=2.13.1=h6c09931_0
38 | - fonttools=4.25.0=pyhd3eb1b0_0
39 | - freetype=2.11.0=h70c0345_0
40 | - fsspec=2022.11.0=pyhd8ed1ab_0
41 | - future=0.18.2=py39h06a4308_1
42 | - geos=3.8.0=he6710b0_0
43 | - giflib=5.2.1=h7b6447c_0
44 | - glib=2.69.1=h4ff587b_1
45 | - gmp=6.2.1=h295c915_3
46 | - gnutls=3.6.15=he1e5248_0
47 | - google-auth=2.11.0=pyh6c4a22f_0
48 | - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
49 | - gst-plugins-base=1.14.0=h8213a91_2
50 | - gstreamer=1.14.0=h28cd5cc_2
51 | - h5py=3.7.0=py39h737f45e_0
52 | - hdf4=4.2.13=h3ca952b_2
53 | - hdf5=1.10.6=h3ffc7dd_1
54 | - icu=58.2=he6710b0_3
55 | - idna=3.3=pyhd3eb1b0_0
56 | - intel-openmp=2021.4.0=h06a4308_3561
57 | - jpeg=9e=h7f8727e_0
58 | - kiwisolver=1.4.2=py39h295c915_0
59 | - krb5=1.19.2=hac12032_0
60 | - lame=3.100=h7b6447c_0
61 | - lcms2=2.12=h3be6417_0
62 | - ld_impl_linux-64=2.38=h1181459_1
63 | - lerc=3.0=h295c915_0
64 | - libbrotlicommon=1.0.9=h5eee18b_7
65 | - libbrotlidec=1.0.9=h5eee18b_7
66 | - libbrotlienc=1.0.9=h5eee18b_7
67 | - libclang=10.0.1=default_hb85057a_2
68 | - libcurl=7.84.0=h91b91d3_0
69 | - libdeflate=1.8=h7f8727e_5
70 | - libedit=3.1.20210910=h7f8727e_0
71 | - libev=4.33=h7f8727e_1
72 | - libevent=2.1.12=h8f2d780_0
73 | - libffi=3.3=he6710b0_2
74 | - libgcc-ng=11.2.0=h1234567_1
75 | - libgfortran-ng=11.2.0=h00389a5_1
76 | - libgfortran5=11.2.0=h1234567_1
77 | - libgomp=11.2.0=h1234567_1
78 | - libiconv=1.16=h7f8727e_2
79 | - libidn2=2.3.2=h7f8727e_0
80 | - libllvm10=10.0.1=hbcb73fb_5
81 | - libnetcdf=4.8.1=h42ceab0_1
82 | - libnghttp2=1.46.0=hce63b2e_0
83 | - libpng=1.6.37=hbc83047_0
84 | - libpq=12.9=h16c4e8d_3
85 | - libprotobuf=3.15.8=h780b84a_1
86 | - libssh2=1.10.0=h8f2d780_0
87 | - libstdcxx-ng=11.2.0=h1234567_1
88 | - libtasn1=4.16.0=h27cfd23_0
89 | - libtiff=4.4.0=hecacb30_0
90 | - libunistring=0.9.10=h27cfd23_0
91 | - libuuid=1.0.3=h7f8727e_2
92 | - libwebp=1.2.2=h55f646e_0
93 | - libwebp-base=1.2.2=h7f8727e_0
94 | - libxcb=1.15=h7f8727e_0
95 | - libxkbcommon=1.0.1=hfa300c1_0
96 | - libxml2=2.9.14=h74e7548_0
97 | - libxslt=1.1.35=h4e12654_0
98 | - libzip=1.8.0=h5cef20c_0
99 | - lz4-c=1.9.3=h295c915_1
100 | - markdown=3.4.1=pyhd8ed1ab_0
101 | - markupsafe=2.1.1=py39hb9d737c_1
102 | - matplotlib=3.5.2=py39h06a4308_0
103 | - matplotlib-base=3.5.2=py39hf590b9c_0
104 | - mkl=2021.4.0=h06a4308_640
105 | - mkl-service=2.4.0=py39h7f8727e_0
106 | - mkl_fft=1.3.1=py39hd3c417c_0
107 | - mkl_random=1.2.2=py39h51133e4_0
108 | - multidict=6.0.2=py39hb9d737c_1
109 | - munkres=1.1.4=py_0
110 | - ncurses=6.3=h5eee18b_3
111 | - netcdf4=1.5.7=py39ha0f2276_1
112 | - nettle=3.7.3=hbbd107a_1
113 | - ninja=1.10.2=h06a4308_5
114 | - ninja-base=1.10.2=hd09550d_5
115 | - nspr=4.33=h295c915_0
116 | - nss=3.74=h0370c37_0
117 | - numexpr=2.8.3=py39h807cd23_0
118 | - numpy=1.22.3=py39he7a7128_0
119 | - numpy-base=1.22.3=py39hf524024_0
120 | - oauthlib=3.2.1=pyhd8ed1ab_0
121 | - openh264=2.1.1=h4ff587b_0
122 | - openssl=1.1.1s=h7f8727e_0
123 | - packaging=21.3=pyhd3eb1b0_0
124 | - pandas=1.4.3=py39h6a678d5_0
125 | - pcre=8.45=h295c915_0
126 | - pillow=9.2.0=py39hace64e9_1
127 | - pip=22.1.2=py39h06a4308_0
128 | - ply=3.11=py39h06a4308_0
129 | - proj=6.2.1=hc80f0dc_0
130 | - pyasn1=0.4.8=py_0
131 | - pycparser=2.21=pyhd3eb1b0_0
132 | - pyjwt=2.4.0=pyhd8ed1ab_0
133 | - pyopenssl=22.0.0=pyhd3eb1b0_0
134 | - pyparsing=3.0.9=py39h06a4308_0
135 | - pyqt=5.15.7=py39h6a678d5_1
136 | - pyqt5-sip=12.11.0=py39h6a678d5_1
137 | - pyshp=2.3.1=pyhd8ed1ab_0
138 | - pysocks=1.7.1=py39h06a4308_0
139 | - python=3.9.13=haa1d7c7_1
140 | - python-dateutil=2.8.2=pyhd3eb1b0_0
141 | - python_abi=3.9=2_cp39
142 | - pytorch=1.12.1=py3.9_cuda11.3_cudnn8.3.2_0
143 | - pytorch-mutex=1.0=cuda
144 | - pytz=2022.1=py39h06a4308_0
145 | - pyu2f=0.1.5=pyhd8ed1ab_0
146 | - pyyaml=6.0=py39hb9d737c_4
147 | - qt-main=5.15.2=h327a75a_7
148 | - qt-webengine=5.15.9=hd2b0992_4
149 | - qtwebkit=5.212=h4eab89a_4
150 | - readline=8.1.2=h7f8727e_1
151 | - requests=2.28.1=py39h06a4308_0
152 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0
153 | - rsa=4.9=pyhd8ed1ab_0
154 | - scipy=1.7.3=py39h6c91a56_2
155 | - seaborn=0.11.2=pyhd3eb1b0_0
156 | - shapely=1.8.4=py39h81ba7c5_0
157 | - sip=6.6.2=py39h6a678d5_0
158 | - six=1.16.0=pyhd3eb1b0_1
159 | - sqlite=3.39.2=h5082296_0
160 | - tensorboard=2.10.0=pyhd8ed1ab_2
161 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
162 | - tk=8.6.12=h1ccaba5_0
163 | - toml=0.10.2=pyhd3eb1b0_0
164 | - torchaudio=0.12.1=py39_cu113
165 | - torchmetrics=0.9.3=pyhd8ed1ab_0
166 | - torchvision=0.13.1=py39_cu113
167 | - tornado=6.1=py39hb9d737c_3
168 | - tqdm=4.64.1=pyhd8ed1ab_0
169 | - typing-extensions=4.3.0=py39h06a4308_0
170 | - typing_extensions=4.3.0=py39h06a4308_0
171 | - werkzeug=2.2.2=pyhd8ed1ab_0
172 | - wheel=0.37.1=pyhd3eb1b0_0
173 | - xarray=0.20.1=pyhd3eb1b0_1
174 | - xz=5.2.5=h7f8727e_1
175 | - yaml=0.2.5=h7f98852_2
176 | - zipp=3.8.1=pyhd8ed1ab_0
177 | - zlib=1.2.12=h5eee18b_3
178 | - zstd=1.5.2=ha4553b6_0
179 | - pip:
180 | - aiobotocore==2.4.0
181 | - aiofiles==22.1.0
182 | - aiohttp-cors==0.7.0
183 | - aioitertools==0.11.0
184 | - aiosqlite==0.17.0
185 | - alembic==1.8.1
186 | - altair==4.2.0
187 | - anyio==3.6.1
188 | - appdirs==1.4.4
189 | - argon2-cffi==21.3.0
190 | - argon2-cffi-bindings==21.2.0
191 | - asgi-lifespan==1.0.1
192 | - astor==0.8.1
193 | - asttokens==2.0.8
194 | - asyncpg==0.26.0
195 | - azure-core==1.25.1
196 | - azure-storage-blob==12.13.1
197 | - backcall==0.2.0
198 | - beautifulsoup4==4.11.1
199 | - bleach==5.0.1
200 | - blessed==1.19.1
201 | - boto3==1.24.75
202 | - botocore==1.27.59
203 | - cdsapi==0.5.1
204 | - charset-normalizer==2.1.1
205 | - click==8.0.4
206 | - cloudpickle==2.2.0
207 | - colorful==0.5.4
208 | - commonmark==0.9.1
209 | - contextlib2==21.6.0
210 | - coolname==1.1.0
211 | - croniter==1.3.7
212 | - dask==2022.9.1
213 | - databricks-cli==0.17.3
214 | - debugpy==1.6.3
215 | - decorator==5.1.1
216 | - defusedxml==0.7.1
217 | - distlib==0.3.6
218 | - distributed==2022.9.1
219 | - docker==5.0.3
220 | - entrypoints==0.4
221 | - executing==1.0.0
222 | - fastapi==0.85.0
223 | - fastjsonschema==2.16.1
224 | - filelock==3.8.0
225 | - flask==2.2.2
226 | - frozenlist==1.3.1
227 | - gitdb==4.0.9
228 | - gitpython==3.1.27
229 | - global-land-mask==1.0.0
230 | - google-api-core==2.10.1
231 | - google-cloud-core==2.3.2
232 | - google-cloud-storage==2.5.0
233 | - google-crc32c==1.5.0
234 | - google-resumable-media==2.3.3
235 | - googleapis-common-protos==1.56.4
236 | - gpustat==1.0.0
237 | - greenlet==1.1.3
238 | - griffe==0.21.0
239 | - grpcio==1.43.0
240 | - gunicorn==20.1.0
241 | - h11==0.12.0
242 | - heapdict==1.0.1
243 | - httpcore==0.15.0
244 | - httpx==0.23.0
245 | - hyperopt==0.1.2
246 | - importlib-metadata==4.12.0
247 | - intake==0.6.6
248 | - ipykernel==6.15.2
249 | - ipython==8.5.0
250 | - ipython-genutils==0.2.0
251 | - ipywidgets==8.0.2
252 | - isodate==0.6.1
253 | - itsdangerous==2.1.2
254 | - jedi==0.18.1
255 | - jinja2==3.1.2
256 | - jmespath==1.0.1
257 | - joblib==1.2.0
258 | - json-tricks==3.15.5
259 | - jsonpatch==1.32
260 | - jsonpointer==2.3
261 | - jsonschema==4.16.0
262 | - jupyter==1.0.0
263 | - jupyter-client==7.3.5
264 | - jupyter-console==6.4.4
265 | - jupyter-core==4.11.1
266 | - jupyterlab-pygments==0.2.2
267 | - jupyterlab-widgets==3.0.3
268 | - kubernetes==24.2.0
269 | - llvmlite==0.39.1
270 | - locket==1.0.0
271 | - lxml==4.9.1
272 | - mako==1.2.2
273 | - matplotlib-inline==0.1.6
274 | - mistune==2.0.4
275 | - mlflow==2.0.1
276 | - msgpack==1.0.4
277 | - msrest==0.7.1
278 | - nbclient==0.6.8
279 | - nbconvert==7.0.0
280 | - nbformat==5.4.0
281 | - nest-asyncio==1.5.5
282 | - networkx==2.8.6
283 | - nexusformat==0.7.7
284 | - nni==2.9
285 | - notebook==6.4.12
286 | - numba==0.56.4
287 | - nvidia-ml-py==11.495.46
288 | - opencensus==0.11.0
289 | - opencensus-context==0.1.3
290 | - orjson==3.8.0
291 | - pandocfilters==1.5.0
292 | - parso==0.8.3
293 | - partd==1.3.0
294 | - pathspec==0.10.1
295 | - pendulum==2.1.2
296 | - pexpect==4.8.0
297 | - pickleshare==0.7.5
298 | - platformdirs==2.5.2
299 | - podaac-data-subscriber==1.12.0
300 | - pooch==1.6.0
301 | - prefect==2.4.0
302 | - prettytable==3.4.1
303 | - prometheus-client==0.13.1
304 | - prometheus-flask-exporter==0.20.3
305 | - prompt-toolkit==3.0.31
306 | - protobuf==3.20.2
307 | - psutil==5.9.2
308 | - ptyprocess==0.7.0
309 | - pure-eval==0.2.2
310 | - py-spy==0.3.14
311 | - pyarrow==9.0.0
312 | - pyasn1-modules==0.2.8
313 | - pydantic==1.10.2
314 | - pydeck==0.8.0b3
315 | - pydeprecate==0.3.1
316 | - pygments==2.13.0
317 | - pymongo==4.2.0
318 | - pympler==1.0.1
319 | - pyrsistent==0.18.1
320 | - python-slugify==6.1.2
321 | - pythonwebhdfs==0.2.3
322 | - pytorch-lightning==1.5.10
323 | - pytz-deprecation-shim==0.1.0.post0
324 | - pytzdata==2020.1
325 | - pyzmq==23.2.1
326 | - qtconsole==5.3.2
327 | - qtpy==2.2.0
328 | - querystring-parser==1.2.4
329 | - ray==2.0.0
330 | - readchar==4.0.3
331 | - responses==0.21.0
332 | - rfc3986==1.5.0
333 | - rich==12.5.1
334 | - s3fs==2022.11.0
335 | - s3transfer==0.6.0
336 | - schema==0.7.5
337 | - scikit-learn==1.1.2
338 | - semver==2.13.0
339 | - send2trash==1.8.0
340 | - setuptools==59.5.0
341 | - shap==0.41.0
342 | - simplejson==3.17.6
343 | - sklearn==0.0
344 | - slack-sdk==3.18.3
345 | - slicer==0.0.7
346 | - smart-open==6.2.0
347 | - smmap==5.0.0
348 | - sniffio==1.3.0
349 | - sortedcontainers==2.4.0
350 | - soupsieve==2.3.2.post1
351 | - sqlalchemy==1.4.41
352 | - sqlparse==0.4.2
353 | - stack-data==0.5.0
354 | - starlette==0.20.4
355 | - streamlit==1.12.2
356 | - tabulate==0.8.10
357 | - tblib==1.7.0
358 | - tenacity==8.0.1
359 | - tensorboard-data-server==0.6.1
360 | - terminado==0.15.0
361 | - text-unidecode==1.3
362 | - threadpoolctl==3.1.0
363 | - tinycss2==1.1.1
364 | - toolz==0.12.0
365 | - traitlets==5.4.0
366 | - typeguard==2.13.3
367 | - typer==0.6.1
368 | - tzdata==2022.2
369 | - tzlocal==4.2
370 | - urllib3==1.26.12
371 | - uvicorn==0.18.3
372 | - validators==0.20.0
373 | - virtualenv==20.16.5
374 | - watchdog==2.1.9
375 | - wcwidth==0.2.5
376 | - webencodings==0.5.1
377 | - websocket-client==1.4.1
378 | - websockets==10.3
379 | - widgetsnbextension==4.0.3
380 | - wrapt==1.14.1
381 | - yarl==1.8.1
382 | - zict==2.2.0
383 | prefix: /home/harsh/anaconda3/envs/cygnss-d
384 |
--------------------------------------------------------------------------------
/notebooks/Preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "261e2e39-ae3e-4b92-8dc5-b163f61eea25",
6 | "metadata": {},
7 | "source": [
8 | "# Preprocessing CyGNSS data\n",
9 | "\n",
10 | "Data is downloaded from NASA EarthCloud as described in the `APIs` notebook. For the expected format for CyGNSSnet, additional preprocessing steps are necessary."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 10,
16 | "id": "084a2e3e-9f9f-4844-9e28-c60e30314494",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import os\n",
21 | "import sys\n",
22 | "sys.path.append('../externals/gfz_cygnss/')"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 11,
28 | "id": "06128178",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "# !pip install tenacity"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 12,
38 | "id": "c0bbb084-5e0b-41a9-a337-684f832d6f85",
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "ename": "TypeError",
43 | "evalue": " is not a generic class",
44 | "output_type": "error",
45 | "traceback": [
46 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
47 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
48 | "Input \u001b[0;32mIn [12]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgfz_202003\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpreprocessing\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m preprocess \u001b[38;5;28;01mas\u001b[39;00m prep\n",
49 | "File \u001b[0;32m~/Downloads/DKRZ/MLOps/2020-03-gfz-remote-sensing/gfz_202003/preprocessing/preprocess.py:9\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrandom\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01margparse\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mxr\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdatetime\u001b[39;00m\n",
50 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/__init__.py:1\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m testing, tutorial\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbackends\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 3\u001b[0m load_dataarray,\n\u001b[1;32m 4\u001b[0m load_dataset,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 8\u001b[0m save_mfdataset,\n\u001b[1;32m 9\u001b[0m )\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbackends\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrasterio_\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m open_rasterio\n",
51 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/testing.py:9\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m duck_array_ops, formatting, utils\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataarray\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataArray\n\u001b[1;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxarray\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n",
52 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/core/duck_array_ops.py:26\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m take, tensordot, transpose, unravel_index \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m where \u001b[38;5;28;01mas\u001b[39;00m _where\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dask_array_compat, dask_array_ops, dtypes, npcompat, nputils\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnputils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m nanfirst, nanlast\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpycompat\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m cupy_array_type, dask_array_type, is_duck_dask_array\n",
53 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/xarray/core/npcompat.py:72\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtyping\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_dtype_like\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _DTypeLikeNested, _ShapeLike, _SupportsDType\n\u001b[1;32m 51\u001b[0m \u001b[38;5;66;03m# Xarray requires a Mapping[Hashable, dtype] in many places which\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;66;03m# conflics with numpys own DTypeLike (with dtypes for fields).\u001b[39;00m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;66;03m# https://numpy.org/devdocs/reference/typing.html#numpy.typing.DTypeLike\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;66;03m# This is a copy of this DTypeLike that allows only non-Mapping dtypes.\u001b[39;00m\n\u001b[1;32m 55\u001b[0m DTypeLikeSave \u001b[38;5;241m=\u001b[39m Union[\n\u001b[1;32m 56\u001b[0m np\u001b[38;5;241m.\u001b[39mdtype,\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# default data type (float64)\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 59\u001b[0m \u001b[38;5;66;03m# array-scalar types and generic types\u001b[39;00m\n\u001b[1;32m 60\u001b[0m Type[Any],\n\u001b[1;32m 61\u001b[0m \u001b[38;5;66;03m# character codes, type strings or comma-separated fields, e.g., 'float64'\u001b[39;00m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28mstr\u001b[39m,\n\u001b[1;32m 63\u001b[0m \u001b[38;5;66;03m# (flexible_dtype, itemsize)\u001b[39;00m\n\u001b[1;32m 64\u001b[0m Tuple[_DTypeLikeNested, \u001b[38;5;28mint\u001b[39m],\n\u001b[1;32m 65\u001b[0m \u001b[38;5;66;03m# (fixed_dtype, shape)\u001b[39;00m\n\u001b[1;32m 66\u001b[0m Tuple[_DTypeLikeNested, _ShapeLike],\n\u001b[1;32m 67\u001b[0m \u001b[38;5;66;03m# (base_dtype, new_dtype)\u001b[39;00m\n\u001b[1;32m 
68\u001b[0m Tuple[_DTypeLikeNested, _DTypeLikeNested],\n\u001b[1;32m 69\u001b[0m \u001b[38;5;66;03m# because numpy does the same?\u001b[39;00m\n\u001b[1;32m 70\u001b[0m List[Any],\n\u001b[1;32m 71\u001b[0m \u001b[38;5;66;03m# anything with a dtype attribute\u001b[39;00m\n\u001b[0;32m---> 72\u001b[0m \u001b[43m_SupportsDType\u001b[49m\u001b[43m[\u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m]\u001b[49m,\n\u001b[1;32m 73\u001b[0m ]\n\u001b[1;32m 74\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[1;32m 75\u001b[0m \u001b[38;5;66;03m# fall back for numpy < 1.20, ArrayLike adapted from numpy.typing._array_like\u001b[39;00m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Protocol\n",
54 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/typing.py:261\u001b[0m, in \u001b[0;36m_tp_cache..inner\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 259\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 260\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m \u001b[38;5;66;03m# All real errors (not unhashable args) are raised below.\u001b[39;00m\n\u001b[0;32m--> 261\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
55 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/typing.py:897\u001b[0m, in \u001b[0;36mGeneric.__class_getitem__\u001b[0;34m(cls, params)\u001b[0m\n\u001b[1;32m 893\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 894\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mParameters to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m[...] must all be unique\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 895\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 896\u001b[0m \u001b[38;5;66;03m# Subscripting a regular Generic subclass.\u001b[39;00m\n\u001b[0;32m--> 897\u001b[0m \u001b[43m_check_generic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 898\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _GenericAlias(\u001b[38;5;28mcls\u001b[39m, params)\n",
56 | "File \u001b[0;32m~/anaconda3/envs/mypython3/lib/python3.8/site-packages/typing_extensions.py:95\u001b[0m, in \u001b[0;36m_check_generic\u001b[0;34m(cls, parameters, elen)\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m elen \u001b[38;5;129;01mis\u001b[39;00m _marker:\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mcls\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__parameters__\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__parameters__:\n\u001b[0;32m---> 95\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m is not a generic class\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 96\u001b[0m elen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m__parameters__)\n\u001b[1;32m 97\u001b[0m alen \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(parameters)\n",
57 | "\u001b[0;31mTypeError\u001b[0m: is not a generic class"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "from gfz_202003.preprocessing import preprocess as prep"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "id": "39eac3de-096b-4b73-8491-232d3e0667b0",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "import numpy as np\n",
73 | "import h5py\n",
74 | "from matplotlib import pyplot as plt\n",
75 | "import seaborn as sns\n",
76 | "\n",
77 | "import datetime\n",
78 | "import xarray as xr\n",
79 | "\n",
80 | "import argparse"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 7,
86 | "id": "3cad0fa3-0ba5-4b35-ba86-29f7bee68e71",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "import cdsapi"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "id": "7c8c7e2a-ea30-499e-a259-73aae365be5d",
96 | "metadata": {},
97 | "source": [
98 | "## Download raw CyGNSS data\n",
99 | "\n",
100 | "The CyGNSSnet preprocessing routine expects the raw data files ordered as \n",
101 | "\n",
102 | "> `$raw_data_dir///cyg*.nc`\n",
103 | "\n",
104 | "Data is always downloaded for one full day for all spacecraft, generating 8 `netcdf` files per day of observations. Below is a routine to specify a date range, followed by downloading the corresponding data and storing it in the appropriate subfolders."
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 6,
110 | "id": "efcfbe84-843d-4550-b22f-fbfaad434694",
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "raw_data_root = '/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/raw_data'\n",
115 | "dev_data_root = '/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/dev_data'"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "id": "2fdc36c1-d1e1-4bab-8a04-a91f8759637f",
121 | "metadata": {},
122 | "source": [
123 |     "Select a test day and prepare the input parameters for the provided download script"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 7,
129 | "id": "f47387cf-999d-44a5-9c15-8cf2c7886e07",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "year = 2021\n",
134 | "month = 3\n",
135 | "day = 17"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "id": "871eddb6-6022-4273-8dba-93c911f78598",
141 | "metadata": {},
142 | "source": [
143 |     "The download target directory is in the expected format `year/day-of-year`"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 8,
149 | "id": "84e34efb-f067-4964-89cb-f6ccb556e681",
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "name": "stdout",
154 | "output_type": "stream",
155 | "text": [
156 | "/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/raw_data/2021/168\n"
157 | ]
158 | }
159 | ],
160 | "source": [
161 | "raw_data_sub = datetime.datetime.strptime(f\"{year}-{month}-{day}\", \"%Y-%m-%d\").strftime(\"%Y/%j\")\n",
162 | "\n",
163 | "raw_data_dir = os.path.join(raw_data_root, raw_data_sub)\n",
164 | "\n",
165 | "print(raw_data_dir)"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "id": "cad54be9-afc6-43b9-a841-2bfefddc81f5",
171 | "metadata": {},
172 | "source": [
173 | "Start and end date of download range in the required format. The end date is midnight the next day, this way only the requested day's data is downloaded."
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 9,
179 | "id": "e7da91cd-1c9b-479d-91a2-3598b58765ac",
180 | "metadata": {},
181 | "outputs": [
182 | {
183 | "name": "stdout",
184 | "output_type": "stream",
185 | "text": [
186 | "--start-date 2021-06-17T00:00:00Z\n",
187 | "--end-date 2021-06-18T00:00:00Z\n"
188 | ]
189 | }
190 | ],
191 | "source": [
192 | "start_date = datetime.datetime(year, month, day).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
193 |     "end_date = (datetime.datetime(year, month, day) + datetime.timedelta(days=1)).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
194 | "\n",
195 | "print(f'--start-date {start_date}')\n",
196 | "print(f'--end-date {end_date}')"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "id": "15055733-3e11-4bd5-9402-e721be9aba0c",
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "dday = datetime.datetime.strptime(f\"{year}-{month}-{day}\", \"%Y-%m-%d\").strftime(\"%j\") # need that later\n",
207 | "dday"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "id": "99c1420e-e2c3-4c68-a53e-0b98e94d3a45",
214 | "metadata": {},
215 | "outputs": [
216 | {
217 | "name": "stdout",
218 | "output_type": "stream",
219 | "text": [
220 | "env: PYTHONPATH=/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/data-subscriber\n",
221 | "[2022-09-12 16:00:57,433] {podaac_data_downloader.py:243} INFO - Found 7 total files to download\n",
222 | "[2022-09-12 16:00:59,062] {podaac_access.py:446} WARNING - Computed checksum f11baba7acac4b5b14b3891e83f715c8 does not match expected checksum 10e4ef36d29f030ea7e524f8924389fc\n",
223 | "[2022-09-12 16:01:46,860] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:01:46.860919 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg06.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
224 | "[2022-09-12 16:01:48,483] {podaac_access.py:446} WARNING - Computed checksum 9b3100d23550d03cb85056609ecddd5b does not match expected checksum a8851840f3a4bbdc8499ea2f17d5119b\n",
225 | "[2022-09-12 16:02:39,804] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:02:39.804552 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg08.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
226 | "[2022-09-12 16:02:41,684] {podaac_access.py:446} WARNING - Computed checksum fdaaa0486c6932b1a62c087edaecd64f does not match expected checksum a08d25babf87b328b96a850bfacbcc53\n",
227 | "[2022-09-12 16:03:31,252] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:03:31.252143 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg02.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
228 | "[2022-09-12 16:03:33,101] {podaac_access.py:446} WARNING - Computed checksum 881d6ad8374fea406dc72b27775e124f does not match expected checksum 7eef541250b6f137d8ace0e99e12eaf2\n",
229 | "[2022-09-12 16:04:15,389] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:04:15.389899 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg03.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
230 | "[2022-09-12 16:04:17,154] {podaac_access.py:446} WARNING - Computed checksum cf78c6b618423cf8410b43eeddfb5c63 does not match expected checksum 25dd31a5b59b5444a509ead3a359a8a5\n",
231 | "[2022-09-12 16:05:04,669] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:05:04.669819 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg04.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
232 | "[2022-09-12 16:05:06,367] {podaac_access.py:446} WARNING - Computed checksum 3dc2ce38484b3438c18d5491d6a68984 does not match expected checksum e7ae44462212498cab741a6dbd4624e8\n",
233 | "[2022-09-12 16:06:03,144] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:06:03.144241 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg07.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
234 | "[2022-09-12 16:06:04,762] {podaac_access.py:446} WARNING - Computed checksum 8cc2e314df20dec61110dc4290da3cc1 does not match expected checksum 32fddfe78b55e4ee302cf37fa7d0bf9b\n",
235 | "[2022-09-12 16:07:04,082] {podaac_data_downloader.py:276} INFO - 2022-09-12 16:07:04.082807 SUCCESS: https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/CYGNSS_L1_V3.1/cyg01.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc\n",
236 | "[2022-09-12 16:07:04,082] {podaac_data_downloader.py:287} INFO - Downloaded Files: 7\n",
237 | "[2022-09-12 16:07:04,082] {podaac_data_downloader.py:288} INFO - Failed Files: 0\n",
238 | "[2022-09-12 16:07:04,083] {podaac_data_downloader.py:289} INFO - Skipped Files: 0\n",
239 | "[2022-09-12 16:07:05,046] {podaac_access.py:122} INFO - CMR token successfully deleted\n",
240 | "[2022-09-12 16:07:05,047] {podaac_data_downloader.py:299} INFO - END\n",
241 | "\n",
242 | "\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "%env PYTHONPATH=/home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/data-subscriber\n",
248 | "!python /home/harsh/Downloads/DKRZ/MLOps/2022-cygnss-deployment/data-subscriber/subscriber/podaac_data_downloader.py -c CYGNSS_L1_V3.1 -d $raw_data_dir --start-date $start_date --end-date $end_date"
249 | ]
250 | },
251 | {
252 | "cell_type": "markdown",
253 | "id": "520bceeb-7e10-4802-96c7-96995aa933e2",
254 | "metadata": {},
255 | "source": [
256 | "## Download raw ERA5 data\n",
257 | "\n",
258 | "The preprocessing pipeline requires the ERA5 windspeed labels. Download the raw ERA5 data for the same timespan."
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 1,
264 | "id": "d2511baa-ade0-43bf-8e9b-953251c164fe",
265 | "metadata": {},
266 | "outputs": [
267 | {
268 | "ename": "NameError",
269 | "evalue": "name 'os' is not defined",
270 | "output_type": "error",
271 | "traceback": [
272 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
273 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
274 | "Input \u001b[0;32mIn [1]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m era5_data \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(raw_data_dir, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mERA5_windspeed.nc\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
275 | "\u001b[0;31mNameError\u001b[0m: name 'os' is not defined"
276 | ]
277 | }
278 | ],
279 | "source": [
280 | "era5_data = os.path.join(raw_data_dir, 'ERA5_windspeed.nc')"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "id": "b8e06265-ffa4-4b9c-a5de-b8c9151a9387",
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "cds = cdsapi.Client()"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 2,
296 | "id": "3c9e2d27-609c-454b-8131-10427c89ab9d",
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "ename": "NameError",
301 | "evalue": "name 'cds' is not defined",
302 | "output_type": "error",
303 | "traceback": [
304 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
305 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
306 | "Input \u001b[0;32mIn [2]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcds\u001b[49m\u001b[38;5;241m.\u001b[39mretrieve(\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mreanalysis-era5-single-levels\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 3\u001b[0m {\n\u001b[1;32m 4\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mproduct_type\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mreanalysis\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mformat\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnetcdf\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 6\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mvariable\u001b[39m\u001b[38;5;124m'\u001b[39m: [\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10m_u_component_of_wind\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10m_v_component_of_wind\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 8\u001b[0m ],\n\u001b[1;32m 9\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124myear\u001b[39m\u001b[38;5;124m'\u001b[39m: year,\n\u001b[1;32m 10\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmonth\u001b[39m\u001b[38;5;124m'\u001b[39m: month,\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mday\u001b[39m\u001b[38;5;124m'\u001b[39m: day,\n\u001b[1;32m 12\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtime\u001b[39m\u001b[38;5;124m'\u001b[39m: [\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m00:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m01:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m02:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m03:00\u001b[39m\u001b[38;5;124m'\u001b[39m, 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m04:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m05:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 15\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m06:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m07:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m08:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 16\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m09:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m11:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 17\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m12:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m13:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m14:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 18\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m15:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m16:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m17:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 19\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m18:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m19:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m20:00\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m 20\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m21:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m22:00\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m23:00\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 21\u001b[0m ],\n\u001b[1;32m 22\u001b[0m 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124marea\u001b[39m\u001b[38;5;124m'\u001b[39m: [\n\u001b[1;32m 23\u001b[0m \u001b[38;5;241m40\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m180\u001b[39m, \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m40\u001b[39m, \u001b[38;5;241m180\u001b[39m,\n\u001b[1;32m 24\u001b[0m ],\n\u001b[1;32m 25\u001b[0m },\n\u001b[1;32m 26\u001b[0m era5_data)\n",
307 | "\u001b[0;31mNameError\u001b[0m: name 'cds' is not defined"
308 | ]
309 | }
310 | ],
311 | "source": [
312 | "cds.retrieve(\n",
313 | " 'reanalysis-era5-single-levels',\n",
314 | " {\n",
315 | " 'product_type': 'reanalysis',\n",
316 | " 'format': 'netcdf',\n",
317 | " 'variable': [\n",
318 | " '10m_u_component_of_wind', '10m_v_component_of_wind',\n",
319 | " ],\n",
320 | " 'year': year,\n",
321 | " 'month': month,\n",
322 | " 'day': day,\n",
323 | " 'time': [\n",
324 | " '00:00', '01:00', '02:00',\n",
325 | " '03:00', '04:00', '05:00',\n",
326 | " '06:00', '07:00', '08:00',\n",
327 | " '09:00', '10:00', '11:00',\n",
328 | " '12:00', '13:00', '14:00',\n",
329 | " '15:00', '16:00', '17:00',\n",
330 | " '18:00', '19:00', '20:00',\n",
331 | " '21:00', '22:00', '23:00'\n",
332 | " ],\n",
333 | " 'area': [\n",
334 | " 40, -180, -40, 180,\n",
335 | " ],\n",
336 | " },\n",
337 | " era5_data)"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 3,
343 | "id": "4df67e88-6fd4-48f9-8a2c-921d51fe1c13",
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "ename": "NameError",
348 | "evalue": "name 'xr' is not defined",
349 | "output_type": "error",
350 | "traceback": [
351 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
352 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
353 |       "Input \u001b[0;32mIn [3]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m era5_ds \u001b[38;5;241m=\u001b[39m \u001b[43mxr\u001b[49m\u001b[38;5;241m.\u001b[39mopen_dataset(era5_data)\n\u001b[1;32m 2\u001b[0m era5_ds\n",
354 | "\u001b[0;31mNameError\u001b[0m: name 'xr' is not defined"
355 | ]
356 | }
357 | ],
358 | "source": [
359 | "era5_ds = xr.open_dataset(era5_data)\n",
360 | "era5_ds"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "id": "c958a80b-5845-4a98-8372-1fdc03954a00",
366 | "metadata": {},
367 | "source": [
368 | "## Annotate raw CyGNSS data with windspeed labels"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "id": "ac2b8784-afcd-48ce-9e09-a6b245ae6132",
374 | "metadata": {},
375 | "source": [
376 | "We need to create the data variables `ERA5_u10` and `ERA5_v10` and attach them to the CyGNSS raw data."
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 22,
382 | "id": "ca049baa-b554-40f1-9269-27469c614a76",
383 | "metadata": {},
384 | "outputs": [
385 | {
386 | "data": {
387 | "text/plain": [
388 | "['cyg07.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
389 | " 'ERA5_windspeed.nc',\n",
390 | " 'cyg02.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
391 | " 'cyg04.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
392 | " 'cyg01.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
393 | " 'cyg06.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
394 | " 'CYGNSS_L1_V3.1.citation.txt',\n",
395 | " 'cyg03.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc',\n",
396 | " 'cyg08.ddmi.s20210617-000000-e20210617-235959.l1.power-brcs.a31.d32.nc']"
397 | ]
398 | },
399 | "execution_count": 22,
400 | "metadata": {},
401 | "output_type": "execute_result"
402 | }
403 | ],
404 | "source": [
405 | "os.listdir(raw_data_dir)"
406 | ]
407 | },
408 | {
409 | "cell_type": "markdown",
410 | "id": "50a4f34c-1a20-4580-9721-6838de1626a7",
411 | "metadata": {},
412 | "source": [
413 | "Check units for spacetime coordinates\n",
414 | "* Longitude\n",
415 | " * ERA5: -180 ... 0 ... +180\n",
416 | " * CyGNSS: 0 ... 180 ... 360\n",
417 | "* Latitude\n",
418 | " * ERA5 & CyGNSS: -40 ... 0 ... +40\n",
419 | "* Timestamp\n",
420 | "\n",
421 | "\n",
422 | "--> Need to shift the ERA5 longitude coordinate by 180"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "id": "96ecbfa9-4a93-4821-b65c-05fc8697f8d5",
429 | "metadata": {},
430 | "outputs": [],
431 | "source": [
432 | "def annotate_dataset(cygnss_file, era5_file, save_dataset=False):\n",
433 | " '''\n",
434 | " Annotate a given CyGNSS dataset with ERA5 windspeed labels and save to disk\n",
435 | " \n",
436 | " Parameters:\n",
437 | " cygnss_file : path to CyGNSS dataset\n",
438 |     "      era5_file    : path to corresponding ERA5 dataset\n",
439 | " save_dataset : if True, save dataset to disk overwriting cygnss_file (default: False)\n",
440 | " \n",
441 | " Returns:\n",
442 | " Annotated CyGNSS dataset\n",
443 | " '''\n",
444 | " \n",
445 | " # necessary because lazy loading prohibits overwriting the netcdf files at the end of this section\n",
446 | " with xr.open_dataset(cygnss_file) as data:\n",
447 | " cygnss_ds = data.load()\n",
448 | " \n",
449 | " with xr.open_dataset(era5_file) as data:\n",
450 | " era5_ds = data.load()\n",
451 | " \n",
452 | " # needs to be shifted by 180 for compatibility with CyGNSS\n",
453 | " era5_ds = era5_ds.assign_coords(longitude=era5_ds.coords['longitude'] + 180)\n",
454 | " \n",
455 | " interp_ds = era5_ds.interp(longitude=cygnss_ds.sp_lon, latitude=cygnss_ds.sp_lat, time=cygnss_ds.ddm_timestamp_utc)\n",
456 | " \n",
457 | " cygnss_ds['ERA5_u10'] = interp_ds['u10']\n",
458 | " cygnss_ds['ERA5_v10'] = interp_ds['v10']\n",
459 | "\n",
460 | " tmp_attrs = cygnss_ds['ERA5_u10'].attrs\n",
461 | " tmp_attrs['long_name'] = cygnss_ds['ERA5_u10'].long_name + ' (interpolated)'\n",
462 | " cygnss_ds['ERA5_u10'].attrs = tmp_attrs\n",
463 | "\n",
464 | " tmp_attrs = cygnss_ds['ERA5_v10'].attrs\n",
465 | " tmp_attrs['long_name'] = cygnss_ds['ERA5_v10'].long_name + ' (interpolated)'\n",
466 | " cygnss_ds['ERA5_v10'].attrs = tmp_attrs\n",
467 | " \n",
468 | " cygnss_ds = cygnss_ds.drop_vars(['longitude', 'latitude', 'time'])\n",
469 | " \n",
470 | " # dummy values only for preprocessing routine\n",
471 | " cygnss_ds['GPM_precipitation'] = -9999\n",
472 | " cygnss_ds['ERA5_mdts'] = -9999\n",
473 | " cygnss_ds['ERA5_mdww'] = -9999\n",
474 | " cygnss_ds['ERA5_swh'] = -9999\n",
475 | " cygnss_ds['ERA5_shts'] = -9999\n",
476 | " cygnss_ds['ERA5_shww'] = -9999\n",
477 | " cygnss_ds['ERA5_p140121'] = -9999\n",
478 | " cygnss_ds['ERA5_p140124'] = -9999\n",
479 | " cygnss_ds['ERA5_p140127'] = -9999\n",
480 | " \n",
481 | " if save_dataset:\n",
482 | " cygnss_ds.to_netcdf(cygnss_file)\n",
483 | " \n",
484 | " return cygnss_ds"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "id": "3e15d7b1-9d6d-4e1a-9ba6-ea28b9943f28",
491 | "metadata": {},
492 | "outputs": [],
493 | "source": [
494 | "for cygnss_file in os.listdir(raw_data_dir):\n",
495 | " if cygnss_file.startswith('cyg') and cygnss_file.endswith('.nc'):\n",
496 | " print(cygnss_file)\n",
497 | " annotate_dataset(os.path.join(raw_data_dir, cygnss_file), era5_data, save_dataset=True)"
498 | ]
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "id": "41fa8522-6306-45f9-ad34-609a30995765",
503 | "metadata": {},
504 | "source": [
505 | "## Check raw data"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": null,
511 | "id": "d099d73c-53e5-4939-bf93-99105225a0e7",
512 | "metadata": {},
513 | "outputs": [],
514 | "source": [
515 | "from importlib import reload\n",
516 | "reload(prep)\n",
517 | "raw_ds = prep.open_mfdataset(os.path.join(raw_data_dir, cygnss_file))\n",
518 | "\n",
519 | "raw_ds"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "id": "b4b7fbce-05e4-418f-bf29-a294846947d0",
526 | "metadata": {},
527 | "outputs": [],
528 | "source": [
529 | "filtered_ds = prep.apply_quality_filter(raw_ds, is_ml_ops=True)\n",
530 | "filtered_ds"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": null,
536 | "id": "66d21d51-a157-4823-98bc-9160b19627a8",
537 | "metadata": {},
538 | "outputs": [],
539 | "source": [
540 | "os.listdir('/work/ka1176/shared_data/2020-03/raw_data/2021/014/')"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": null,
546 | "id": "c05f455c-a047-49ac-8169-6eee7f0ee38e",
547 | "metadata": {},
548 | "outputs": [],
549 | "source": [
550 | "bu = raw_ds['ddm_brcs_uncert']\n",
551 | "qf = raw_ds['quality_flags']\n",
552 | "st = raw_ds['nst_att_status']\n",
553 | "fom = raw_ds['prn_fig_of_merit']\n",
554 | "les = raw_ds['ddm_les']\n",
555 | "rxg = raw_ds['sp_rx_gain']\n",
556 | "nsca = raw_ds['nbrcs_scatter_area']\n",
557 | "lsca = raw_ds['les_scatter_area']\n",
558 | "lat = raw_ds['sp_lat']\n",
559 | "lon = raw_ds['sp_lon']\n",
560 | "ws = raw_ds['windspeed']"
561 | ]
562 | },
563 | {
564 | "cell_type": "markdown",
565 | "id": "1cb9d80b-5400-4a2c-878b-624fe05040b9",
566 | "metadata": {},
567 | "source": [
568 | "For now, use only the quality flag == 4"
569 | ]
570 | },
571 | {
572 | "cell_type": "code",
573 | "execution_count": null,
574 | "id": "289ceac1-d478-4276-868b-9f246c8222c2",
575 | "metadata": {},
576 | "outputs": [],
577 | "source": [
578 | "quality = (bu<1) & (qf == 4) & (st == 0) & (fom > 3) & (rxg > 0) & (les >= 0)"
579 | ]
580 | },
581 | {
582 | "cell_type": "code",
583 | "execution_count": null,
584 | "id": "8f472924-fcb5-4639-8609-cda8b4fb4a51",
585 | "metadata": {},
586 | "outputs": [],
587 | "source": [
588 | "np.sum((bu<1) & (st==0)).compute()"
589 | ]
590 | },
591 | {
592 | "cell_type": "markdown",
593 | "id": "fb390261-935f-4573-87f8-50b926f873a0",
594 | "metadata": {},
595 | "source": [
596 |     "## Create processed data"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": null,
602 | "id": "5e078c5c-068a-43e5-b1dc-23806c0226e5",
603 | "metadata": {},
604 | "outputs": [],
605 | "source": [
606 | "raw_ds = prep.open_mfdataset(os.path.join(raw_data_dir, 'cyg06*.nc'), channels=[0,1,2,3])"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": null,
612 | "id": "1bbd55a2-ac14-4dbb-8aa5-9c64c64ebbd1",
613 | "metadata": {},
614 | "outputs": [],
615 | "source": [
616 | "dev_data_dir = '/work/ka1176/shared_data/2022-cygnss-deployment/dev_data/'"
617 | ]
618 | },
619 | {
620 | "cell_type": "code",
621 | "execution_count": null,
622 | "id": "3fef8930-282e-4751-8678-18db21ee13f9",
623 | "metadata": {},
624 | "outputs": [],
625 | "source": [
626 | "for ff in os.listdir('/work/ka1176/shared_data/2022-cygnss-deployment/raw_data/2021/168/'):\n",
627 | " tmp = xr.open_dataset(os.path.join('/work/ka1176/shared_data/2022-cygnss-deployment/raw_data/2021/168/', ff))\n",
628 | " if not 'ERA5_u10' in tmp.keys():\n",
629 | " print(ff)"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": null,
635 | "id": "93ebd942-72d9-4344-8496-329f8a9c73c9",
636 | "metadata": {},
637 | "outputs": [],
638 | "source": [
639 | "tmp"
640 | ]
641 | },
642 | {
643 | "cell_type": "code",
644 | "execution_count": null,
645 | "id": "c24d957d-2db5-4fd4-a88d-e18a53457384",
646 | "metadata": {},
647 | "outputs": [],
648 | "source": [
649 | "reload(prep)\n",
650 | "args = argparse.Namespace(raw_data_dir='/work/ka1176/shared_data/2022-cygnss-deployment/raw_data/',\n",
651 | " output_dir=dev_data_dir,\n",
652 | " v_map=['brcs'],\n",
653 | " n_valid_days=0,\n",
654 | " n_test_days=1,\n",
655 | " n_processes=1,\n",
656 | " only_merge=False,\n",
657 | " use_land_data=False,\n",
658 | " is_ml_ops=True,\n",
659 | " version='v3.1',\n",
660 | " day=dday,\n",
661 | " year=year,\n",
662 | " reduce_mode='')\n",
663 | "\n",
664 | "prep.generate_input_data(args)"
665 | ]
666 | },
667 | {
668 | "cell_type": "markdown",
669 | "id": "7ec9da62-5cd7-4f1d-b4eb-a11367748f5d",
670 | "metadata": {},
671 | "source": [
672 | "## Check the new CyGNSS data v3.1"
673 | ]
674 | },
675 | {
676 | "cell_type": "code",
677 | "execution_count": null,
678 | "id": "f6e400c4-a661-409e-8add-19734e6e954c",
679 | "metadata": {},
680 | "outputs": [],
681 | "source": [
682 |     "# TODO: annotate the samples with date (year, month, day, etc.)"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": null,
688 | "id": "11d01ad2-7a19-458d-817d-efadea14d643",
689 | "metadata": {},
690 | "outputs": [],
691 | "source": [
692 | "!conda list env"
693 | ]
694 | },
695 | {
696 | "cell_type": "code",
697 | "execution_count": null,
698 | "id": "00e4867b-2e29-4967-a193-ef2356a151f0",
699 | "metadata": {},
700 | "outputs": [],
701 | "source": []
702 | },
703 | {
704 | "cell_type": "code",
705 | "execution_count": null,
706 | "id": "3ebc77ff-5f86-4962-a058-9fc359cf5b3f",
707 | "metadata": {},
708 | "outputs": [],
709 | "source": []
710 | }
711 | ],
712 | "metadata": {
713 | "kernelspec": {
714 | "display_name": "CyGNSS Deployment",
715 | "language": "python",
716 | "name": "cygnss-d"
717 | },
718 | "language_info": {
719 | "codemirror_mode": {
720 | "name": "ipython",
721 | "version": 3
722 | },
723 | "file_extension": ".py",
724 | "mimetype": "text/x-python",
725 | "name": "python",
726 | "nbconvert_exporter": "python",
727 | "pygments_lexer": "ipython3",
728 | "version": "3.9.13"
729 | }
730 | },
731 | "nbformat": 4,
732 | "nbformat_minor": 5
733 | }
734 |
--------------------------------------------------------------------------------
| | | |