├── sparkxarray ├── tests │ ├── __init__.py │ └── test_reader.py ├── utils │ ├── __init__.py │ └── vis_utils.py ├── applications │ ├── __init__.py │ └── shapefile_masking.py ├── __init__.py └── reader.py ├── setup.cfg ├── MANIFEST.in ├── ci ├── docs-requirements.txt ├── requirements-py35.yml ├── requirements-py36.yml ├── bin │ └── download_travis_dependencies.sh └── install_python.ps1 ├── datasets ├── air.sig995.2012.nc └── NCEP │ ├── ersst.v4.185401.nc │ ├── ersst.v4.185402.nc │ ├── ersst.v4.185403.nc │ └── ersst.v4.185404.nc ├── readthedocs.yml ├── scripts └── cleanup.sh ├── examples └── bias │ ├── knmi_bias_compared_to_wrf.png │ ├── wrf_bias_compared_to_knmi.png │ └── bias.ipynb ├── .github ├── PULL_REQUEST_TEMPLATE.md └── ISSUE_TEMPLATE.md ├── mkdocs.yml ├── appveyor.yml ├── .travis.yml ├── .gitignore ├── setup.py ├── README.md ├── LICENSE └── bias.ipynb /sparkxarray/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sparkxarray/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sparkxarray/applications/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE -------------------------------------------------------------------------------- /ci/docs-requirements.txt: -------------------------------------------------------------------------------- 1 | pymdown-extensions 2 | -------------------------------------------------------------------------------- /datasets/air.sig995.2012.nc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/spark-xarray/HEAD/datasets/air.sig995.2012.nc -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | conda: 2 | file: doc/environment.yml 3 | python: 4 | version: 3 5 | setup_py_install: true -------------------------------------------------------------------------------- /datasets/NCEP/ersst.v4.185401.nc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/spark-xarray/HEAD/datasets/NCEP/ersst.v4.185401.nc -------------------------------------------------------------------------------- /datasets/NCEP/ersst.v4.185402.nc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/spark-xarray/HEAD/datasets/NCEP/ersst.v4.185402.nc -------------------------------------------------------------------------------- /datasets/NCEP/ersst.v4.185403.nc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/spark-xarray/HEAD/datasets/NCEP/ersst.v4.185403.nc -------------------------------------------------------------------------------- 
/datasets/NCEP/ersst.v4.185404.nc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/spark-xarray/HEAD/datasets/NCEP/ersst.v4.185404.nc
--------------------------------------------------------------------------------
/scripts/cleanup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | rm -r -f metastore_db
4 | rm -r -f derby.log
5 | rm -r -f spark-warehouse
--------------------------------------------------------------------------------
/examples/bias/knmi_bias_compared_to_wrf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/spark-xarray/HEAD/examples/bias/knmi_bias_compared_to_wrf.png
--------------------------------------------------------------------------------
/examples/bias/wrf_bias_compared_to_knmi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/spark-xarray/HEAD/examples/bias/wrf_bias_compared_to_knmi.png
--------------------------------------------------------------------------------
/sparkxarray/utils/vis_utils.py:
--------------------------------------------------------------------------------
1 | """Utilities related to model visualization."""
2 | #import holoviews as hv
3 | #import geoviews as gv
4 | #import geoviews.feature as gf
5 | #from cartopy import crs
6 | 
7 | #hv.notebook_extension()
8 | 
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | - [ ] closes #xxxx
2 | - [ ] tests added / passed
3 | - [ ] docs reflect changes
4 | - [ ] passes ``flake8 sparkxarray tests docs``
5 | - [ ] entry in HISTORY.rst
6 | 
7 | [summarize your pull request here]
8 | 
--------------------------------------------------------------------------------
/sparkxarray/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | # List to define the behaviour of imports of the form:
4 | # from sparkxarray import *
5 | 
6 | __all__ = []
7 | 
8 | # Package version number.
9 | __version__ = version = '0.1.dev0'
10 | 
11 | 
12 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | * spark-xarray version:
2 | * Python version:
3 | * Operating System:
4 | 
5 | ### Description
6 | 
7 | Describe what you were trying to get done.
8 | Tell us what happened, what went wrong, and what you expected to happen.
9 | 
10 | ### What I Did
11 | 
12 | ```
13 | Paste the command(s) you ran and the output.
14 | If there was a crash, please include the traceback here.
15 | ``` 16 | -------------------------------------------------------------------------------- /ci/requirements-py35.yml: -------------------------------------------------------------------------------- 1 | name: test_env 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.5 6 | - dask 7 | - h5py 8 | - h5netcdf 9 | - toolz 10 | - seaborn 11 | - numpy 12 | - scipy 13 | - pandas 14 | - netcdf4 15 | - xarray 16 | - matplotlib 17 | - pyspark 18 | - pytest 19 | - geopandas 20 | - pip: 21 | - coverage 22 | - coveralls 23 | - codecov 24 | - pytest-cov 25 | - pymdown-extensions -------------------------------------------------------------------------------- /ci/requirements-py36.yml: -------------------------------------------------------------------------------- 1 | name: test_env 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.6 6 | - dask 7 | - h5py 8 | - h5netcdf 9 | - toolz 10 | - seaborn 11 | - numpy 12 | - scipy 13 | - pandas 14 | - netcdf4 15 | - xarray 16 | - matplotlib 17 | - pyspark 18 | - pytest 19 | - geopandas 20 | - pip: 21 | - coverage 22 | - coveralls 23 | - codecov 24 | - pytest-cov 25 | - pymdown-extensions -------------------------------------------------------------------------------- /ci/bin/download_travis_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "Downloading Spark if necessary" 3 | echo "Spark version = $SPARK_VERSION" 4 | echo "Spark build = $SPARK_BUILD" 5 | echo "Spark build URL = $SPARK_BUILD_URL" 6 | mkdir -p $HOME/.cache/spark-versions 7 | filename="$HOME/.cache/spark-versions/$SPARK_BUILD.tgz" 8 | if ! [ -f $filename ]; then 9 | echo "Downloading file..." 10 | echo `which curl` 11 | curl "$SPARK_BUILD_URL" > $filename 12 | echo "Content of directory:" 13 | ls -la $HOME/.cache/spark-versions/* 14 | tar xvf $filename --directory $HOME/.cache/spark-versions > /dev/null 15 | fi 16 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: spark-xarray 2 | theme: readthedocs 3 | repo_url: https://github.com/andersy005/spark-xarray 4 | site_url: https://andersy005.github.io/spark-xarray/ 5 | 6 | pages: 7 | - Home: index.md 8 | 9 | markdown_extensions: 10 | - pymdownx.arithmatex 11 | - pymdownx.betterem(smart_enable=all) 12 | - pymdownx.caret 13 | - pymdownx.critic 14 | - pymdownx.inlinehilite 15 | - pymdownx.magiclink 16 | - pymdownx.mark 17 | - pymdownx.smartsymbols 18 | - pymdownx.superfences 19 | - pymdownx.tasklist(custom_checkbox=true) 20 | - pymdownx.tilde 21 | - codehilite 22 | - footnotes 23 | - toc(permalink=true) 24 | 25 | 26 | extra: 27 | palette: 28 | primary: blue 29 | accent: blue 30 | 31 | font: 32 | text: Roboto 33 | code: Roboto Mono 34 | extra_javascript: 35 | - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS_HTML 36 | - 'javascripts/mathjax.js' 37 | -------------------------------------------------------------------------------- /sparkxarray/tests/test_reader.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from sparkxarray.reader import ncread 3 | import os 4 | 5 | 6 | spark = SparkSession.builder.appName('spark-tests').getOrCreate() 7 | sc = spark.sparkContext 8 | print(os.getcwd()) 9 | filename = os.path.abspath('sparkxarray/tests/data/air.sig995.2012.nc') 10 | print(filename) 11 | paths = 
os.path.abspath('sparkxarray/tests/data/NCEP/*.nc')
12 | print(paths)
13 | 
14 | ### Tests for single file
15 | rdd1 = ncread(sc, filename, mode='single', partition_on=['lat', 'lon'], partitions=300)
16 | print(rdd1.count())
17 | print(rdd1.first())
18 | print(rdd1.getNumPartitions())
19 | 
20 | 
21 | rdd2 = ncread(sc, filename, mode='single', partition_on=['time'], partitions=80)
22 | print(rdd2.count())
23 | print(rdd2.first())
24 | 
25 | 
26 | ### Tests for Multiple files
27 | rdd3 = ncread(sc, paths, mode='multi', partition_on=['lat', 'lon'], partitions=300)
28 | print(rdd3.count())
29 | print(rdd3.first())
30 | 
31 | rdd4 = ncread(sc, paths, mode='multi', partition_on=['lat', 'lon', 'time', 'nv'], partitions=1000)
32 | print(rdd4.count())
33 | print(rdd4.first())
34 | 
35 | 
36 | 
37 | 
38 | 
39 | 
40 | 
41 | 
42 | 
--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
1 | # CI on Windows via appveyor
2 | # Adapted (with minor changes) from xarray's appveyor.yml, which itself
3 | # was based on Olivier Grisel's python-appveyor-demo
4 | 
5 | environment:
6 | 
7 |   matrix:
8 | 
9 |     - PYTHON: "C:\\Python35-conda64"
10 |       PYTHON_VERSION: "3.5"
11 |       PYTHON_ARCH: "64"
12 |       CONDA_ENV: "py35"
13 | 
14 |     - PYTHON: "C:\\Python36-conda64"
15 |       PYTHON_VERSION: "3.6"
16 |       PYTHON_ARCH: "64"
17 |       CONDA_ENV: "py36"
18 | 
19 | install:
20 |   # Install miniconda Python
21 |   - "powershell ./ci/install_python.ps1"
22 | 
23 |   # Prepend newly installed Python to the PATH of this build (this cannot be
24 |   # done from inside the powershell script as it would require restarting
25 |   # the parent CMD process).
26 |   - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
27 | 
28 |   # Check that we have the expected version and architecture for Python
29 |   - "python --version"
30 |   - "python -c \"import struct; print(struct.calcsize('P') * 8)\""
31 | 
32 |   # Install xarray and dependencies
33 |   - "conda env create -f ./ci/requirements-%CONDA_ENV%.yml"
34 |   - "activate test_env"
35 |   # Install the package:
36 |   - "python setup.py install"
37 |   #- git clone https://github.com/andersy005/spark-xarray.git
38 |   #- cd spark-xarray
39 |   #- pip install -e .
40 | 
41 | build: false
42 | 
43 | test_script:
44 |   - "coverage run sparkxarray/tests/test_reader.py"
45 | 
46 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | notifications:
2 |   email: false
3 | 
4 | sudo: required
5 | 
6 | dist: trusty
7 | 
8 | language: python
9 | 
10 | matrix:
11 |   include:
12 |   - python: 3.5
13 |     env: CONDA_ENV=py35
14 | 
15 |   - python: 3.6
16 |     env: CONDA_ENV=py36
17 | 
18 | 
19 | before_install:
20 |   - sudo apt-get update
21 |   # We do this conditionally because it saves us some downloading if the
22 |   # version is the same.
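  # (Note: the build matrix above only includes Python 3.5 and 3.6, so the
  # Miniconda2 branch below is never taken with the current configuration.)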
23 |   - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
24 |       wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
25 |     else
26 |       wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
27 |     fi
28 |   - bash miniconda.sh -b -p $HOME/miniconda
29 |   - export PATH="$HOME/miniconda/bin:$PATH"
30 |   - hash -r
31 |   - conda config --set always_yes yes --set changeps1 no
32 |   - conda update -q conda
33 |   # Useful for debugging any issues with conda
34 |   - conda info -a
35 | 
36 | 
37 | install:
38 | 
39 |   #- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION --file ./ci/requirements.txt
40 |   - conda env create -f ci/requirements-$CONDA_ENV.yml
41 |   - source activate test_env
42 |   # Install the package:
43 |   - python setup.py install
44 |   #- git clone https://github.com/andersy005/spark-xarray.git
45 |   #- cd spark-xarray
46 |   #- pip install -e .
47 | 
48 | script:
49 |   - coverage run sparkxarray/tests/test_reader.py
50 | 
51 | after_success:
52 |   - bash <(curl -s https://codecov.io/bash)
53 |   - codecov
54 | 
55 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.ipynb_checkpoints
2 | *.pyc
3 | /lab
4 | 
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 | 
10 | # C extensions
11 | *.so
12 | 
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | 
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other info into it.
35 | *.manifest
36 | *.spec
37 | 
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 | 
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 | 
53 | # Translations
54 | *.mo
55 | *.pot
56 | 
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | 
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 | 
65 | # Scrapy stuff:
66 | .scrapy
67 | 
68 | # Sphinx documentation
69 | docs/_build/
70 | 
71 | # PyBuilder
72 | target/
73 | 
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 | 
77 | # pyenv
78 | .python-version
79 | 
80 | # celery beat schedule file
81 | celerybeat-schedule
82 | 
83 | # SageMath parsed files
84 | *.sage.py
85 | 
86 | # dotenv
87 | .env
88 | 
89 | # virtualenv
90 | .venv
91 | venv/
92 | ENV/
93 | 
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 | 
98 | # Rope project settings
99 | .ropeproject
100 | 
101 | # mkdocs documentation
102 | /site
103 | 
104 | # mypy
105 | .mypy_cache/
106 | 
107 | /tests
--------------------------------------------------------------------------------
/sparkxarray/applications/shapefile_masking.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """ Masking an area in a netCDF dataset using a geographical outline.
3 | This module reads a shapefile using geopandas. The user has the option
4 | of masking the area inside or outside the geographical outline.
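
Example (a sketch; assumes a SparkContext ``sc``, an RDD of single-grid-point
xarray.Datasets built with ``sparkxarray.reader.ncread`` using
``partition_on=['lat', 'lon']``, and a hypothetical shapefile path):

    masked_dset = masking(sc, rdd, '/path/to/outline.shp', mask_area='in')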
5 | """
6 | 
7 | import warnings
8 | warnings.filterwarnings('ignore')
9 | import xarray as xr
10 | import geopandas as gpd
11 | from geopandas import GeoDataFrame  # Loading boundaries data
12 | from shapely.geometry import Point, Polygon, shape  # For creating geospatial data
13 | import time
14 | from functools import partial
15 | 
16 | 
17 | def _shift_lon_values(dset):
18 |     from shapely.geometry import Point  # re-imported here so the function is self-contained on Spark workers
19 |     lat = float(dset.lat.values)
20 |     lon = float(dset.lon.values)
21 | 
22 |     # Shift longitudes from [0, 360) to [-180, 180) to match the shapefile convention
23 |     if lon >= 180:
24 |         lon = lon - 360.
25 | 
26 |     coordinate = Point(lon, lat)
27 |     return coordinate, dset
28 | 
29 | 
30 | 
31 | def masking(sc, rdd, shapefile_path, mask_area='in'):
32 | 
33 |     print("Loading and broadcasting the shapefile....\n\n")
34 |     shape = GeoDataFrame.from_file(shapefile_path)
35 | 
36 |     my_shape = sc.broadcast(shape)
37 |     print("Successfully loaded the shapefile....\n\n")
38 | 
39 |     print("Masking the data against the shapefile in progress....\n\n")
40 |     start = time.time()
41 |     # Honor the mask_area option: keep grid points inside the outline for
42 |     # mask_area='in', and outside the outline otherwise.
43 |     keep_inside = (mask_area == 'in')
44 |     point_in_shape = partial(_point_look_up, my_shape)
45 |     masked_rdd = rdd.map(_shift_lon_values)\
46 |                     .filter(lambda element: point_in_shape(element) == keep_inside)\
47 |                     .collect()
48 |     masked_data = [item[1] for item in masked_rdd]
49 | 
50 |     dset = xr.auto_combine(masked_data, concat_dim=None)
51 | 
52 |     stop = time.time()
53 |     total_time = stop - start
54 |     print("Successfully masked the data in {} seconds\n".format(round(total_time, 3)))
55 |     return dset
56 | 
57 | 
58 | 
59 | def _point_look_up(my_shape, element):
60 |     grid_point = element[0]
61 |     dset = element[1]
62 | 
63 |     # Access the broadcasted shape on the workers
64 |     gdf = my_shape.value
65 | 
66 |     # See if the grid point is inside the shape
67 |     check = gdf.contains(grid_point).unique()
68 | 
69 |     return True in check
70 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from setuptools import setup
3 | from setuptools import find_packages
4 | import os
5 | from ast import parse
6 | 
7 | LONG_DESCRIPTION = """
8 | **spark-xarray**:
9 | 
10 | Spark-xarray is a high-level Python library, built on Apache Spark and xarray, for working
11 | with netCDF climate model data.
12 | 
13 | Important links
14 | ------------------
15 | 
16 | - Official source code repo: https://github.com/andersy005/spark-xarray
17 | - Issue tracker: https://github.com/andersy005/spark-xarray/issues
18 | 
19 | """
20 | 
21 | NAME = 'spark-xarray'
22 | 
23 | def version():
24 |     """Return version string."""
25 |     with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'sparkxarray', '__init__.py')) as input_file:
26 |         for line in input_file:
27 |             if line.startswith('__version__'):
28 |                 return parse(line).body[0].value.s
29 | 
30 | 
31 | #for line in open('sparkxarray/__init__.py').readlines():
32 | #    if line.startswith('__version__'):
33 | #        exec(line)
34 | 
35 | INSTALL_REQUIRES = (['numpy >= 1.7',
36 |                      'scipy >= 0.16',
37 |                      'pandas >= 0.15.0',
38 |                      'netCDF4 >= 1.2',
39 |                      'xarray >= 0.9.5',
40 |                      'dask >= 0.14',
41 |                      'distributed >= 1.16.1',
42 |                      'geopandas >= 0.3.0',
43 |                      'toolz >= 0.8.2',
44 |                      'cloudpickle >= 0.2.1'])
45 | 
46 | packages = find_packages()  # also picks up sparkxarray.utils and sparkxarray.applications
47 | 
48 | package_data = {'sparkxarray': ['data/*.nc']}
49 | 
50 | setup(name=NAME,
51 |       version=version(),
52 |       author='Anderson Banihirwe, Kevin Paul',
53 |       author_email='axbanihirwe@gmail.com',
54 |       description='Big Atmospheric & Oceanic Data Analysis with Apache Spark + xarray',
55 |       url='https://github.com/andersy005/spark-xarray',
56 |       long_description=LONG_DESCRIPTION,
57 |       install_requires=INSTALL_REQUIRES,
58 |       packages=packages,
59 |       package_data=package_data,
60 |       keywords=['Climate Science', 'xarray', 'Apache Spark', 'Distributed', 'netCDF', 'Parallel'],
61 |       classifiers=[
62 |           'Development Status :: 4 - Beta',
63 |           'Intended Audience :: Science/Research',
64 |           'License :: OSI Approved :: Apache Software License',
65 |           'Natural Language :: English',
66 |           'Operating System :: OS Independent',
67 |           'Programming Language :: Python :: 2.7',
68 |           'Programming Language :: Python :: 3',
69 |           'Programming Language :: Python :: 3.4',
70 |           'Programming Language :: Python :: 3.5',
71 |           'Programming Language :: Python :: 3.6',
72 |           'Topic :: Scientific/Engineering :: Atmospheric Science'
73 |       ],
74 |       zip_safe=False,
75 | 
76 |       )
77 | 
--------------------------------------------------------------------------------
/ci/install_python.ps1:
--------------------------------------------------------------------------------
1 | # Sample script to install Python and pip under Windows
2 | # Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner
3 | # License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/
4 | 
5 | $MINICONDA_URL = "http://repo.continuum.io/miniconda/"
6 | $BASE_URL = "https://www.python.org/ftp/python/"
7 | 
8 | 
9 | function DownloadMiniconda ($python_version, $platform_suffix) {
10 |     $webclient = New-Object System.Net.WebClient
11 |     if ($python_version -match "3.6") {
12 |         $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe"
13 |     } else {
14 |         $filename = "Miniconda2-latest-Windows-" + $platform_suffix + ".exe"
15 |     }
16 |     $url = $MINICONDA_URL + $filename
17 | 
18 |     $basedir = $pwd.Path + "\"
19 |     $filepath = $basedir + $filename
20 |     if (Test-Path $filename) {
21 |         Write-Host "Reusing" $filepath
22 |         return $filepath
23 |     }
24 | 
25 |     # Download and retry up to 3 times in case of network transient errors.
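    # ($retry_attempts below is 2; together with the final fallback download
    # in the if-block further down, the file is requested at most three times.)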
26 | Write-Host "Downloading" $filename "from" $url 27 | $retry_attempts = 2 28 | for($i=0; $i -lt $retry_attempts; $i++){ 29 | try { 30 | $webclient.DownloadFile($url, $filepath) 31 | break 32 | } 33 | Catch [Exception]{ 34 | Start-Sleep 1 35 | } 36 | } 37 | if (Test-Path $filepath) { 38 | Write-Host "File saved at" $filepath 39 | } else { 40 | # Retry once to get the error message if any at the last try 41 | $webclient.DownloadFile($url, $filepath) 42 | } 43 | return $filepath 44 | } 45 | 46 | 47 | function InstallMiniconda ($python_version, $architecture, $python_home) { 48 | Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home 49 | if (Test-Path $python_home) { 50 | Write-Host $python_home "already exists, skipping." 51 | return $false 52 | } 53 | if ($architecture -match "32") { 54 | $platform_suffix = "x86" 55 | } else { 56 | $platform_suffix = "x86_64" 57 | } 58 | 59 | $filepath = DownloadMiniconda $python_version $platform_suffix 60 | Write-Host "Installing" $filepath "to" $python_home 61 | $install_log = $python_home + ".log" 62 | $args = "/S /D=$python_home" 63 | Write-Host $filepath $args 64 | Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru 65 | if (Test-Path $python_home) { 66 | Write-Host "Python $python_version ($architecture) installation complete" 67 | } else { 68 | Write-Host "Failed to install Python in $python_home" 69 | Get-Content -Path $install_log 70 | Exit 1 71 | } 72 | } 73 | 74 | 75 | function InstallCondaPackages ($python_home, $spec) { 76 | $conda_path = $python_home + "\Scripts\conda.exe" 77 | $args = "install --yes " + $spec 78 | Write-Host ("conda " + $args) 79 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 80 | } 81 | 82 | function UpdateConda ($python_home) { 83 | $conda_path = $python_home + "\Scripts\conda.exe" 84 | Write-Host "Updating conda..." 85 | $args = "update --yes conda" 86 | Write-Host $conda_path $args 87 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 88 | } 89 | 90 | 91 | function main () { 92 | InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON 93 | UpdateConda $env:PYTHON 94 | InstallCondaPackages $env:PYTHON "conda-build jinja2 anaconda-client" 95 | } 96 | 97 | main -------------------------------------------------------------------------------- /sparkxarray/reader.py: -------------------------------------------------------------------------------- 1 | """ Interface for Data Ingestion. 2 | """ 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
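# Example usage (a sketch, assuming a live SparkContext ``sc``; the sample
# file below ships with the test suite):
#
#     from sparkxarray.reader import ncread
#     rdd = ncread(sc, 'sparkxarray/tests/data/air.sig995.2012.nc',
#                  mode='single', partition_on=['time'], partitions=100)
#     daily_means = rdd.map(lambda ds: ds.mean(dim=['lat', 'lon']))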
17 | 
18 | 
19 | from __future__ import print_function
20 | from __future__ import absolute_import
21 | import os
22 | import numpy as np
23 | import pandas as pd
24 | import xarray as xr
25 | import itertools
26 | from glob import glob
27 | # from pyspark.sql import SparkSession  # intentionally not imported at module level: leaving it out keeps the library compatible with Spark 1.6.3
28 | 
29 | def ncread(sc, paths, mode='single', **kwargs):
30 |     """Calls the sparkxarray netCDF read function that matches the mode parameter.
31 | 
32 |     ==============  ==============================
33 |     Mode            Reading Function
34 |     ==============  ==============================
35 |     single          _read_nc_single
36 |     multi           _read_nc_multi
37 |     anything else   raises NotImplementedError
38 |     ==============  ==============================
39 | 
40 |     Parameters
41 |     ----------
42 | 
43 |     sc : SparkContext object
44 | 
45 |     paths : str or sequence
46 |         Either a string glob in the form "path/to/my/files/*.nc" or an explicit
47 |         list of files to open
48 | 
49 |     mode : str
50 |         'single' for a single file
51 |         'multi' for multiple files
52 | 
53 |     **kwargs : dict
54 |         partitioning options to be passed on to the actual read function.
55 | 
56 | 
57 |     """
58 | 
59 |     if 'partitions' not in kwargs:
60 |         kwargs['partitions'] = None
61 | 
62 |     if 'partition_on' not in kwargs:
63 |         kwargs['partition_on'] = ['time']
64 | 
65 |     if 'decode_times' not in kwargs:
66 |         kwargs['decode_times'] = True
67 | 
68 |     error_msg = ("You specified a mode that is not implemented.")
69 | 
70 |     if (mode == 'single'):
71 |         return _read_nc_single(sc, paths, **kwargs)
72 | 
73 |     elif (mode == 'multi'):
74 |         return _read_nc_multi(sc, paths, **kwargs)
75 |     else:
76 |         raise NotImplementedError(error_msg)
77 | 
78 | 
79 | def _read_nc_single(sc, paths, **kwargs):
80 |     """ Read a single netCDF file
81 | 
82 |     Parameters
83 |     -----------
84 |     sc : SparkContext object
85 | 
86 |     paths : str
87 |         an explicit filename to open
88 | 
89 | 
90 |     **kwargs : dict
91 |         Additional arguments for partitioning
92 | 
93 |     """
94 |     partition_on = kwargs.get('partition_on')
95 |     partitions = kwargs.get('partitions')
96 |     decode_times = kwargs.get('decode_times')
97 | 
98 |     dset = xr.open_dataset(paths, autoclose=True, decode_times=decode_times)
99 | 
100 |     # D = {'dim_1': dim_1_size, 'dim_2': dim_2_size, ...}
101 |     D = {dset[dimension].name: dset[dimension].size for dimension in partition_on}
102 | 
103 |     # dim_ranges = [range(dim_1_size), range(dim_2_size), ...]
104 |     dim_ranges = [range(dim_size) for dim_size in D.values()]
105 | 
106 | 
107 |     dim_cartesian_product_indices = [element for element in itertools.product(*dim_ranges)]
108 | 
109 |     # create a list of dictionaries for positional indexing
110 |     positional_indices = [dict(zip(partition_on, ij)) for ij in dim_cartesian_product_indices]
111 | 
112 |     if not partitions:
113 |         partitions = len(dim_cartesian_product_indices)
114 | 
115 |     if partitions > len(dim_cartesian_product_indices):
116 |         partitions = len(dim_cartesian_product_indices)
117 | 
118 | 
119 |     # Create an RDD
120 |     rdd = sc.parallelize(positional_indices, partitions).map(lambda x: _readone_slice(dset, x))
121 | 
122 |     return rdd
123 | 
124 | 
125 | def _readone_slice(dset, positional_indices):
126 |     """Read a slice from an xarray.Dataset.
127 | 
128 |     Parameters
129 |     ----------
130 | 
131 |     dset : xarray.Dataset
132 |         the open dataset to read a slice from
133 |     positional_indices : dict
134 |         dict containing positional indices for each dimension
135 |         e.g. 
{'lat': 0, 'lon': 0}
136 | 
137 |     Returns
138 |     -------
139 |     chunk : xarray.Dataset
140 |         a subset of the xarray Dataset
141 | 
142 |     """
143 | 
144 |     # Change the positional indices into slice objects
145 |     # e.g. {'lat': 0, 'lon': 0} ---> {'lat': slice(0, 1, None), 'lon': slice(0, 1, None)}
146 |     positional_slices = {dim: slice(positional_indices[dim], positional_indices[dim]+1)
147 |                          for dim in positional_indices}
148 | 
149 |     # Read a slice for the given positional_slices
150 |     chunk = dset[positional_slices]
151 |     return chunk
152 | 
153 | 
154 | def _read_nc_multi(sc, paths, **kwargs):
155 |     """ Read multiple netCDF files
156 | 
157 |     Parameters
158 |     -----------
159 |     sc : SparkContext object
160 | 
161 |     paths : str or sequence
162 |         Either a string glob in the form "path/to/my/files/*.nc" or an explicit
163 |         list of files to open
164 | 
165 |     **kwargs : dict
166 |         Additional arguments for partitioning
167 | 
168 |     """
169 | 
170 |     partition_on = kwargs.get('partition_on')
171 |     partitions = kwargs.get('partitions')
172 |     decode_times = kwargs.get('decode_times', True)
173 | 
174 |     dset = xr.open_mfdataset(paths, autoclose=True, decode_times=decode_times)
175 | 
176 |     # D = {'dim_1': dim_1_size, 'dim_2': dim_2_size, ...}
177 |     D = {dset[dimension].name: dset[dimension].size for dimension in partition_on}
178 | 
179 |     # dim_ranges = [range(dim_1_size), range(dim_2_size), ...]
180 |     dim_ranges = [range(dim_size) for dim_size in D.values()]
181 | 
182 |     dim_cartesian_product_indices = [element for element in itertools.product(*dim_ranges)]
183 | 
184 |     # create a list of dictionaries for positional indexing
185 |     positional_indices = [dict(zip(partition_on, ij)) for ij in dim_cartesian_product_indices]
186 | 
187 |     if not partitions:
188 |         # Default to roughly 50 slices per partition; parallelize needs an integer count
189 |         partitions = max(1, len(dim_cartesian_product_indices) // 50)
190 | 
191 |     if partitions > len(dim_cartesian_product_indices):
192 |         partitions = len(dim_cartesian_product_indices)
193 | 
194 | 
195 |     # Create an RDD
196 |     rdd = sc.parallelize(positional_indices, partitions).map(lambda x: _readone_slice(dset, x))
197 | 
198 |     return rdd
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | [![Build Status](https://travis-ci.org/andersy005/spark-xarray.svg?branch=master)](https://travis-ci.org/andersy005/spark-xarray)
3 | [![codecov](https://codecov.io/gh/andersy005/spark-xarray/branch/master/graph/badge.svg)](https://codecov.io/gh/andersy005/spark-xarray)
4 | [![Build status](https://ci.appveyor.com/api/projects/status/93dmqmctpjcgnbcs/branch/master?svg=true)](https://ci.appveyor.com/project/andersy005/spark-xarray/branch/master)
5 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![PyPI](https://img.shields.io/pypi/pyversions/spark-xarray.svg)]()
6 | 
7 | # spark-xarray
8 | 
9 | spark-xarray is an open source project and Python package that seeks to integrate PySpark and xarray for Climate Data Analysis. It is built on top of [PySpark - Spark Python API](https://spark.apache.org/docs/latest/api/python/index.html) and [xarray](http://xarray.pydata.org/en/stable/).
10 | 
11 | spark-xarray was originally conceived during the Summer of 2017 as part of [PySpark for "Big" Atmospheric & Oceanic Data Analysis](https://ncar.github.io/PySpark4Climate/) - [A CISL/SIParCS Research Project](https://www2.cisl.ucar.edu/siparcs).
12 | 
13 | It is currently maintained by [Anderson Banihirwe](https://github.com/andersy005).
14 | 
15 | Documentation is available at https://andersy005.github.io/spark-xarray/.
16 | ## Installation
17 | 
18 | This section explains how to install spark-xarray. We assume that a working Apache Spark installation is already available.
19 | 
20 | 
21 | ### Install
22 | 
23 | #### Requirements
24 | 
25 | For the installation of ```spark-xarray```, the following packages are required:
26 | 
27 | 
28 | - [Spark 2.0+](https://spark.apache.org/)
29 | - [netcdf4-python (>=1.2.8)](https://unidata.github.io/netcdf4-python/)
30 | - ```xarray (>=0.9.5)```
31 | - ```dask (>=0.15.1)```
32 | - ```toolz (>=0.8.2)```
33 | 
34 | #### Install
35 | 
36 | Clone the repository directly from GitHub and install it afterwards using ```$ python setup.py install```. This will also resolve possible missing dependencies.
37 | 
38 | ```sh
39 | $ git clone https://github.com/andersy005/spark-xarray.git
40 | $ cd spark-xarray
41 | $ python setup.py install
42 | ```
43 | 
44 | ## Development
45 | 
46 | We welcome new contributors of all experience levels.
47 | 
48 | ### Important links
49 | 
50 | - Official source code repo: https://github.com/andersy005/spark-xarray
51 | - Issue tracker: https://github.com/andersy005/spark-xarray/issues
52 | 
53 | ## Examples
54 | 
55 | ### Single file
56 | 
57 | ```python
58 | >>> from sparkxarray.reader import ncread
59 | >>> from pyspark.sql import SparkSession
60 | >>> spark = SparkSession.builder.appName('spark-rdd').getOrCreate()
61 | >>> sc = spark.sparkContext
62 | >>> filepath='spark-xarray/sparkxarray/tests/data/air.sig995.2012.nc'
63 | >>> # Create an RDD
64 | >>> rdd = ncread(sc, filepath, mode='single', partition_on=['time'], partitions=100)
65 | >>> rdd.first() # Get the first element
66 | <xarray.Dataset>
67 | Dimensions: (lat: 73, lon: 144, time: 1)
68 | Coordinates:
69 | * lat (lat) float32 90.0 87.5 85.0 82.5 80.0 77.5 75.0 72.5 70.0 67.5 ...
70 | * lon (lon) float32 0.0 2.5 5.0 7.5 10.0 12.5 15.0 17.5 20.0 22.5 ...
71 | * time (time) datetime64[ns] 2012-01-01
72 | Data variables:
73 | air (time, lat, lon) float64 234.5 234.5 234.5 234.5 234.5 234.5 ...
74 | Attributes:
75 | Conventions: COARDS
76 | title: mean daily NMC reanalysis (2012)
77 | history: created 2011/12 by Hoop (netCDF2.3)
78 | description: Data is from NMC initialized reanalysis\n(4x/day). These a...
79 | platform: Model
80 | references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...
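>>> # Each element of the RDD is itself an xarray.Dataset (here: a single
>>> # timestep), so ordinary xarray operations can be mapped over elements.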
81 | >>> rdd.count() # Get a count of elements in the rdd
82 | 366
83 | >>> # The count above corresponds to number of timesteps in the netCDF file
84 | >>> rdd.getNumPartitions() # Get the number of partitions
85 | 100
86 | >>> # Compute the daily average for each day (element) in RDD
87 | >>> daily_average = rdd.map(lambda x: x.mean(dim=['lat', 'lon']))
88 | >>> daily_average.take(3)
89 | [<xarray.Dataset>
90 | Dimensions: (time: 1)
91 | Coordinates:
92 | * time (time) datetime64[ns] 2012-01-01
93 | Data variables:
94 | air (time) float64 277.0, <xarray.Dataset>
95 | Dimensions: (time: 1)
96 | Coordinates:
97 | * time (time) datetime64[ns] 2012-01-02
98 | Data variables:
99 | air (time) float64 276.8, <xarray.Dataset>
100 | Dimensions: (time: 1)
101 | Coordinates:
102 | * time (time) datetime64[ns] 2012-01-03
103 | Data variables:
104 | air
105 | ```
106 | 
107 | ### Multiple files
108 | 
109 | ```python
110 | >>> from sparkxarray.reader import ncread
111 | >>> from pyspark.sql import SparkSession
112 | >>> spark = SparkSession.builder.appName('spark-rdd').getOrCreate()
113 | >>> sc = spark.sparkContext
114 | >>> paths='spark-xarray/sparkxarray/tests/data/NCEP/*.nc'
115 | >>> multi_rdd = ncread(sc, paths, mode='multi', partition_on=['lat', 'lon'], partitions=300)
116 | >>> multi_rdd.count()
117 | 16020
118 | >>> multi_rdd.first()
119 | <xarray.Dataset>
120 | Dimensions: (lat: 1, lon: 1, nv: 2, time: 4, zlev: 1)
121 | Coordinates:
122 | * zlev (zlev) float32 0.0
123 | * lat (lat) float32 -88.0
124 | * lon (lon) float32 0.0
125 | * time (time) datetime64[ns] 1854-01-15 1854-02-15 1854-03-15 1854-04-15
126 | Dimensions without coordinates: nv
127 | Data variables:
128 | lat_bnds (time, lat, nv) float32 -89.0 -87.0 -89.0 -87.0 -89.0 -87.0 ...
129 | lon_bnds (time, lon, nv) float32 -1.0 1.0 -1.0 1.0 -1.0 1.0 -1.0 1.0
130 | sst (time, zlev, lat, lon) float64 nan nan nan nan
131 | anom (time, zlev, lat, lon) float64 nan nan nan nan
132 | Attributes:
133 | Conventions: CF-1.6
134 | Metadata_Conventions: CF-1.6, Unidata Dataset Discovery v1.0
135 | metadata_link: C00884
136 | id: ersst.v4.185401
137 | naming_authority: gov.noaa.ncdc
138 | title: NOAA Extended Reconstructed Sea Surface Tempe...
139 | summary: ERSST.v4 is developped based on v3b after rev...
140 | institution: NOAA/NESDIS/NCDC
141 | creator_name: Boyin Huang
142 | creator_email: boyin.huang@noaa.gov
143 | date_created: 2014-10-24
144 | production_version: Beta Version 4
145 | history: Version 4 based on Version 3b
146 | publisher_name: Boyin Huang
147 | publisher_email: boyin.huang@noaa.gov
148 | publisher_url: http://www.ncdc.noaa.gov
149 | creator_url: http://www.ncdc.noaa.gov
150 | license: No constraints on data access or use
151 | time_coverage_start: 1854-01-15T000000Z
152 | time_coverage_end: 1854-01-15T000000Z
153 | geospatial_lon_min: -1.0f
154 | geospatial_lon_max: 359.0f
155 | geospatial_lat_min: -89.0f
156 | geospatial_lat_max: 89.0f
157 | geospatial_lat_units: degrees_north
158 | geospatial_lat_resolution: 2.0
159 | geospatial_lon_units: degrees_east
160 | geospatial_lon_resolution: 2.0
161 | spatial_resolution: 2.0 degree grid
162 | cdm_data_type: Grid
163 | processing_level: L4
164 | standard_name_vocabulary: CF Standard Name Table v27
165 | keywords: Earth Science > Oceans > Ocean Temperat...
166 | keywords_vocabulary: NASA Global Change Master Directory (GCMD) Sc...
167 | project: NOAA Extended Reconstructed Sea Surface Tempe...
168 | platform: Ship and Buoy SSTs from ICOADS R2.5 and NCEP GTS 169 | instrument: Conventional thermometers 170 | source: ICOADS R2.5 SST, NCEP GTS SST, HadISST ice, N... 171 | comment: SSTs were observed by conventional thermomete... 172 | references: Huang et al, 2014: Extended Reconstructed Sea... 173 | climatology: Climatology is based on 1971-2000 SST, Xue, Y... 174 | description: In situ data: ICOADS2.5 before 2007 and NCEP ... 175 | ``` 176 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /bias.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "

<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
10 | "<div class=\"toc\"><ul class=\"toc-item\"></ul></div>
    " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "ExecuteTime": { 18 | "end_time": "2017-11-16T19:28:46.494511Z", 19 | "start_time": "2017-11-16T19:28:45.846305Z" 20 | } 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "# import reader module from sparkxarray\n", 25 | "from sparkxarray import reader\n", 26 | "from pyspark.sql import SparkSession" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "ExecuteTime": { 34 | "end_time": "2017-11-16T19:28:51.948198Z", 35 | "start_time": "2017-11-16T19:28:47.038329Z" 36 | } 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# Create sparksession\n", 41 | "spark = SparkSession.builder.appName(\"bias\").getOrCreate()\n", 42 | "sc = spark.sparkContext" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "ExecuteTime": { 50 | "end_time": "2017-11-16T19:28:51.953696Z", 51 | "start_time": "2017-11-16T19:28:51.950303Z" 52 | } 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "FILE_1 = \"/home/abanihi/Documents/Github/spark-xarray/datasets/AFRICA_KNMI-RACMO2.2b_CTL_ERAINT_MM_50km_1989-2008_tasmax.nc\"\n", 57 | "FILE_2 = \"/home/abanihi/Documents/Github/spark-xarray/datasets/AFRICA_UC-WRF311_CTL_ERAINT_MM_50km-rg_1989-2008_tasmax.nc\"\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": { 64 | "ExecuteTime": { 65 | "end_time": "2017-11-16T19:28:52.495714Z", 66 | "start_time": "2017-11-16T19:28:51.959272Z" 67 | } 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "knmi = reader.ncread(sc, FILE_1, mode='single', partition_on=['rlat', 'rlon'], partitions=500, decode_times=False)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": { 78 | "ExecuteTime": { 79 | "end_time": "2017-11-16T19:28:55.702752Z", 80 | "start_time": "2017-11-16T19:28:53.427679Z" 81 | } 82 | }, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "\n", 88 | "Dimensions: (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)\n", 89 | "Coordinates:\n", 90 | " * time (time) float64 468.0 469.0 470.0 471.0 472.0 473.0 474.0 ...\n", 91 | " * rlon (rlon) float32 -24.64\n", 92 | " lon (rlat, rlon) float32 -24.64\n", 93 | " * rlat (rlat) float32 -45.76\n", 94 | " lat (rlat, rlon) float32 -45.76\n", 95 | " * height (height) float32 2.0\n", 96 | "Dimensions without coordinates: bnds\n", 97 | "Data variables:\n", 98 | " rotated_pole |S1 b''\n", 99 | " time_bnds (time, bnds) float64 468.0 469.0 469.0 470.0 470.0 471.0 ...\n", 100 | " tasmax (time, height, rlat, rlon) float64 283.4 284.2 284.2 284.6 ...\n", 101 | "Attributes:\n", 102 | " institution: KNMI\n", 103 | " Conventions: CF-1.0\n", 104 | " conventionsURL: http://www.cgd.ucar.edu/cms/eaton/cf-metadata/index.html\n", 105 | " source: RACMO2.2b\n", 106 | " project_id: ENSEMBLES\n", 107 | " experiment_id: ERAIN CORDEX-Africa-50km\n", 108 | " realization: 1\n", 109 | " comments: beta-version RACMO2.2 with default physics from ECMWF CY...\n", 110 | " creation_date: 2010-04-09 13:53:22" 111 | ] 112 | }, 113 | "execution_count": 5, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "knmi.first()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 6, 125 | "metadata": { 126 | "ExecuteTime": { 127 | "end_time": "2017-11-16T19:28:55.835670Z", 128 | "start_time": "2017-11-16T19:28:55.706696Z" 129 | } 130 | }, 131 | "outputs": [], 132 | "source": 
[ 133 | "wrf = reader.ncread(sc, FILE_2, mode='single', partition_on=['rlat', 'rlon'], partitions=500, decode_times=False)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "metadata": { 140 | "ExecuteTime": { 141 | "end_time": "2017-11-16T19:28:56.969431Z", 142 | "start_time": "2017-11-16T19:28:55.840045Z" 143 | } 144 | }, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "\n", 150 | "Dimensions: (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)\n", 151 | "Coordinates:\n", 152 | " lon (rlat, rlon) float64 -24.64\n", 153 | " lat (rlat, rlon) float64 -45.76\n", 154 | " * height (height) float32 2.0\n", 155 | " * time (time) float64 1.426e+04 1.429e+04 1.432e+04 1.435e+04 ...\n", 156 | " * rlat (rlat) float64 -45.76\n", 157 | " * rlon (rlon) float64 -24.64\n", 158 | "Dimensions without coordinates: bnds\n", 159 | "Data variables:\n", 160 | " tasmax (time, height, rlat, rlon) float64 283.4 284.2 284.3 284.6 ...\n", 161 | " rotated_pole |S1 b''\n", 162 | " time_bnds (time, bnds) float64 1.424e+04 1.428e+04 1.428e+04 ...\n", 163 | "Attributes:\n", 164 | " Conventions: CF-1.4\n", 165 | " institution: Universidad de Cantabria (Spain)\n", 166 | " title: CORDEX Africa Sensitivity Run\n", 167 | " comment: The simulation was forced with ERA-Interim 2x2...\n", 168 | " nco_openmp_thread_number: 1" 169 | ] 170 | }, 171 | "execution_count": 7, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "wrf.first()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 8, 183 | "metadata": { 184 | "ExecuteTime": { 185 | "end_time": "2017-11-16T19:29:42.654411Z", 186 | "start_time": "2017-11-16T19:29:07.174825Z" 187 | } 188 | }, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "CPU times: user 72 ms, sys: 28 ms, total: 100 ms\n", 195 | "Wall time: 35.5 s\n" 196 | ] 197 | }, 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "38994" 202 | ] 203 | }, 204 | "execution_count": 8, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "%time wrf.count()" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 12, 216 | "metadata": { 217 | "ExecuteTime": { 218 | "end_time": "2017-11-16T19:30:31.565338Z", 219 | "start_time": "2017-11-16T19:30:31.556340Z" 220 | } 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "def create_indices(element):\n", 225 | " lat = round(float(element.rlat.data), 1)\n", 226 | " lon = round(float(element.rlon.data), 1)\n", 227 | " key = (lat, lon)\n", 228 | " return (key, element)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 13, 234 | "metadata": { 235 | "ExecuteTime": { 236 | "end_time": "2017-11-16T19:30:32.584974Z", 237 | "start_time": "2017-11-16T19:30:32.491836Z" 238 | } 239 | }, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "((-45.8, -24.6), \n", 245 | " Dimensions: (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)\n", 246 | " Coordinates:\n", 247 | " * time (time) float64 468.0 469.0 470.0 471.0 472.0 473.0 474.0 ...\n", 248 | " * rlon (rlon) float32 -24.64\n", 249 | " lon (rlat, rlon) float32 -24.64\n", 250 | " * rlat (rlat) float32 -45.76\n", 251 | " lat (rlat, rlon) float32 -45.76\n", 252 | " * height (height) float32 2.0\n", 253 | " Dimensions without coordinates: bnds\n", 254 | " Data variables:\n", 255 | " rotated_pole |S1 b''\n", 256 | " time_bnds (time, bnds) float64 
468.0 469.0 469.0 470.0 470.0 471.0 ...\n", 257 | " tasmax (time, height, rlat, rlon) float64 283.4 284.2 284.2 284.6 ...\n", 258 | " Attributes:\n", 259 | " institution: KNMI\n", 260 | " Conventions: CF-1.0\n", 261 | " conventionsURL: http://www.cgd.ucar.edu/cms/eaton/cf-metadata/index.html\n", 262 | " source: RACMO2.2b\n", 263 | " project_id: ENSEMBLES\n", 264 | " experiment_id: ERAIN CORDEX-Africa-50km\n", 265 | " realization: 1\n", 266 | " comments: beta-version RACMO2.2 with default physics from ECMWF CY...\n", 267 | " creation_date: 2010-04-09 13:53:22)" 268 | ] 269 | }, 270 | "execution_count": 13, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "knmi2 = knmi.map(create_indices)\n", 277 | "knmi2.first()" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 14, 283 | "metadata": { 284 | "ExecuteTime": { 285 | "end_time": "2017-11-16T19:30:35.180079Z", 286 | "start_time": "2017-11-16T19:30:35.094491Z" 287 | } 288 | }, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | "((-45.8, -24.6), \n", 294 | " Dimensions: (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)\n", 295 | " Coordinates:\n", 296 | " lon (rlat, rlon) float64 -24.64\n", 297 | " lat (rlat, rlon) float64 -45.76\n", 298 | " * height (height) float32 2.0\n", 299 | " * time (time) float64 1.426e+04 1.429e+04 1.432e+04 1.435e+04 ...\n", 300 | " * rlat (rlat) float64 -45.76\n", 301 | " * rlon (rlon) float64 -24.64\n", 302 | " Dimensions without coordinates: bnds\n", 303 | " Data variables:\n", 304 | " tasmax (time, height, rlat, rlon) float64 283.4 284.2 284.3 284.6 ...\n", 305 | " rotated_pole |S1 b''\n", 306 | " time_bnds (time, bnds) float64 1.424e+04 1.428e+04 1.428e+04 ...\n", 307 | " Attributes:\n", 308 | " Conventions: CF-1.4\n", 309 | " institution: Universidad de Cantabria (Spain)\n", 310 | " title: CORDEX Africa Sensitivity Run\n", 311 | " comment: The simulation was forced with ERA-Interim 2x2...\n", 312 | " nco_openmp_thread_number: 1)" 313 | ] 314 | }, 315 | "execution_count": 14, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "wrf2 = wrf.map(create_indices)\n", 322 | "wrf2.first()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 15, 328 | "metadata": { 329 | "ExecuteTime": { 330 | "end_time": "2017-11-16T19:33:35.868775Z", 331 | "start_time": "2017-11-16T19:31:08.958021Z" 332 | } 333 | }, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "((-44.4, -14.1), (\n", 339 | " Dimensions: (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)\n", 340 | " Coordinates:\n", 341 | " lon (rlat, rlon) float64 -14.08\n", 342 | " lat (rlat, rlon) float64 -44.44\n", 343 | " * height (height) float32 2.0\n", 344 | " * time (time) float64 1.426e+04 1.429e+04 1.432e+04 1.435e+04 ...\n", 345 | " * rlat (rlat) float64 -44.44\n", 346 | " * rlon (rlon) float64 -14.08\n", 347 | " Dimensions without coordinates: bnds\n", 348 | " Data variables:\n", 349 | " tasmax (time, height, rlat, rlon) float64 284.4 285.3 284.2 285.0 ...\n", 350 | " rotated_pole |S1 b''\n", 351 | " time_bnds (time, bnds) float64 1.424e+04 1.428e+04 1.428e+04 ...\n", 352 | " Attributes:\n", 353 | " Conventions: CF-1.4\n", 354 | " institution: Universidad de Cantabria (Spain)\n", 355 | " title: CORDEX Africa Sensitivity Run\n", 356 | " comment: The simulation was forced with ERA-Interim 2x2...\n", 357 | " nco_openmp_thread_number: 1, \n", 358 | " Dimensions: (bnds: 2, height: 1, 
rlat: 1, rlon: 1, time: 240)\n", 359 | " Coordinates:\n", 360 | " * time (time) float64 468.0 469.0 470.0 471.0 472.0 473.0 474.0 ...\n", 361 | " * rlon (rlon) float32 -14.08\n", 362 | " lon (rlat, rlon) float32 -14.08\n", 363 | " * rlat (rlat) float32 -44.44\n", 364 | " lat (rlat, rlon) float32 -44.44\n", 365 | " * height (height) float32 2.0\n", 366 | " Dimensions without coordinates: bnds\n", 367 | " Data variables:\n", 368 | " rotated_pole |S1 b''\n", 369 | " time_bnds (time, bnds) float64 468.0 469.0 469.0 470.0 470.0 471.0 ...\n", 370 | " tasmax (time, height, rlat, rlon) float64 284.1 285.2 284.2 285.0 ...\n", 371 | " Attributes:\n", 372 | " institution: KNMI\n", 373 | " Conventions: CF-1.0\n", 374 | " conventionsURL: http://www.cgd.ucar.edu/cms/eaton/cf-metadata/index.html\n", 375 | " source: RACMO2.2b\n", 376 | " project_id: ENSEMBLES\n", 377 | " experiment_id: ERAIN CORDEX-Africa-50km\n", 378 | " realization: 1\n", 379 | " comments: beta-version RACMO2.2 with default physics from ECMWF CY...\n", 380 | " creation_date: 2010-04-09 13:53:22))" 381 | ] 382 | }, 383 | "execution_count": 15, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "rdd = wrf2.join(knmi2, numPartitions=500)\n", 390 | "rdd.first()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 16, 396 | "metadata": { 397 | "ExecuteTime": { 398 | "end_time": "2017-11-16T19:33:35.877366Z", 399 | "start_time": "2017-11-16T19:33:35.871317Z" 400 | } 401 | }, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/plain": [ 406 | "500" 407 | ] 408 | }, 409 | "execution_count": 16, 410 | "metadata": {}, 411 | "output_type": "execute_result" 412 | } 413 | ], 414 | "source": [ 415 | "rdd.getNumPartitions()" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 17, 421 | "metadata": { 422 | "ExecuteTime": { 423 | "end_time": "2017-11-16T19:34:15.591937Z", 424 | "start_time": "2017-11-16T19:33:35.879862Z" 425 | } 426 | }, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "38994" 432 | ] 433 | }, 434 | "execution_count": 17, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "rdd.count()" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 18, 446 | "metadata": { 447 | "ExecuteTime": { 448 | "end_time": "2017-11-16T19:40:36.261659Z", 449 | "start_time": "2017-11-16T19:40:35.204744Z" 450 | } 451 | }, 452 | "outputs": [ 453 | { 454 | "data": { 455 | "text/plain": [ 456 | "((-44.4, -14.1), (\n", 457 | " Dimensions: (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)\n", 458 | " Coordinates:\n", 459 | " lon (rlat, rlon) float64 -14.08\n", 460 | " lat (rlat, rlon) float64 -44.44\n", 461 | " * height (height) float32 2.0\n", 462 | " * time (time) float64 1.426e+04 1.429e+04 1.432e+04 1.435e+04 ...\n", 463 | " * rlat (rlat) float64 -44.44\n", 464 | " * rlon (rlon) float64 -14.08\n", 465 | " Dimensions without coordinates: bnds\n", 466 | " Data variables:\n", 467 | " tasmax (time, height, rlat, rlon) float64 284.4 285.3 284.2 285.0 ...\n", 468 | " rotated_pole |S1 b''\n", 469 | " time_bnds (time, bnds) float64 1.424e+04 1.428e+04 1.428e+04 ...\n", 470 | " Attributes:\n", 471 | " Conventions: CF-1.4\n", 472 | " institution: Universidad de Cantabria (Spain)\n", 473 | " title: CORDEX Africa Sensitivity Run\n", 474 | " comment: The simulation was forced with ERA-Interim 2x2...\n", 475 | " nco_openmp_thread_number: 1, \n", 476 | " Dimensions: (bnds: 2, 
height: 1, rlat: 1, rlon: 1, time: 240)\n",
477 | "  Coordinates:\n",
478 | "    * time          (time) float64 468.0 469.0 470.0 471.0 472.0 473.0 474.0 ...\n",
479 | "    * rlon          (rlon) float32 -14.08\n",
480 | "      lon           (rlat, rlon) float32 -14.08\n",
481 | "    * rlat          (rlat) float32 -44.44\n",
482 | "      lat           (rlat, rlon) float32 -44.44\n",
483 | "    * height        (height) float32 2.0\n",
484 | "  Dimensions without coordinates: bnds\n",
485 | "  Data variables:\n",
486 | "      rotated_pole  |S1 b''\n",
487 | "      time_bnds     (time, bnds) float64 468.0 469.0 469.0 470.0 470.0 471.0 ...\n",
488 | "      tasmax        (time, height, rlat, rlon) float64 284.1 285.2 284.2 285.0 ...\n",
489 | "  Attributes:\n",
490 | "      institution:     KNMI\n",
491 | "      Conventions:     CF-1.0\n",
492 | "      conventionsURL:  http://www.cgd.ucar.edu/cms/eaton/cf-metadata/index.html\n",
493 | "      source:          RACMO2.2b\n",
494 | "      project_id:      ENSEMBLES\n",
495 | "      experiment_id:   ERAIN CORDEX-Africa-50km\n",
496 | "      realization:     1\n",
497 | "      comments:        beta-version RACMO2.2 with default physics from ECMWF CY...\n",
498 | "      creation_date:   2010-04-09 13:53:22))"
499 | ]
500 | },
501 | "execution_count": 18,
502 | "metadata": {},
503 | "output_type": "execute_result"
504 | }
505 | ],
506 | "source": [
507 | "a = rdd.first()\n",
508 | "a"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": 139,
514 | "metadata": {
515 | "ExecuteTime": {
516 | "end_time": "2017-11-16T20:49:26.115527Z",
517 | "start_time": "2017-11-16T20:49:25.997131Z"
518 | }
519 | },
520 | "outputs": [],
521 | "source": [
522 | "def bias_correct(element):\n",
523 | "    import numpy as np  # imported inside the function so the closure ships cleanly to Spark workers\n",
524 | "    obs = element[1][1].tasmax.values.ravel()  # KNMI series, treated as the reference\n",
525 | "    mod = element[1][0].tasmax.values.ravel()  # WRF series, the model to correct\n",
526 | "    \n",
527 | "    cdfn = 30.0  # number of bins used to build the empirical CDFs\n",
528 | "    \n",
529 | "    obs = np.sort(obs)\n",
530 | "    mod = np.sort(mod)\n",
531 | "    \n",
532 | "    global_max = max(np.amax(obs), np.amax(mod))\n",
533 | "    \n",
534 | "    wide = global_max / cdfn\n",
535 | "    \n",
536 | "    xbins = np.arange(0.0, global_max+wide, wide)\n",
537 | "    \n",
538 | "    pdfobs, bins = np.histogram(obs, bins=xbins)\n",
539 | "    pdfmod, bins = np.histogram(mod, bins=xbins)\n",
540 | "    \n",
541 | "    cdfobs = np.insert(np.cumsum(pdfobs), 0, 0.0)\n",
542 | "    cdfmod = np.insert(np.cumsum(pdfmod), 0, 0.0)\n",
543 | "    \n",
544 | "    vals = [150., 256.6, 100000]  # sample values to push through the quantile map\n",
545 | "    \n",
546 | "    def bias_map(vals, xbins, cdfmod, cdfobs):\n",
547 | "        # Locate each value on the model's cumulative distribution,\n",
548 | "        # then invert that level through the observed distribution:\n",
549 | "        # model value -> CDF level -> bias-corrected value.\n",
550 | "        \n",
551 | "        cdf1 = np.interp(vals, xbins, cdfmod)\n",
552 | "        \n",
553 | "        corrected = np.interp(cdf1, cdfobs, xbins)\n",
554 | "        \n",
555 | "        return corrected\n",
556 | "\n",
557 | "    results = bias_map(vals, xbins, cdfmod, cdfobs)\n",
558 | "    \n",
559 | "    return results"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 140,
565 | "metadata": {
566 | "ExecuteTime": {
567 | "end_time": "2017-11-16T20:49:26.644420Z",
568 | "start_time": "2017-11-16T20:49:26.640479Z"
569 | }
570 | },
571 | "outputs": [],
572 | "source": [
573 | "bias_corrected = rdd.map(bias_correct)"
574 | ]
575 | },
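What `bias_correct` does at each grid point is empirical quantile (CDF) mapping: sort both series, build unnormalized CDFs on a shared set of bins, then send a value through the model CDF and back out through the inverse of the observed CDF. The sketch below restates the idea as one self-contained function; `quantile_map`, `nbins`, and the synthetic series are illustrative names and data, not part of sparkxarray. Note also that the notebook's `vals = [150., 256.6, 100000]` are three hard-coded probe values; a full correction would pass the entire model series instead, as the sketch does.

```python
# Standalone sketch of the per-gridpoint quantile mapping used above.
import numpy as np

def quantile_map(vals, obs, mod, nbins=30):
    obs, mod = np.sort(obs), np.sort(mod)
    top = max(obs.max(), mod.max())
    xbins = np.arange(0.0, top + top / nbins, top / nbins)
    # Unnormalized empirical CDFs on shared bin edges (same length as xbins).
    cdf_obs = np.insert(np.cumsum(np.histogram(obs, bins=xbins)[0]), 0, 0.0)
    cdf_mod = np.insert(np.cumsum(np.histogram(mod, bins=xbins)[0]), 0, 0.0)
    # value -> level on the model CDF -> inverse observed CDF -> corrected value
    return np.interp(np.interp(vals, xbins, cdf_mod), cdf_obs, xbins)

rng = np.random.RandomState(0)
obs = rng.normal(285.0, 5.0, 240)        # stand-in "observed" tasmax series (K)
mod = rng.normal(288.0, 6.0, 240)        # stand-in model series with a warm bias
corrected = quantile_map(mod, obs, mod)  # correct the whole model series
print(corrected.mean() - mod.mean())     # roughly -3 K: the warm bias is removed
```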
576 | {
577 | "cell_type": "code",
578 | "execution_count": 142,
579 | "metadata": {
580 | "ExecuteTime": {
581 | "end_time": "2017-11-16T20:51:21.227339Z",
582 | "start_time": "2017-11-16T20:51:20.369483Z"
583 | }
584 | },
585 | "outputs": [
586 | {
587 | "data": {
588 | "text/plain": [
589 | "[array([ 276.68960063,  276.68960063,  286.23062134]),\n",
590 | " array([ 277.6843516 ,  277.6843516 ,  287.25967407]),\n",
591 | " array([ 280.57859904,  280.57859904,  290.25372314]),\n",
592 | " array([ 280.19751383,  280.19751383,  299.52148031]),\n",
593 | " array([ 283.49803975,  283.49803975,  293.27383423]),\n",
594 | " array([ 285.22876485,  285.22876485,  304.89971415]),\n",
595 | " array([ 285.08433126,  285.08433126,  294.91482544]),\n",
596 | " array([ 283.94258016,  283.94258016,  293.73370361]),\n",
597 | " array([ 283.80044759,  283.80044759,  293.58666992]),\n",
598 | " array([ 288.61755575,  288.61755575,  298.56988525])]"
599 | ]
600 | },
601 | "execution_count": 142,
602 | "metadata": {},
603 | "output_type": "execute_result"
604 | }
605 | ],
606 | "source": [
607 | "bias_corrected.take(10)"
608 | ]
609 | },
610 | {
611 | "cell_type": "code",
612 | "execution_count": 102,
613 | "metadata": {
614 | "ExecuteTime": {
615 | "end_time": "2017-11-16T20:37:53.698087Z",
616 | "start_time": "2017-11-16T20:37:53.528502Z"
617 | }
618 | },
619 | "outputs": [
620 | {
621 | "data": {
622 | "text/plain": [
623 | "282.32231330871582"
624 | ]
625 | },
626 | "execution_count": 102,
627 | "metadata": {},
628 | "output_type": "execute_result"
629 | }
630 | ],
631 | "source": [
632 | "bias_corrected.first().mean()"
633 | ]
634 | },
635 | {
636 | "cell_type": "code",
637 | "execution_count": null,
638 | "metadata": {},
639 | "outputs": [],
640 | "source": []
641 | }
642 | ],
643 | "metadata": {
644 | "kernelspec": {
645 | "display_name": "Python 3",
646 | "language": "python",
647 | "name": "python3"
648 | },
649 | "language_info": {
650 | "codemirror_mode": {
651 | "name": "ipython",
652 | "version": 3
653 | },
654 | "file_extension": ".py",
655 | "mimetype": "text/x-python",
656 | "name": "python",
657 | "nbconvert_exporter": "python",
658 | "pygments_lexer": "ipython3",
659 | "version": "3.6.3"
660 | },
661 | "toc": {
662 | "nav_menu": {},
663 | "number_sections": true,
664 | "sideBar": true,
665 | "skip_h1_title": false,
666 | "toc_cell": true,
667 | "toc_position": {},
668 | "toc_section_display": "block",
669 | "toc_window_display": false
670 | }
671 | },
672 | "nbformat": 4,
673 | "nbformat_minor": 2
674 | }
675 |
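The pipeline above hinges on a keyed join of two RDDs of single-gridpoint datasets: `create_indices` rounds each chunk's (rlat, rlon) to one decimal and uses the tuple as the key, which absorbs the float32 (KNMI) versus float64 (WRF) coordinate mismatch that would defeat exact equality. The count of 38994 records is exactly the 201 x 194 (rlat x rlon) grid, one record per grid point. A minimal sketch of the same pair-RDD pattern, with placeholder strings instead of xarray datasets (the `*_pairs` names and literal keys are illustrative):

```python
# Minimal pair-RDD join sketch; assumes `sc` is an active SparkContext.
knmi_pairs = sc.parallelize([((-45.8, -24.6), 'knmi chunk'),
                             ((-44.4, -14.1), 'knmi chunk')])
wrf_pairs = sc.parallelize([((-45.8, -24.6), 'wrf chunk'),
                            ((-44.4, -14.1), 'wrf chunk')])

# Like wrf2.join(knmi2) above, each record becomes (key, (wrf_value, knmi_value)),
# which is why bias_correct reads element[1][0] as WRF and element[1][1] as KNMI.
joined = wrf_pairs.join(knmi_pairs, numPartitions=4)
print(joined.collect())
```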
    \n", 560 | "\n", 630 | "\n", 631 | "\n", 632 | "\n", 633 | "\n", 634 | " \n", 686 | " \n", 687 | "\n", 688 | "\n", 689 | "
    \n" 690 | ], 691 | "text/plain": [ 692 | "" 693 | ] 694 | }, 695 | "metadata": {}, 696 | "output_type": "display_data" 697 | } 698 | ], 699 | "source": [ 700 | "import warnings\n", 701 | "warnings.filterwarnings('ignore')\n", 702 | "%matplotlib inline\n", 703 | "import matplotlib.pyplot as plt\n", 704 | "import matplotlib\n", 705 | "matplotlib.style.use('ggplot')\n", 706 | "matplotlib.rcParams['figure.figsize'] = (12, 15)\n", 707 | "from pyspark.sql import SparkSession\n", 708 | "import holoviews as hv\n", 709 | "import geoviews as gv\n", 710 | "import geoviews.feature as gf\n", 711 | "from cartopy import crs\n", 712 | "\n", 713 | "hv.notebook_extension()\n" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 128, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "\"\"\" Interface for Data Ingestion.\n", 723 | "\"\"\"\n", 724 | "# Licensed to the Apache Software Foundation (ASF) under one or more\n", 725 | "# contributor license agreements. See the NOTICE file distributed with\n", 726 | "# this work for additional information regarding copyright ownership.\n", 727 | "# The ASF licenses this file to You under the Apache License, Version 2.0\n", 728 | "# (the \"License\"); you may not use this file except in compliance with\n", 729 | "# the License. You may obtain a copy of the License at\n", 730 | "#\n", 731 | "# http://www.apache.org/licenses/LICENSE-2.0\n", 732 | "#\n", 733 | "# Unless required by applicable law or agreed to in writing, software\n", 734 | "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", 735 | "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", 736 | "# See the License for the specific language governing permissions and\n", 737 | "# limitations under the License.\n", 738 | "\n", 739 | "\n", 740 | "from __future__ import print_function\n", 741 | "from __future__ import absolute_import\n", 742 | "import os\n", 743 | "import numpy as np\n", 744 | "import pandas as pd \n", 745 | "import xarray as xr\n", 746 | "import itertools\n", 747 | "from glob import glob\n", 748 | "# from pyspark.sql import SparkSession # Removing this line simply makes the library compatible with Spark 1.6.3 !\n", 749 | "\n", 750 | "def ncread(sc, paths, mode='single', **kwargs):\n", 751 | " \"\"\"Calls sparkxarray netcdf read function based on the mode parameter.\n", 752 | "\n", 753 | " ============ ==============================\n", 754 | " Mode Reading Function\n", 755 | " ------------ ------------------------------\n", 756 | " single : read_nc_single\n", 757 | " multi : read_nc_multi\n", 758 | " Anything else: Throw an exception\n", 759 | " ============= ==============================\n", 760 | "\n", 761 | " Parameters\n", 762 | " ----------\n", 763 | "\n", 764 | " sc : sparkContext object\n", 765 | "\n", 766 | " paths : str or sequence\n", 767 | " Either a string glob in the form \"path/to/my/files/*.nc\" or an explicit\n", 768 | " list of files to open\n", 769 | "\n", 770 | " mode : str\n", 771 | " 'single' for a single file\n", 772 | " 'multi' for multiple files\n", 773 | "\n", 774 | " **kwargs : dict\n", 775 | " partitioning options to be passed on to the actual read function.\n", 776 | " \n", 777 | " \n", 778 | " \"\"\"\n", 779 | "\n", 780 | " if 'partitions' not in kwargs:\n", 781 | " kwargs['partitions'] = None\n", 782 | "\n", 783 | " if 'partition_on' not in kwargs:\n", 784 | " kwargs['partition_on'] = ['time']\n", 785 | " \n", 786 | " if 'decode_times' not in kwargs:\n", 787 | " 
787 | "        kwargs['decode_times'] = True\n",
788 | "\n",
789 | "    error_msg = \"You specified a mode that is not implemented.\"\n",
790 | "\n",
791 | "    if mode == 'single':\n",
792 | "        return _read_nc_single(sc, paths, **kwargs)\n",
793 | "\n",
794 | "    elif mode == 'multi':\n",
795 | "        return _read_nc_multi(sc, paths, **kwargs)\n",
796 | "    else:\n",
797 | "        raise NotImplementedError(error_msg)\n",
798 | "\n",
799 | "\n",
800 | "def _read_nc_single(sc, paths, **kwargs):\n",
801 | "    \"\"\" Read a single netCDF file\n",
802 | "\n",
803 | "    Parameters\n",
804 | "    ----------\n",
805 | "    sc : SparkContext object\n",
806 | "\n",
807 | "    paths : str\n",
808 | "        an explicit filename to open\n",
809 | "\n",
810 | "\n",
811 | "    **kwargs : dict\n",
812 | "        Additional arguments for partitioning\n",
813 | "\n",
814 | "    \"\"\"\n",
815 | "    partition_on = kwargs.get('partition_on')\n",
816 | "    partitions = kwargs.get('partitions')\n",
817 | "    decode_times = kwargs.get('decode_times')\n",
818 | "\n",
819 | "    dset = xr.open_dataset(paths, autoclose=True, decode_times=decode_times)\n",
820 | "\n",
821 | "    # D = {'dim_1': dim_1_size, 'dim_2': dim_2_size, ...}\n",
822 | "    D = {dset[dimension].name: dset[dimension].size for dimension in partition_on}\n",
823 | "\n",
824 | "    # dim_ranges = [range(dim_1_size), range(dim_2_size), ...]\n",
825 | "    dim_ranges = [range(dim_size) for dim_size in D.values()]\n",
826 | "\n",
827 | "\n",
828 | "    dim_cartesian_product_indices = [element for element in itertools.product(*dim_ranges)]\n",
829 | "\n",
830 | "    # create a list of dictionaries for positional indexing\n",
831 | "    positional_indices = [dict(zip(partition_on, ij)) for ij in dim_cartesian_product_indices]\n",
832 | "\n",
833 | "    if not partitions:\n",
834 | "        partitions = len(dim_cartesian_product_indices)\n",
835 | "\n",
836 | "    if partitions > len(dim_cartesian_product_indices):\n",
837 | "        partitions = len(dim_cartesian_product_indices)\n",
838 | "\n",
839 | "\n",
840 | "    # Create an RDD\n",
841 | "    rdd = sc.parallelize(positional_indices, partitions).map(lambda x: _readone_slice(dset, x))\n",
842 | "\n",
843 | "    return rdd\n",
844 | "\n",
845 | "\n",
846 | "def _readone_slice(dset, positional_indices):\n",
847 | "    \"\"\"Read a slice from an xarray.Dataset.\n",
848 | "\n",
849 | "    Parameters\n",
850 | "    ----------\n",
851 | "\n",
852 | "    dset : xarray.Dataset\n",
853 | "        an open xarray.Dataset object\n",
854 | "    positional_indices : dict\n",
855 | "        dict containing positional indices for each dimension\n",
856 | "        e.g. {'lat': 0, 'lon': 0}\n",
857 | "\n",
858 | "    Returns\n",
859 | "    -------\n",
860 | "    chunk : xarray.Dataset\n",
861 | "        a subset of the Xarray Dataset\n",
862 | "\n",
863 | "    \"\"\"\n",
864 | "\n",
865 | "    # Change the positional indices into slice objects\n",
866 | "    # e.g. {'lat': 0, 'lon': 0} ---> {'lat': slice(0, 1, None), 'lon': slice(0, 1, None)}\n",
867 | "    positional_slices = {dim: slice(positional_indices[dim], positional_indices[dim]+1)\n",
868 | "                         for dim in positional_indices}\n",
869 | "\n",
870 | "    # Read a slice for the given positional_slices\n",
871 | "    chunk = dset[positional_slices]\n",
872 | "    return chunk\n",
873 | "\n",
874 | "\n",
875 | "def _read_nc_multi(sc, paths, **kwargs):\n",
876 | "    \"\"\" Read multiple netCDF files\n",
877 | "\n",
878 | "    Parameters\n",
879 | "    ----------\n",
880 | "    sc : SparkContext object\n",
881 | "\n",
882 | "    paths : str or sequence\n",
883 | "        Either a string glob in the form \"path/to/my/files/*.nc\" or an explicit\n",
884 | "        list of files to open\n",
885 | "\n",
886 | "    **kwargs : dict\n",
887 | "        Additional arguments for partitioning\n",
888 | "\n",
889 | "    \"\"\"\n",
890 | "\n",
891 | "    partition_on = kwargs.get('partition_on')\n",
892 | "    partitions = kwargs.get('partitions')\n",
893 | "\n",
894 | "    dset = xr.open_mfdataset(paths, autoclose=True)\n",
895 | "\n",
896 | "    # D = {'dim_1': dim_1_size, 'dim_2': dim_2_size, ...}\n",
897 | "    D = {dset[dimension].name: dset[dimension].size for dimension in partition_on}\n",
898 | "\n",
899 | "    # dim_ranges = [range(dim_1_size), range(dim_2_size), ...]\n",
900 | "    dim_ranges = [range(dim_size) for dim_size in D.values()]\n",
901 | "\n",
902 | "    dim_cartesian_product_indices = [element for element in itertools.product(*dim_ranges)]\n",
903 | "\n",
904 | "    # create a list of dictionaries for positional indexing\n",
905 | "    positional_indices = [dict(zip(partition_on, ij)) for ij in dim_cartesian_product_indices]\n",
906 | "\n",
907 | "    if not partitions:\n",
908 | "        partitions = max(1, len(dim_cartesian_product_indices) // 50)  # integer partition count, ~50 chunks each\n",
909 | "\n",
910 | "    if partitions > len(dim_cartesian_product_indices):\n",
911 | "        partitions = len(dim_cartesian_product_indices)\n",
912 | "\n",
913 | "\n",
914 | "    # Create an RDD\n",
915 | "    rdd = sc.parallelize(positional_indices, partitions).map(lambda x: _readone_slice(dset, x))\n",
916 | "\n",
917 | "    return rdd\n",
918 | "\n"
919 | ]
920 | },
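The reader's partitioning scheme is easiest to see on a toy dataset: the cartesian product of positional indices along `partition_on` yields one record per index combination, and `_readone_slice` widens each index into a one-element slice so every chunk keeps its dimensions. A sketch under that reading; the 2 x 3 Dataset and its coordinate values are made up for illustration:

```python
import itertools

import numpy as np
import xarray as xr

# Toy stand-in for the CORDEX files: a 2 x 3 (rlat x rlon) grid.
dset = xr.Dataset(
    {'tasmax': (('rlat', 'rlon'), np.arange(6.0).reshape(2, 3))},
    coords={'rlat': [-45.76, -45.32], 'rlon': [-24.64, -24.2, -23.76]})

partition_on = ['rlat', 'rlon']
dim_ranges = [range(dset[dim].size) for dim in partition_on]
positional_indices = [dict(zip(partition_on, ij))
                      for ij in itertools.product(*dim_ranges)]
print(positional_indices[:3])  # [{'rlat': 0, 'rlon': 0}, {'rlat': 0, 'rlon': 1}, ...]

# What _readone_slice does with one record: index -> one-element slice per dim.
slices = {dim: slice(i, i + 1) for dim, i in positional_indices[0].items()}
print(dset[slices].tasmax.shape)  # (1, 1): the dimensions survive the slicing
```

On the 201 x 194 grid used earlier this produces the 38994 single-gridpoint chunks seen above, and the `partitions=500` argument then packs roughly 78 chunks into each Spark partition.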
921 | {
922 | "cell_type": "code",
923 | "execution_count": 3,
924 | "metadata": {},
925 | "outputs": [],
926 | "source": [
927 | "# Create a SparkSession\n",
928 | "spark = SparkSession.builder.appName(\"bias\").getOrCreate()\n",
929 | "sc = spark.sparkContext"
930 | ]
931 | },
932 | {
933 | "cell_type": "code",
934 | "execution_count": 167,
935 | "metadata": {},
936 | "outputs": [],
937 | "source": [
938 | "FILE_1 = \"/home/abanihi/Documents/Github/spark-xarray/datasets/AFRICA_KNMI-RACMO2.2b_CTL_ERAINT_MM_50km_1989-2008_tasmax.nc\"\n",
939 | "FILE_2 = \"/home/abanihi/Documents/Github/spark-xarray/datasets/AFRICA_UC-WRF311_CTL_ERAINT_MM_50km-rg_1989-2008_tasmax.nc\""
940 | ]
941 | },
942 | {
943 | "cell_type": "code",
944 | "execution_count": 168,
945 | "metadata": {},
946 | "outputs": [
947 | {
948 | "data": {
949 | "text/plain": [
950 | "<xarray.Dataset>\n",
951 | "Dimensions:       (bnds: 2, height: 1, rlat: 201, rlon: 194, time: 240)\n",
952 | "Coordinates:\n",
953 | "  * time          (time) float64 468.0 469.0 470.0 471.0 472.0 473.0 474.0 ...\n",
954 | "  * rlon          (rlon) float32 -24.64 -24.2 -23.76 -23.32 -22.88 -22.44 ...\n",
955 | "    lon 
(rlat, rlon) float32 -24.64 -24.2 -23.76 -23.32 -22.88 ...\n", 956 | " * rlat (rlat) float32 -45.76 -45.32 -44.88 -44.44 -44.0 -43.56 ...\n", 957 | " lat (rlat, rlon) float32 -45.76 -45.76 -45.76 -45.76 -45.76 ...\n", 958 | " * height (height) float32 2.0\n", 959 | "Dimensions without coordinates: bnds\n", 960 | "Data variables:\n", 961 | " rotated_pole |S1 b''\n", 962 | " time_bnds (time, bnds) float64 468.0 469.0 469.0 470.0 470.0 471.0 ...\n", 963 | " tasmax (time, height, rlat, rlon) float64 283.4 283.4 283.4 283.4 ...\n", 964 | "Attributes:\n", 965 | " institution: KNMI\n", 966 | " Conventions: CF-1.0\n", 967 | " conventionsURL: http://www.cgd.ucar.edu/cms/eaton/cf-metadata/index.html\n", 968 | " source: RACMO2.2b\n", 969 | " project_id: ENSEMBLES\n", 970 | " experiment_id: ERAIN CORDEX-Africa-50km\n", 971 | " realization: 1\n", 972 | " comments: beta-version RACMO2.2 with default physics from ECMWF CY...\n", 973 | " creation_date: 2010-04-09 13:53:22" 974 | ] 975 | }, 976 | "execution_count": 168, 977 | "metadata": {}, 978 | "output_type": "execute_result" 979 | } 980 | ], 981 | "source": [ 982 | "knmi = xr.open_dataset(FILE_1, decode_times=False)\n", 983 | "knmi" 984 | ] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": 169, 989 | "metadata": {}, 990 | "outputs": [ 991 | { 992 | "data": { 993 | "text/plain": [ 994 | "\n", 995 | "Dimensions: (bnds: 2, height: 1, rlat: 201, rlon: 194, time: 240)\n", 996 | "Coordinates:\n", 997 | " lon (rlat, rlon) float64 -24.64 -24.2 -23.76 -23.32 -22.88 ...\n", 998 | " lat (rlat, rlon) float64 -45.76 -45.76 -45.76 -45.76 -45.76 ...\n", 999 | " * height (height) float32 2.0\n", 1000 | " * time (time) float64 1.426e+04 1.429e+04 1.432e+04 1.435e+04 ...\n", 1001 | " * rlat (rlat) float64 -45.76 -45.32 -44.88 -44.44 -44.0 -43.56 ...\n", 1002 | " * rlon (rlon) float64 -24.64 -24.2 -23.76 -23.32 -22.88 -22.44 ...\n", 1003 | "Dimensions without coordinates: bnds\n", 1004 | "Data variables:\n", 1005 | " tasmax (time, height, rlat, rlon) float64 283.4 283.4 283.5 283.5 ...\n", 1006 | " rotated_pole |S1 b''\n", 1007 | " time_bnds (time, bnds) float64 1.424e+04 1.428e+04 1.428e+04 ...\n", 1008 | "Attributes:\n", 1009 | " Conventions: CF-1.4\n", 1010 | " institution: Universidad de Cantabria (Spain)\n", 1011 | " title: CORDEX Africa Sensitivity Run\n", 1012 | " comment: The simulation was forced with ERA-Interim 2x2...\n", 1013 | " nco_openmp_thread_number: 1" 1014 | ] 1015 | }, 1016 | "execution_count": 169, 1017 | "metadata": {}, 1018 | "output_type": "execute_result" 1019 | } 1020 | ], 1021 | "source": [ 1022 | "wrf = xr.open_dataset(FILE_2, decode_times=False)\n", 1023 | "wrf" 1024 | ] 1025 | }, 1026 | { 1027 | "cell_type": "code", 1028 | "execution_count": 170, 1029 | "metadata": {}, 1030 | "outputs": [], 1031 | "source": [ 1032 | "import ocw.data_source.local as local\n", 1033 | "import ocw.dataset_processor as dsp\n", 1034 | "import ocw.evaluation as evaluation\n", 1035 | "import ocw.metrics as metrics\n", 1036 | "import ocw.plotter as plotter" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "execution_count": 171, 1042 | "metadata": {}, 1043 | "outputs": [ 1044 | { 1045 | "name": "stdout", 1046 | "output_type": "stream", 1047 | "text": [ 1048 | "Loading /home/abanihi/Documents/Github/spark-xarray/datasets/AFRICA_KNMI-RACMO2.2b_CTL_ERAINT_MM_50km_1989-2008_tasmax.nc into an OCW Dataset Object\n", 1049 | "KNMI_Dataset.values shape: (times, lats, lons) - (240, 201, 194) \n", 1050 | "\n", 1051 | "Loading 
/home/abanihi/Documents/Github/spark-xarray/datasets/AFRICA_UC-WRF311_CTL_ERAINT_MM_50km-rg_1989-2008_tasmax.nc into an OCW Dataset Object\n", 1052 | "WRF_Dataset.values shape: (times, lats, lons) - (240, 201, 194) \n", 1053 | "\n" 1054 | ] 1055 | } 1056 | ], 1057 | "source": [ 1058 | "\"\"\" Step 1: Load Local NetCDF Files into OCW Dataset Objects \"\"\"\n", 1059 | "print(\"Loading %s into an OCW Dataset Object\" % (FILE_1,))\n", 1060 | "knmi_dataset = local.load_file(FILE_1, \"tasmax\")\n", 1061 | "print(\"KNMI_Dataset.values shape: (times, lats, lons) - %s \\n\" %\n", 1062 | " (knmi_dataset.values.shape,))\n", 1063 | "\n", 1064 | "print(\"Loading %s into an OCW Dataset Object\" % (FILE_2,))\n", 1065 | "wrf_dataset = local.load_file(FILE_2, \"tasmax\")\n", 1066 | "print(\"WRF_Dataset.values shape: (times, lats, lons) - %s \\n\" %\n", 1067 | " (wrf_dataset.values.shape,))" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "code", 1072 | "execution_count": 172, 1073 | "metadata": {}, 1074 | "outputs": [ 1075 | { 1076 | "name": "stdout", 1077 | "output_type": "stream", 1078 | "text": [ 1079 | "Temporally Rebinning the Datasets to an Annual Timestep\n", 1080 | "KNMI_Dataset.values shape: (20, 201, 194)\n", 1081 | "WRF_Dataset.values shape: (20, 201, 194) \n", 1082 | "\n", 1083 | "\n" 1084 | ] 1085 | } 1086 | ], 1087 | "source": [ 1088 | "\"\"\" Step 2: Temporally Rebin the Data into an Annual Timestep \"\"\"\n", 1089 | "print(\"Temporally Rebinning the Datasets to an Annual Timestep\")\n", 1090 | "knmi_dataset = dsp.temporal_rebin(knmi_dataset, temporal_resolution='annual')\n", 1091 | "wrf_dataset = dsp.temporal_rebin(wrf_dataset, temporal_resolution='annual')\n", 1092 | "print(\"KNMI_Dataset.values shape: %s\" % (knmi_dataset.values.shape,))\n", 1093 | "print(\"WRF_Dataset.values shape: %s \\n\\n\" % (wrf_dataset.values.shape,))" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "code", 1098 | "execution_count": 175, 1099 | "metadata": {}, 1100 | "outputs": [ 1101 | { 1102 | "name": "stdout", 1103 | "output_type": "stream", 1104 | "text": [ 1105 | "The KNMI_Dataset spatial bounds (min_lat, max_lat, min_lon, max_lon) are: \n", 1106 | "(-45.7599983215332, 42.2400016784668, -24.639999389648438, 60.279998779296875)\n", 1107 | "\n", 1108 | "The KNMI_Dataset spatial resolution (lat_resolution, lon_resolution) is: \n", 1109 | "(0.43999863, 0.44000053)\n", 1110 | "\n", 1111 | "\n" 1112 | ] 1113 | } 1114 | ], 1115 | "source": [ 1116 | "\"\"\" Step 3: Spatially Regrid the Dataset Objects to a 1 degree grid \"\"\"\n", 1117 | "# The spatial_boundaries() function returns the spatial extent of the dataset\n", 1118 | "print(\"The KNMI_Dataset spatial bounds (min_lat, max_lat, min_lon, max_lon) are: \\n\"\n", 1119 | " \"%s\\n\" % (knmi_dataset.spatial_boundaries(), ))\n", 1120 | "print(\"The KNMI_Dataset spatial resolution (lat_resolution, lon_resolution) is: \\n\"\n", 1121 | " \"%s\\n\\n\" % (knmi_dataset.spatial_resolution(), ))" 1122 | ] 1123 | }, 1124 | { 1125 | "cell_type": "code", 1126 | "execution_count": 176, 1127 | "metadata": {}, 1128 | "outputs": [ 1129 | { 1130 | "name": "stdout", 1131 | "output_type": "stream", 1132 | "text": [ 1133 | "Spatially Regridding the KNMI_Dataset...\n", 1134 | "Final shape of the KNMI_Dataset: \n", 1135 | "(20, 88, 85)\n", 1136 | "\n" 1137 | ] 1138 | } 1139 | ], 1140 | "source": [ 1141 | "min_lat, max_lat, min_lon, max_lon = knmi_dataset.spatial_boundaries()\n", 1142 | "\n", 1143 | "# Using the bounds we will create a new set of lats and lons on 1 degree step\n", 1144 | 
"new_lons = np.arange(min_lon, max_lon, 1)\n", 1145 | "new_lats = np.arange(min_lat, max_lat, 1)\n", 1146 | "\n", 1147 | "# Spatially regrid datasets using the new_lats, new_lons numpy arrays\n", 1148 | "print(\"Spatially Regridding the KNMI_Dataset...\")\n", 1149 | "knmi_dataset = dsp.spatial_regrid(knmi_dataset, new_lats, new_lons)\n", 1150 | "print(\"Final shape of the KNMI_Dataset: \\n\"\n", 1151 | " \"%s\\n\" % (knmi_dataset.values.shape, ))" 1152 | ] 1153 | }, 1154 | { 1155 | "cell_type": "code", 1156 | "execution_count": 177, 1157 | "metadata": {}, 1158 | "outputs": [ 1159 | { 1160 | "name": "stdout", 1161 | "output_type": "stream", 1162 | "text": [ 1163 | "Final shape of the WRF_Dataset: \n", 1164 | "(20, 88, 85)\n", 1165 | "\n" 1166 | ] 1167 | } 1168 | ], 1169 | "source": [ 1170 | "wrf_dataset = dsp.spatial_regrid(wrf_dataset, new_lats, new_lons)\n", 1171 | "print(\"Final shape of the WRF_Dataset: \\n\"\n", 1172 | " \"%s\\n\" % (wrf_dataset.values.shape, ))" 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "code", 1177 | "execution_count": 178, 1178 | "metadata": {}, 1179 | "outputs": [ 1180 | { 1181 | "name": "stdout", 1182 | "output_type": "stream", 1183 | "text": [ 1184 | "Setting up a Bias metric to use for evaluation\n" 1185 | ] 1186 | } 1187 | ], 1188 | "source": [ 1189 | "\"\"\" Step 4: Build a Metric to use for Evaluation - Bias for this example \"\"\"\n", 1190 | "# You can build your own metrics, but OCW also ships with some common metrics\n", 1191 | "print(\"Setting up a Bias metric to use for evaluation\")\n", 1192 | "bias = metrics.Bias()" 1193 | ] 1194 | }, 1195 | { 1196 | "cell_type": "code", 1197 | "execution_count": 179, 1198 | "metadata": {}, 1199 | "outputs": [ 1200 | { 1201 | "name": "stdout", 1202 | "output_type": "stream", 1203 | "text": [ 1204 | "Making the Evaluation definition\n", 1205 | "Executing the Evaluation using the object's run() method\n" 1206 | ] 1207 | } 1208 | ], 1209 | "source": [ 1210 | "\"\"\" Step 5: Create an Evaluation Object using Datasets and our Metric \"\"\"\n", 1211 | "# The Evaluation Class Signature is:\n", 1212 | "# Evaluation(reference, targets, metrics, subregions=None)\n", 1213 | "# Evaluation can take in multiple targets and metrics, so we need to convert\n", 1214 | "# our examples into Python lists. 
Evaluation will iterate over the lists\n", 1215 | "print(\"Making the Evaluation definition\")\n", 1216 | "bias_evaluation = evaluation.Evaluation(knmi_dataset, [wrf_dataset], [bias])\n", 1217 | "print(\"Executing the Evaluation using the object's run() method\")\n", 1218 | "bias_evaluation.run()" 1219 | ] 1220 | }, 1221 | { 1222 | "cell_type": "code", 1223 | "execution_count": 180, 1224 | "metadata": {}, 1225 | "outputs": [ 1226 | { 1227 | "name": "stdout", 1228 | "output_type": "stream", 1229 | "text": [ 1230 | "Accessing the Results of the Evaluation run\n", 1231 | "The results are of type: \n" 1232 | ] 1233 | } 1234 | ], 1235 | "source": [ 1236 | "\"\"\" Step 6: Make a Plot from the Evaluation.results \"\"\"\n", 1237 | "# The Evaluation.results are a set of nested lists to support many different\n", 1238 | "# possible Evaluation scenarios.\n", 1239 | "#\n", 1240 | "# The Evaluation results docs say:\n", 1241 | "# The shape of results is (num_metrics, num_target_datasets) if no subregion\n", 1242 | "# Accessing the actual results when we have used 1 metric and 1 dataset is\n", 1243 | "# done this way:\n", 1244 | "print(\"Accessing the Results of the Evaluation run\")\n", 1245 | "results = bias_evaluation.results[0][0]\n", 1246 | "print(\"The results are of type: %s\" % type(results))\n" 1247 | ] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": 181, 1252 | "metadata": {}, 1253 | "outputs": [], 1254 | "source": [ 1255 | "OUTPUT_PLOT = \"wrf_bias_compared_to_knmi\"" 1256 | ] 1257 | }, 1258 | { 1259 | "cell_type": "code", 1260 | "execution_count": 182, 1261 | "metadata": {}, 1262 | "outputs": [ 1263 | { 1264 | "name": "stdout", 1265 | "output_type": "stream", 1266 | "text": [ 1267 | "Generating a contour map using ocw.plotter.draw_contour_map()\n" 1268 | ] 1269 | }, 1270 | { 1271 | "data": { 1272 | "text/plain": [ 1273 | "" 1274 | ] 1275 | }, 1276 | "metadata": {}, 1277 | "output_type": "display_data" 1278 | } 1279 | ], 1280 | "source": [ 1281 | "print(\"Generating a contour map using ocw.plotter.draw_contour_map()\")\n", 1282 | "\n", 1283 | "lats = new_lats\n", 1284 | "lons = new_lons\n", 1285 | "fname = OUTPUT_PLOT\n", 1286 | "gridshape = (4, 5) # 20 Years worth of plots. 
One panel per year on a 4 x 5 grid\n",
1287 | "plot_title = \"TASMAX Bias of WRF Compared to KNMI (1989 - 2008)\"\n",
1288 | "sub_titles = range(1989, 2009, 1)\n",
1289 | "\n",
1290 | "plotter.draw_contour_map(results, lats, lons, fname,\n",
1291 | "                         gridshape=gridshape, ptitle=plot_title,\n",
1292 | "                         subtitles=sub_titles)\n",
1293 | "plt.show()"
1294 | ]
1295 | },
1296 | {
1297 | "cell_type": "code",
1298 | "execution_count": 183,
1299 | "metadata": {},
1300 | "outputs": [
1301 | {
1302 | "name": "stdout",
1303 | "output_type": "stream",
1304 | "text": [
1305 | "Making the Evaluation definition\n",
1306 | "Executing the Evaluation using the object's run() method\n",
1307 | "Accessing the Results of the Evaluation run\n",
1308 | "The results are of type: \n"
1309 | ]
1310 | }
1311 | ],
1312 | "source": [
1313 | "print(\"Making the Evaluation definition\")\n",
1314 | "bias_evaluation = evaluation.Evaluation(wrf_dataset, [knmi_dataset], [bias])\n",
1315 | "print(\"Executing the Evaluation using the object's run() method\")\n",
1316 | "bias_evaluation.run()\n",
1317 | "print(\"Accessing the Results of the Evaluation run\")\n",
1318 | "results = bias_evaluation.results[0][0]\n",
1319 | "print(\"The results are of type: %s\" % type(results))\n"
1320 | ]
1321 | },
1322 | {
1323 | "cell_type": "code",
1324 | "execution_count": 184,
1325 | "metadata": {},
1326 | "outputs": [],
1327 | "source": [
1328 | "OUTPUT_PLOT = \"knmi_bias_compared_to_wrf\""
1329 | ]
1330 | },
1331 | {
1332 | "cell_type": "code",
1333 | "execution_count": null,
1334 | "metadata": {},
1335 | "outputs": [],
1336 | "source": [
1337 | "print(\"Generating a contour map using ocw.plotter.draw_contour_map()\")\n",
1338 | "\n",
1339 | "lats = new_lats\n",
1340 | "lons = new_lons\n",
1341 | "fname = OUTPUT_PLOT\n",
1342 | "gridshape = (4, 5) # 20 Years worth of plots. One panel per year on a 4 x 5 grid\n",
1343 | "plot_title = \"TASMAX Bias of KNMI Compared to WRF (1989 - 2008)\"\n",
1344 | "sub_titles = range(1989, 2009, 1)\n",
1345 | "\n",
1346 | "plotter.draw_contour_map(results, lats, lons, fname,\n",
1347 | "                         gridshape=gridshape, ptitle=plot_title,\n",
1348 | "                         subtitles=sub_titles)\n",
1349 | "plt.show()"
1350 | ]
1351 | }
1352 | ],
1353 | "metadata": {
1354 | "kernelspec": {
1355 | "display_name": "Python 3",
1356 | "language": "python",
1357 | "name": "python3"
1358 | },
1359 | "language_info": {
1360 | "codemirror_mode": {
1361 | "name": "ipython",
1362 | "version": 3
1363 | },
1364 | "file_extension": ".py",
1365 | "mimetype": "text/x-python",
1366 | "name": "python",
1367 | "nbconvert_exporter": "python",
1368 | "pygments_lexer": "ipython3",
1369 | "version": "3.6.3"
1370 | }
1371 | },
1372 | "nbformat": 4,
1373 | "nbformat_minor": 2
1374 | }
1375 |
--------------------------------------------------------------------------------
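Taken together, the OCW half of this notebook is a five-step pipeline: load, temporally rebin, spatially regrid, define a metric, and run an Evaluation. The condensed sketch below restates it end to end using only calls that appear above; FILE_1 and FILE_2 are the same local CORDEX-Africa tasmax files defined in the notebook, so their availability is an assumption of the sketch.

```python
import numpy as np
import ocw.data_source.local as local
import ocw.dataset_processor as dsp
import ocw.evaluation as evaluation
import ocw.metrics as metrics
import ocw.plotter as plotter

knmi = local.load_file(FILE_1, 'tasmax')   # values shape (240, 201, 194)
wrf = local.load_file(FILE_2, 'tasmax')

# Monthly -> annual: 240 time steps become 20.
knmi = dsp.temporal_rebin(knmi, temporal_resolution='annual')
wrf = dsp.temporal_rebin(wrf, temporal_resolution='annual')

# Regrid both datasets onto a common 1-degree grid.
min_lat, max_lat, min_lon, max_lon = knmi.spatial_boundaries()
new_lats = np.arange(min_lat, max_lat, 1)
new_lons = np.arange(min_lon, max_lon, 1)
knmi = dsp.spatial_regrid(knmi, new_lats, new_lons)
wrf = dsp.spatial_regrid(wrf, new_lats, new_lons)

# Evaluation(reference, targets, metrics); results shape is
# (num_metrics, num_target_datasets) when no subregions are given.
ev = evaluation.Evaluation(knmi, [wrf], [metrics.Bias()])
ev.run()

plotter.draw_contour_map(ev.results[0][0], new_lats, new_lons,
                         'wrf_bias_compared_to_knmi', gridshape=(4, 5),
                         ptitle='TASMAX Bias of WRF Compared to KNMI (1989 - 2008)',
                         subtitles=range(1989, 2009, 1))
```

Swapping the reference and target (as the last cells do) yields the complementary `knmi_bias_compared_to_wrf` map shipped alongside this notebook in examples/bias/.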