├── notebooks
│   ├── plots
│   │   └── emptyfileforgit.txt
│   ├── results
│   │   └── emptyfileforgit.txt
│   ├── utils.py
│   ├── 5_Model_Selection_and_Evaluation.ipynb
│   ├── 1_Data.ipynb
│   └── 3_Supervised_Learning.ipynb
├── CHANGELOG.rst
├── .travis.yml
├── .gitignore
├── requirements.txt
├── bibliography.bib
├── CITATION.cff
├── .github
│   └── workflows
│       └── tests.yml
├── LICENSE
└── README.rst

/notebooks/plots/emptyfileforgit.txt:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/notebooks/results/emptyfileforgit.txt:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
1 | Change Log
2 | ==========
3 | 
4 | [1.0.1] - 
5 | ------------------------
6 | - [CHANGED] Tensorflow 1 -> 2
7 | - [REMOVED] Keras
8 | 
9 | [1.0.0] - 2019-09-19
10 | ------------------------
11 | - Initial release
12 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist: xenial
2 | language: python
3 | python:
4 |   - "3.6"
5 |   - "3.7"
6 |   - "3.8"
7 | install:
8 |   - pip install -r requirements.txt
9 | script:
10 |   - pytest --nbval --cov=.
11 | after_success:
12 |   - codecov
13 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | .DS_Store
3 | .ipynb_checkpoints
4 | __pycache__
5 | notebooks/plots/*
6 | !notebooks/plots/emptyfileforgit.txt
7 | notebooks/results/*
8 | !notebooks/results/emptyfileforgit.txt
9 | .pytest_cache
10 | notebooks/tmp/
11 | notebooks/7_AutoML.ipynb
12 | .coverage
13 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow>=2.5.0
2 | notebook>=6.0.0
3 | jupyter
4 | ipywidgets
5 | susi
6 | pandas
7 | matplotlib
8 | seaborn
9 | modAL-python==0.4.2.1
10 | bayesian-optimization
11 | umap-learn>=0.3.10
12 | tqdm
13 | patchify
14 | scipy==1.10.1
15 | 
16 | # for testing
17 | pytest>=5.1.1
18 | pytest-cov
19 | codecov
20 | nbval>=0.9.3
21 | 
--------------------------------------------------------------------------------
/bibliography.bib:
--------------------------------------------------------------------------------
1 | @incollection{riese2020supervised,
2 |     author = {Riese, Felix~M. and Keller, Sina},
3 |     title ={{Supervised, Semi-Supervised, and Unsupervised Learning for
4 |     Hyperspectral Regression}},
5 |     booktitle = {{Hyperspectral Image Analysis: Advances in Machine
6 |     Learning and Signal Processing}},
7 |     editor = {Prasad, Saurabh and Chanussot, Jocelyn},
8 |     year = {2020},
9 |     publisher = {Springer International Publishing},
10 |     address = {Cham},
11 |     chapter = {7},
12 |     pages = {187--232},
13 |     doi = {10.1007/978-3-030-38617-7_7},
14 | }
15 | 
16 | @misc{riese2019hyperspectral,
17 |     author = {Riese, Felix~M. and Keller, Sina},
18 |     title = {{Hyperspectral Regression: Code Examples}},
19 |     year = {2019},
20 |     DOI = {10.5281/zenodo.3450676},
21 |     publisher = {Zenodo},
22 |     howpublished = {\href{https://doi.org/10.5281/zenodo.3450676}{doi.org/10.5281/zenodo.3450676}}
23 | }
24 | 
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite both the article from preferred-citation and the software itself."
3 | authors:
4 |   - family-names: Riese
5 |     given-names: Felix M.
6 |     orcid: https://orcid.org/0000-0003-0596-9585
7 |   - family-names: Keller
8 |     given-names: Sina
9 |     orcid: https://orcid.org/0000-0002-7710-5316
10 | title: "Hyperspectral Regression: Code Examples"
11 | version: 1.0.0
12 | doi: "10.5281/zenodo.3450676"
13 | date-released: 2019-09-19
14 | repository-code: https://github.com/felixriese/hyperspectral-regression
15 | license: BSD-3-Clause
16 | preferred-citation:
17 |   authors:
18 |     - family-names: Riese
19 |       given-names: Felix M.
20 |     - family-names: Keller
21 |       given-names: Sina
22 |   title: "Supervised, Semi-Supervised, and Unsupervised Learning for Hyperspectral Regression"
23 |   type: book
24 |   year: 2020
25 |   doi: "10.1007/978-3-030-38617-7_7"
26 |   publisher:
27 |     name: "Springer International Publishing"
28 |     city: "Cham"
29 |   collection-title: "Hyperspectral Image Analysis: Advances in Machine Learning and Signal Processing"
30 |   start: 187
31 |   end: 232
32 | 
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 | 
3 | on:
4 |   push:
5 |     branches: [ master ]
6 |   pull_request:
7 |     branches: "*"
8 | 
9 | jobs:
10 |   build:
11 | 
12 |     runs-on: ubuntu-latest
13 |     strategy:
14 |       matrix:
15 |         python-version: ["3.8", "3.9", "3.10"]
16 | 
17 |     steps:
18 |     - uses: actions/checkout@v2
19 |     - name: Set up Python ${{ matrix.python-version }}
20 |       uses: actions/setup-python@v2
21 |       with:
22 |         python-version: ${{ matrix.python-version }}
23 |     - name: Install dependencies
24 |       run: |
25 |         python -m pip install --upgrade pip
26 |         pip install -r requirements.txt
27 |     - name: List of installed packages
28 |       run: |
29 |         pip list
30 |     - name: Test with pytest
31 |       if: ${{ matrix.python-version != '3.9' }}
32 |       run: |
33 |         pytest --nbval
34 |     - name: Test with pytest and Codecov
35 |       if: ${{ matrix.python-version == '3.9' }}
36 |       run: |
37 |         pip install pytest-cov
38 |         pytest --nbval --cov=. --cov-report=xml
39 |     - name: Upload coverage to Codecov
40 |       if: ${{ matrix.python-version == '3.9' }}
41 |       uses: codecov/codecov-action@v1
42 |       with:
43 |         verbose: true
44 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2019, Felix M. Riese and Sina Keller, Karlsruhe Institute of Technology
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://img.shields.io/github/license/felixriese/hyperspectral-regression 2 | :target: LICENSE 3 | :alt: License: BSD-3-Clause 4 | 5 | .. image:: https://mybinder.org/badge_logo.svg 6 | :target: https://mybinder.org/v2/gh/felixriese/hyperspectral-regression/master?filepath=notebooks 7 | :alt: MyBinder 8 | 9 | .. image:: https://travis-ci.com/felixriese/hyperspectral-regression.svg?branch=master 10 | :target: https://travis-ci.com/felixriese/hyperspectral-regression 11 | :alt: Travis.CI Status 12 | 13 | .. image:: https://codecov.io/gh/felixriese/hyperspectral-regression/branch/master/graph/badge.svg 14 | :target: https://codecov.io/gh/felixriese/hyperspectral-regression 15 | :alt: Codecov 16 | 17 | .. image:: https://api.codacy.com/project/badge/Grade/6808eea2d5984c7d8364f7659b40f9ea 18 | :target: https://www.codacy.com/manual/felixriese/hyperspectral-regression?utm_source=github.com&utm_medium=referral&utm_content=felixriese/hyperspectral-regression&utm_campaign=Badge_Grade 19 | :alt: Codacy Status 20 | 21 | Hyperspectral Regression: Code Examples 22 | =============================================== 23 | 24 | This repository consists of additional material and exemplary implementations for our book chapter. 25 | 26 | The code in this repository is provided via notebooks. The notebooks are structured as follows: 27 | 28 | 1. `Data `_ 29 | 2. `Features `_ 30 | 3. `Supervised Learning `_ 31 | 4. `Active Learning `_ 32 | 5. `Model Selection and Evaluation `_ 33 | 6. `Generative Adversarial Networks `_ 34 | 35 | Description 36 | ----------- 37 | 38 | 39 | 40 | :License: 41 | `3-Clause BSD license `_ 42 | 43 | :Authors: 44 | `Felix M. Riese `_, `Sina Keller `_ 45 | 46 | :Citation: 47 | see `Citation`_ 48 | 49 | :Paper: 50 | `Riese and Keller (2020) `_ 51 | 52 | :Requirements: 53 | Python 3 with these `packages `_ 54 | 55 | 56 | How to use this repository? 57 | --------------------------- 58 | 59 | 1. Install Python 3, e.g. with `Anaconda `_ 60 | 61 | 2. Install the required packages 62 | 63 | conda install --file requirements.txt 64 | 65 | 3. 
Start jupyter 66 | 67 | jupyter notebook 68 | 69 | 4. Open the notebook folder in this repository in the Jupyter browser and select the desired notebook. 70 | 71 | ---- 72 | 73 | Citation 74 | -------- 75 | 76 | The bibtex file including both references is available in `bibliography.bib 77 | `_. 78 | 79 | **Paper:** 80 | 81 | Felix M. Riese and Sina Keller, "Supervised, Semi-Supervised, and Unsupervised 82 | Learning for Hyperspectral Regression", in *Hyperspectral Image Analysis: 83 | Advances in Machine Learning and Signal Processing*, Saurabh Prasad and Jocelyn 84 | Chanussot, Eds. Cham: Springer International Publishing, 2020, ch. 7, 85 | pp. 187–232, `doi:10.1007/978-3-030-38617-7_7 `_. 86 | 87 | .. code:: bibtex 88 | 89 | @incollection{riese2020supervised, 90 | author = {Riese, Felix~M. and Keller, Sina}, 91 | title ={{Supervised, Semi-Supervised, and Unsupervised Learning for 92 | Hyperspectral Regression}}, 93 | booktitle = {{Hyperspectral Image Analysis: Advances in Machine 94 | Learning and Signal Processing}}, 95 | editor = {Prasad, Saurabh and Chanussot, Jocelyn}, 96 | year = {2020}, 97 | publisher = {Springer International Publishing}, 98 | address = {Cham}, 99 | chapter = {7}, 100 | pages = {187--232}, 101 | doi = {10.1007/978-3-030-38617-7_7}, 102 | } 103 | 104 | **Code:** 105 | 106 | Felix M. Riese and Sina Keller, "Hyperspectral Regression: Code Examples", 107 | Zenodo, `doi:10.5281/zenodo.3450676 `_, 108 | 2019. 109 | 110 | .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.3450676.svg 111 | :target: https://doi.org/10.5281/zenodo.3450676 112 | :alt: DOI 113 | 114 | .. code:: bibtex 115 | 116 | @misc{riese2019hyperspectral, 117 | author = {Riese, Felix~M. and Keller, Sina}, 118 | title = {{Hyperspectral Regression: Code Examples}}, 119 | year = {2019}, 120 | DOI = {10.5281/zenodo.3450676}, 121 | publisher = {Zenodo}, 122 | howpublished = {\href{https://doi.org/10.5281/zenodo.3450676}{doi.org/10.5281/zenodo.3450676}} 123 | } 124 | -------------------------------------------------------------------------------- /notebooks/utils.py: -------------------------------------------------------------------------------- 1 | """Package with helper functions.""" 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.model_selection import train_test_split 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | def get_xy(): 9 | """Download and format the data.""" 10 | # load dataframe 11 | path = ("https://raw.githubusercontent.com/felixriese/hyperspectral" 12 | "-soilmoisture-dataset/master/soilmoisture_dataset.csv") 13 | df = pd.read_csv(path, index_col=0) 14 | 15 | # get features (= hyperspectral bands): 16 | features = [col for col in df.columns if col.isdigit()] 17 | 18 | X = df[features].values 19 | y = df["soil_moisture"].values 20 | 21 | return X, y 22 | 23 | 24 | def get_xy_split(missing_rate=0.0): 25 | """Split data. 26 | 27 | Parameters 28 | ---------- 29 | missing_rate : float 30 | Percentage of missing data for semi-supervised learning. 
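Used as a fraction in [0, 1]: each training label is masked (set to -1) with this probability, which simulates unlabeled data for the semi-supervised and active learning cases below.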
31 | 32 | """ 33 | X, y = get_xy() 34 | 35 | X_train, X_test, y_train, y_test = train_test_split( 36 | X, y, test_size=0.5, random_state=42, shuffle=True) 37 | 38 | if missing_rate == 0.0: 39 | return X_train, X_test, y_train, y_test 40 | 41 | # semi-supervised and active case 42 | else: 43 | rng = np.random.RandomState(42) 44 | random_unlabeled_points = rng.rand(len(y_train)) < missing_rate 45 | y_train_semi = np.copy(y_train) 46 | y_train_semi[random_unlabeled_points] = -1 47 | 48 | return X_train, X_test, y_train_semi, y_test, y_train 49 | 50 | 51 | def get_xy_shifted(cut=35): 52 | """Generate dataset shift in data. 53 | 54 | Parameters 55 | ---------- 56 | cut : int 57 | Cut at which the target variable is shifted. 58 | 59 | Returns 60 | ------- 61 | X_train, X_test, y_test, y_train : np.arrays 62 | Training and test datasets with input data `X` and target variable `y`. 63 | 64 | """ 65 | X, y = get_xy() 66 | 67 | mask = y < cut 68 | X_train = X[mask] 69 | y_train = y[mask] 70 | X_test = X[~mask] 71 | y_test = y[~mask] 72 | 73 | return X_train, X_test, y_test, y_train 74 | 75 | 76 | def write_results_to_latex_table(results, filename="results"): 77 | """Generate LaTeX table with results.""" 78 | with open("results/"+filename+".tex", "w") as f: 79 | f.write("\documentclass{article}\n") 80 | f.write("\\usepackage{booktabs}\n") 81 | f.write("\\usepackage{multirow}\n") 82 | f.write("\\usepackage{siunitx}\n") 83 | f.write("\\begin{document}\n") 84 | f.write("\\begin{table}\n") 85 | f.write("\t\centering\n") 86 | f.write("\t\caption{Regression results for soil moisture.}\n") 87 | f.write("\t\\begin{tabular}{lSSSl}\n") 88 | f.write("\t\t\\toprule\n") 89 | f.write("\t\t Model &{$R^2$ in $\\%$} &{MAE} &{RMSE} & {Potential}\\\\\n") 90 | f.write("\t\t\midrule\n") 91 | for i in range(len(results["model"])): 92 | f.write("\t\t{model:10} & {r2:.1f} & {mae:.2f} & {rmse:.1f} & {potential}\\\\\n" 93 | .format(model=results["model"][i], 94 | r2=results["r2"][i]*100, 95 | mae=results["mae"][i], 96 | rmse=results["rmse"][i], 97 | potential=results["potential"][i])) 98 | f.write("\t\t\\bottomrule\n") 99 | f.write("\t\end{tabular}\n") 100 | f.write("\t\label{tab:supervised_results}\n") 101 | f.write("\end{table}\n") 102 | f.write("\end{document}\n") 103 | 104 | 105 | def plot_regression_results(truth, pred, model_name): 106 | """Plot regression results. 107 | 108 | Parameters 109 | ---------- 110 | truth : np.array 111 | Array of true values y. 112 | pred : np.array 113 | Array of predicted values y_pred. 114 | model_name : str 115 | Name of the model. 116 | 117 | """ 118 | _, ax = plt.subplots(1, 1, figsize=(5, 5)) 119 | fontsize = 15 120 | 121 | # plot data 122 | plt.scatter(truth, pred, label="Datapoints", alpha=0.3) 123 | 124 | # set min and max 125 | pmin = np.min([np.min(truth), np.min(pred)]) - 1. 126 | pmax = np.max([np.max(truth), np.max(pred)]) + 1. 
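# the +/- 1 padding above widens the axis range so that extreme points are not drawn directly on the plot border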
127 | plt.xlim(pmin, pmax) 128 | plt.ylim(pmin, pmax) 129 | 130 | # plot line 131 | plt.plot(np.linspace(pmin, pmax, 20), np.linspace(pmin, pmax, 20), 132 | linestyle="dashed", c="tab:red", label="Ideal estimation") 133 | 134 | plt.xlabel("Soil moisture (measured) in %", fontsize=fontsize) 135 | plt.ylabel("Soil moisture (estimated) in %", fontsize=fontsize) 136 | plt.legend(fontsize=fontsize*0.8) 137 | plt.title(model_name, fontsize=fontsize) 138 | for tick in ax.xaxis.get_major_ticks()[1::2]: 139 | tick.label.set_visible(False) 140 | for tick in ax.xaxis.get_major_ticks(): 141 | tick.label.set_fontsize(fontsize) 142 | for tick in ax.yaxis.get_major_ticks()[1::2]: 143 | tick.label.set_visible(False) 144 | for tick in ax.yaxis.get_major_ticks(): 145 | tick.label.set_fontsize(fontsize) 146 | plt.savefig("plots/truthestimation_"+model_name.replace( 147 | " ", "").lower()+".pdf", bbox_inches="tight") 148 | -------------------------------------------------------------------------------- /notebooks/5_Model_Selection_and_Evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | "Section of the book chapter: 5.3 Model Selection, Optimization and Evaluation\n", 9 | "
\n", 10 | "\n", 11 | "# 5. Model Selection and Evaluation\n", 12 | "\n", 13 | "**Table of Contents**\n", 14 | "\n", 15 | "* [5.1 Hyperparameter Optimization](#5.1-Hyperparameter-Optimization)\n", 16 | "* [5.2 Model Evaluation](#5.2-Model-Evaluation)\n", 17 | "\n", 18 | "**Learnings:**\n", 19 | "\n", 20 | "- how to optimize machine learning (ML) models with grid search, random search and Bayesian optimization,\n", 21 | "- how to evaluate ML models.\n", 22 | "\n", 23 | "\n", 24 | "\n", 25 | "### Packages" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "%matplotlib inline\n", 35 | "%config InlineBackend.figure_format = 'retina'\n", 36 | "\n", 37 | "# ignore warnings\n", 38 | "import warnings\n", 39 | "warnings.filterwarnings('ignore')\n", 40 | "\n", 41 | "import numpy as np\n", 42 | "import pandas as pd\n", 43 | "import seaborn as sns\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "import matplotlib as mpl\n", 46 | "from sklearn.ensemble import RandomForestRegressor\n", 47 | "\n", 48 | "import utils" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Read in Data\n", 56 | "\n", 57 | "**Dataset:** Felix M. Riese and Sina Keller, \"Hyperspectral benchmark dataset on soil moisture\", Dataset, Zenodo, 2018. [DOI:10.5281/zenodo.1227836](http://doi.org/10.5281/zenodo.1227836) and [GitHub](https://github.com/felixriese/hyperspectral-soilmoisture-dataset)\n", 58 | "\n", 59 | "**Introducing paper:** Felix M. Riese and Sina Keller, “Introducing a Framework of Self-Organizing Maps for Regression of Soil Moisture with Hyperspectral Data,” in IGARSS 2018 - 2018 IEEE International Geoscience and Remote Sensing Symposium, Valencia, Spain, 2018, pp. 6151-6154. 
[DOI:10.1109/IGARSS.2018.8517812](https://doi.org/10.1109/IGARSS.2018.8517812)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "X_train, X_test, y_train, y_test = utils.get_xy_split()\n", 69 | "\n", 70 | "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### Fix Random State" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "np.random.seed(42)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "***\n", 94 | "\n", 95 | "## 5.1 Hyperparameter Optimization\n", 96 | "\n", 97 | "Content:\n", 98 | "\n", 99 | "- [5.1.1 Grid Search](#5.1.1-Grid-Search)\n", 100 | "- [5.1.2 Randomized Search](#5.1.2-Randomized-Search)\n", 101 | "- [5.1.3 Bayesian Optimization](#5.1.3-Bayesian-Optimization)\n", 102 | "\n", 103 | "### 5.1.1 Grid Search" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# NBVAL_IGNORE_OUTPUT\n", 113 | "\n", 114 | "from sklearn.svm import SVR\n", 115 | "from sklearn.model_selection import GridSearchCV\n", 116 | "\n", 117 | "# example mode: support vector regressor\n", 118 | "model = SVR(kernel=\"rbf\")\n", 119 | "\n", 120 | "# define parameter grid to be tested\n", 121 | "params = {\n", 122 | " \"C\": np.logspace(-4, 4, 9),\n", 123 | " \"gamma\": np.logspace(-4, 4, 9)}\n", 124 | "\n", 125 | "\n", 126 | "# set up grid search and run it on the data\n", 127 | "gs = GridSearchCV(model, params)\n", 128 | "%timeit gs.fit(X_train, y_train)\n", 129 | "print(\"R2 score = {0:.2f} %\".format(gs.score(X_test, y_test)*100))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "### 5.1.2 Randomized Search" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# NBVAL_IGNORE_OUTPUT\n", 146 | "\n", 147 | "from sklearn.svm import SVR\n", 148 | "from sklearn.model_selection import RandomizedSearchCV\n", 149 | "\n", 150 | "# example mode: support vector regressor\n", 151 | "model = SVR(kernel=\"rbf\")\n", 152 | "\n", 153 | "# define parameter grid to be tested\n", 154 | "params = {\n", 155 | " \"C\": np.logspace(-4, 4, 9),\n", 156 | " \"gamma\": np.logspace(-4, 4, 9)}\n", 157 | "\n", 158 | "# set up grid search and run it on the data\n", 159 | "gsr = RandomizedSearchCV(model, params, n_iter=15, refit=True)\n", 160 | "%timeit gsr.fit(X_train, y_train)\n", 161 | "print(\"R2 score = {0:.2f} %\".format(gsr.score(X_test, y_test)*100))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "### 5.1.3 Bayesian Optimization\n", 169 | "\n", 170 | "Implementation: [github.com/fmfn/BayesianOptimization](https://github.com/fmfn/BayesianOptimization)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# NBVAL_IGNORE_OUTPUT\n", 180 | "\n", 181 | "from sklearn.svm import SVR\n", 182 | "from bayes_opt import BayesianOptimization\n", 183 | "\n", 184 | "# define function to be optimized\n", 185 | "def opt_func(C, gamma):\n", 186 | " model = SVR(C=C, gamma=gamma)\n", 187 | " return model.fit(X_train, 
y_train).score(X_test, y_test)\n", 188 | "\n", 189 | "# set bounded region of parameter space\n", 190 | "pbounds = {'C': (1e-5, 1e4), 'gamma': (1e-5, 1e4)}\n", 191 | "\n", 192 | "# define optimizer\n", 193 | "optimizer = BayesianOptimization(\n", 194 | " f=opt_func,\n", 195 | " pbounds=pbounds,\n", 196 | " random_state=1)\n", 197 | "\n", 198 | "# optimize\n", 199 | "%time optimizer.maximize(init_points=2, n_iter=15)\n", 200 | "print(\"R2 score = {0:.2f} %\".format(optimizer.max[\"target\"]*100))" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "***\n", 208 | "\n", 209 | "## 5.2 Model Evaluation\n", 210 | "\n", 211 | "Content:\n", 212 | "\n", 213 | "- [5.2.1 Generate Exemplary Data](#5.2.1-Generate-Exemplary-Data)\n", 214 | "- [5.2.2 Plot the Data](#5.2.2-Plot-the-Data)\n", 215 | "- [5.2.3 Evaluation Metrics](#5.2.3-Evaluation-Metrics)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "import sklearn.metrics as me" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### 5.2.1 Generate Exemplary Data" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "### generate example data\n", 241 | "np.random.seed(1)\n", 242 | "\n", 243 | "# define x grid\n", 244 | "x_grid = np.linspace(0, 10, 11)\n", 245 | "y_model = x_grid*0.5\n", 246 | "\n", 247 | "# define first dataset without outlier\n", 248 | "y1 = np.array([y + np.random.normal(scale=0.2) for y in y_model])\n", 249 | "\n", 250 | "# define second dataset with outlier\n", 251 | "y2 = np.copy(y1)\n", 252 | "y2[9] = 0.5\n", 253 | "\n", 254 | "# define third dataset with higher variance\n", 255 | "y3 = np.array([y + np.random.normal(scale=1.0) for y in y_model])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "### 5.2.2 Plot the Data" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "# plot example data\n", 272 | "fig, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(12,4))\n", 273 | "fontsize = 18\n", 274 | "titleweight = \"bold\"\n", 275 | "titlepad = 10\n", 276 | "\n", 277 | "scatter_label = \"Data\"\n", 278 | "scatter_alpha = 0.7\n", 279 | "scatter_s = 100\n", 280 | "ax1.scatter(x_grid, y1, label=scatter_label, alpha=scatter_alpha, s=scatter_s)\n", 281 | "ax1.set_title(\"(a) Low var.\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 282 | "\n", 283 | "ax2.scatter(x_grid, y2, label=scatter_label, alpha=scatter_alpha, s=scatter_s)\n", 284 | "ax2.set_title(\"(b) Low var. 
+ outlier\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 285 | "\n", 286 | "ax3.scatter(x_grid, y3, label=scatter_label, alpha=scatter_alpha, s=scatter_s)\n", 287 | "ax3.set_title(\"(c) Higher var.\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 288 | "\n", 289 | "for i, ax in enumerate([ax1, ax2, ax3]):\n", 290 | " i += 1\n", 291 | " \n", 292 | " # red line\n", 293 | " ax.plot(x_grid, y_model, label=\"Model\", c=\"tab:red\", linestyle=\"dashed\", linewidth=4, alpha=scatter_alpha)\n", 294 | " \n", 295 | " # x-axis cosmetics\n", 296 | " ax.set_xlabel(\"x in a.u.\", fontsize=fontsize)\n", 297 | " for tick in ax.xaxis.get_major_ticks():\n", 298 | " tick.label.set_fontsize(fontsize) \n", 299 | " \n", 300 | " # y-axis cosmetics\n", 301 | " if i != 1:\n", 302 | " ax.set_yticklabels([])\n", 303 | " else:\n", 304 | " ax.set_ylabel(\"y in a.u.\", fontsize=fontsize, rotation=90)\n", 305 | " for tick in ax.yaxis.get_major_ticks():\n", 306 | " tick.label.set_fontsize(fontsize) \n", 307 | " ax.set_xlim(-0.5, 10.5)\n", 308 | " ax.set_ylim(-0.5, 6.5)\n", 309 | " # ax.set_title(\"Example \"+str(i), fontsize=fontsize)\n", 310 | " if i == 2:\n", 311 | " ax.legend(loc=2, fontsize=fontsize*1.0, frameon=True)\n", 312 | "\n", 313 | "plt.tight_layout()\n", 314 | "plt.savefig(\"plots/metrics_plot.pdf\", bbox_inches=\"tight\")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "### 5.2.3 Evaluation Metrics" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# calculating the metrics\n", 331 | "for i, y in enumerate([y1, y2, y3]):\n", 332 | " print(\"Example\", i+1)\n", 333 | " print(\"- MAE = {:.2f}\".format(me.mean_absolute_error(y_model, y)))\n", 334 | " print(\"- MSE = {:.2f}\".format(me.mean_squared_error(y_model, y)))\n", 335 | " print(\"- RMSE = {:.2f}\".format(np.sqrt(me.mean_squared_error(y_model, y))))\n", 336 | " print(\"- R2 = {:.2f}%\".format(me.r2_score(y_model, y)*100))\n", 337 | " print(\"-\"*20)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "raw", 342 | "metadata": {}, 343 | "source": [ 344 | "# print out for LaTeX table\n", 345 | "\n", 346 | "descriptions = {\n", 347 | " 1: \"Low variance\",\n", 348 | " 2: \"Low variance and one outlier\",\n", 349 | " 3: \"Higher variance\",}\n", 350 | "bold = [[False, False, False, False], [False, True, True, True], [True, False, False, False]]\n", 351 | "def make_bold(is_bold):\n", 352 | " if is_bold:\n", 353 | " return \"\\\\bfseries\"\n", 354 | " return \"\"\n", 355 | "\n", 356 | "for i, y in enumerate([y1, y2, y3]):\n", 357 | " print(\"{description} & {bold1} {mae:.2f} & {bold2} {mse:.2f} & {bold3} {rmse:.2f} & {bold4} {r2:.2f} \\\\\\\\\".format(\n", 358 | " description=descriptions[i+1],\n", 359 | " mae=me.mean_absolute_error(y_model, y),\n", 360 | " mse=me.mean_squared_error(y_model, y),\n", 361 | " rmse=np.sqrt(me.mean_squared_error(y_model, y)),\n", 362 | " r2=me.r2_score(y_model, y)*100,\n", 363 | " bold1=make_bold(bold[i][0]),\n", 364 | " bold2=make_bold(bold[i][1]),\n", 365 | " bold3=make_bold(bold[i][2]),\n", 366 | " bold4=make_bold(bold[i][3]),))" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 
| "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.9.1" 394 | }, 395 | "toc": { 396 | "nav_menu": {}, 397 | "number_sections": false, 398 | "sideBar": true, 399 | "skip_h1_title": false, 400 | "toc_cell": false, 401 | "toc_position": {}, 402 | "toc_section_display": "block", 403 | "toc_window_display": false 404 | }, 405 | "varInspector": { 406 | "cols": { 407 | "lenName": 16, 408 | "lenType": 16, 409 | "lenVar": 40 410 | }, 411 | "kernels_config": { 412 | "python": { 413 | "delete_cmd_postfix": "", 414 | "delete_cmd_prefix": "del ", 415 | "library": "var_list.py", 416 | "varRefreshCmd": "print(var_dic_list())" 417 | }, 418 | "r": { 419 | "delete_cmd_postfix": ") ", 420 | "delete_cmd_prefix": "rm(", 421 | "library": "var_list.r", 422 | "varRefreshCmd": "cat(var_dic_list()) " 423 | } 424 | }, 425 | "types_to_exclude": [ 426 | "module", 427 | "function", 428 | "builtin_function_or_method", 429 | "instance", 430 | "_Feature" 431 | ], 432 | "window_display": false 433 | } 434 | }, 435 | "nbformat": 4, 436 | "nbformat_minor": 2 437 | } 438 | -------------------------------------------------------------------------------- /notebooks/1_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | "Section of the book chapter: 3. Regression on Data Level\n", 9 | "
\n", 10 | "\n", 11 | "# 1. Data Level\n", 12 | "\n", 13 | "\n", 14 | "**Table of Contents**\n", 15 | "\n", 16 | "* [1.1 Data handling](#1.1-Data-handling)\n", 17 | "* [1.2 Dataset shift](#1.2-Dataset-shift)\n", 18 | "* [1.3 Dataset splitting](#1.3-Dataset-splitting)\n", 19 | "\n", 20 | "\n", 21 | "**Learnings:**\n", 22 | "\n", 23 | "- how to read in, validate and scale data,\n", 24 | "- how datashift looks like in a real world example,\n", 25 | "- how to split datasets with random, systematic, patch and stratified split.\n", 26 | "\n", 27 | "\n", 28 | "### Packages" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "%matplotlib inline\n", 38 | "%config InlineBackend.figure_format = 'retina'\n", 39 | "\n", 40 | "# ignore warnings\n", 41 | "import warnings\n", 42 | "warnings.filterwarnings('ignore')\n", 43 | "\n", 44 | "import numpy as np\n", 45 | "import pandas as pd\n", 46 | "import seaborn as sns\n", 47 | "import matplotlib.pyplot as plt\n", 48 | "import matplotlib as mpl\n", 49 | "import itertools\n", 50 | "from patchify import patchify\n", 51 | "from sklearn.ensemble import RandomForestRegressor\n", 52 | "\n", 53 | "import utils" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "### Read in Data\n", 61 | "\n", 62 | "**Dataset:** Felix M. Riese and Sina Keller, \"Hyperspectral benchmark dataset on soil moisture\", Dataset, Zenodo, 2018. [DOI:10.5281/zenodo.1227836](http://doi.org/10.5281/zenodo.1227836) and [GitHub](https://github.com/felixriese/hyperspectral-soilmoisture-dataset)\n", 63 | "\n", 64 | "**Introducing paper:** Felix M. Riese and Sina Keller, “Introducing a Framework of Self-Organizing Maps for Regression of Soil Moisture with Hyperspectral Data,” in IGARSS 2018 - 2018 IEEE International Geoscience and Remote Sensing Symposium, Valencia, Spain, 2018, pp. 6151-6154. 
[DOI:10.1109/IGARSS.2018.8517812](https://doi.org/10.1109/IGARSS.2018.8517812)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "X, y = utils.get_xy()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### Plot Configurations" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "norm = mpl.colors.Normalize(vmin=np.min(y), vmax=np.max(y))\n", 90 | "cmap = \"cividis_r\"\n", 91 | "\n", 92 | "\n", 93 | "myblue = \"#4664ab\"\n", 94 | "myblue30 = \"#c7d0e6\"\n", 95 | "myred = \"#9b1724\"" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Fix Random State" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "np.random.seed(42)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "***\n", 119 | "\n", 120 | "## 1.1 Data handling\n", 121 | "\n", 122 | "Steps:\n", 123 | "\n", 124 | "- [1.1.1 Collect the data](#1.1.1-Collect-the-data)\n", 125 | "- [1.1.2 Validate the data](#1.1.2-Validate-the-data)\n", 126 | "- [1.1.3 Prepare the data](#1.1.3-Prepare-the-data)\n", 127 | "\n", 128 | "### 1.1.1 Collect the data\n", 129 | "\n", 130 | "Columns:\n", 131 | "\n", 132 | "* `0` - `124`: 125 hyperspectral bands\n", 133 | "* `soil_moisture`: target variable of soil moisture in percent" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df = pd.DataFrame(X) \n", 143 | "df[\"soil_moisture\"] = y" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "### 1.1.2 Validate the data" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# dataset statistics\n", 160 | "df.describe()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# target variable distribution\n", 170 | "df[\"soil_moisture\"].hist()\n", 171 | "plt.show()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# correlations of different features and the target variable in a heatmap\n", 181 | "sns.heatmap(df[[0, 1, 2, 3, 4, \"soil_moisture\"]].corr(), vmin=-1., vmax=1.)\n", 182 | "plt.show()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "### 1.1.3 Prepare the data\n", 190 | "\n", 191 | "We use the [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) to scale our data. The resulting `X_scaled` is the scaled input data `X`." 
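,
"\n",
"When a train/test split is used, a common pattern is to fit the scaler on the training data only and to reuse the fitted scaler for the test data, so that no test-set statistics leak into the preprocessing. A minimal sketch with the same scikit-learn API, assuming the `X_train`/`X_test` split from Section 1.3 below:\n",
"\n",
"```python\n",
"scaler = StandardScaler()\n",
"X_train_scaled = scaler.fit_transform(X_train)  # fit on the training data only\n",
"X_test_scaled = scaler.transform(X_test)        # apply the same scaling to the test data\n",
"```"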
192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "from sklearn.preprocessing import StandardScaler\n", 201 | "\n", 202 | "scaler = StandardScaler()\n", 203 | "X_scaled = scaler.fit_transform(X)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "***\n", 211 | "\n", 212 | "## 1.2 Dataset shift\n", 213 | "\n", 214 | "Content:\n", 215 | "\n", 216 | "- [1.2.1 Generate shifted dataset](#1.2.1-Generate-shifted-dataset)\n", 217 | "- [1.2.2 Simple regression](#1.2.2-Simple-regression)\n", 218 | "\n", 219 | "### 1.2.1 Generate shifted dataset" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# get shifted data\n", 229 | "X_train, X_test, y_test, y_train = utils.get_xy_shifted()\n", 230 | "print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# plot shifted data\n", 240 | "fontsize = 15\n", 241 | "\n", 242 | "fig, ax = plt.subplots(1,1, figsize=(7,4))\n", 243 | "\n", 244 | "bins = np.arange(25., 42.5, 2.5)\n", 245 | "plt.hist(y_train, bins=bins, label=\"Training\", alpha=1.0, color=myblue)\n", 246 | "plt.hist(y_test, bins=bins, label=\"Unknown\", alpha=1.0, color=myblue30)\n", 247 | "\n", 248 | "lfact = 0.9\n", 249 | "leg = plt.legend(title=\"Datasets:\", fontsize=fontsize*lfact, frameon=False)\n", 250 | "plt.setp(leg.get_title(), fontsize=fontsize*lfact)\n", 251 | "\n", 252 | "plt.xlabel(\"Soil moisture in %\", fontsize=fontsize, labelpad=10)\n", 253 | "plt.ylabel(\"Number of datapoints\", fontsize=fontsize, labelpad=10)\n", 254 | "plt.xlim(24, 41)\n", 255 | "plt.ylim(0, 225)\n", 256 | "\n", 257 | "for tick in ax.xaxis.get_major_ticks()[1::2]:\n", 258 | " tick.label.set_visible(False)\n", 259 | "for tick in ax.xaxis.get_major_ticks():\n", 260 | " tick.label.set_fontsize(fontsize) \n", 261 | "for tick in ax.yaxis.get_major_ticks()[1::2]:\n", 262 | " tick.label.set_visible(False)\n", 263 | "for tick in ax.yaxis.get_major_ticks():\n", 264 | " tick.label.set_fontsize(fontsize) \n", 265 | " \n", 266 | "plt.savefig(\"plots/datasetshift_distributions.pdf\", bbox_inches=\"tight\")" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "### 1.2.2 Simple regression" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "model = RandomForestRegressor(n_estimators=100, n_jobs=-1)\n", 283 | "model.fit(X_train, y_train)\n", 284 | "score = model.score(X_test, y_test)\n", 285 | "print(\"R2 = {0:.2f} %\".format(score*100))" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "***\n", 293 | "\n", 294 | "## 1.3 Dataset splitting\n", 295 | "\n", 296 | "Content:\n", 297 | "\n", 298 | "- [1.3.1 Random Split](#1.3.1-Random-Split)\n", 299 | "- [1.3.2 Split plot](#1.3.2-Split-plot)\n", 300 | "\n", 301 | "### 1.3.1 Random Split" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "from sklearn.model_selection import train_test_split\n", 311 | "\n", 312 | "X_train, X_test, y_train, y_test = train_test_split(\n", 313 | " X, y, 
test_size=0.5, random_state=42, shuffle=True)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "### 1.3.2 Split plot" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(8,8))\n", 330 | "axes = [ax1, ax2, ax3, ax4]\n", 331 | "\n", 332 | "fontsize = 18\n", 333 | "titleweight = \"bold\"\n", 334 | "titlepad = 10\n", 335 | "msize = 100\n", 336 | "il = 12 # image length\n", 337 | "\n", 338 | "# random split\n", 339 | "data_rand = np.random.randint(low=1, high=il, size=(20,20))\n", 340 | "ax1.scatter(data_rand[0], data_rand[1], marker=\"o\", color=myblue, s=msize)\n", 341 | "ax1.set_title(\"(a) Random split\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 342 | "\n", 343 | "# systematic split\n", 344 | "k = 2\n", 345 | "data_sys = [(x, y) for (x, y) in itertools.product(range(il), range(il)) if ((x % k == 0) and (y % k == 0))]\n", 346 | "ax2.scatter([x[0] for x in data_sys], [x[1] for x in data_sys], marker=\"o\", color=myblue, s=msize)\n", 347 | "ax2.set_title(\"(b) Systematic split\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 348 | "\n", 349 | "# patch split\n", 350 | "data_pat = np.zeros((il, il), dtype=tuple)\n", 351 | "for i in range(il):\n", 352 | " for j in range(il):\n", 353 | " data_pat[i, j] = (i,j)\n", 354 | "patches = patchify(data_pat, (4, 4), step=4).reshape(3*3, 4*4)\n", 355 | "split_mask = np.random.rand(il) < 0.5\n", 356 | "patches_train = []\n", 357 | "patches_test = []\n", 358 | "for p in range(patches.shape[0]):\n", 359 | " if split_mask[p]:\n", 360 | " patches_train.append(patches[p])\n", 361 | " else:\n", 362 | " patches_test.append(patches[p])\n", 363 | "ax3.scatter([x[0] for p in patches_train for x in p], [x[1] for p in patches_train for x in p],\n", 364 | " marker=\"o\", color=myblue, edgecolor=myblue, s=msize)\n", 365 | "ax3.set_title(\"(c) Patch split\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 366 | "\n", 367 | "# stratified split\n", 368 | "data_strat_1 = np.random.randint(low=0, high=2, size=(10,10))\n", 369 | "data_strat_2 = np.random.randint(low=5, high=9, size=(10,10))\n", 370 | "data_strat_3 = [(np.random.randint(low=2, high=3), np.random.randint(low=5, high=9)) for _ in range(20)]\n", 371 | "data_strat_4 = [(np.random.randint(low=7, high=10), np.random.randint(low=1, high=3)) for _ in range(20)]\n", 372 | "ax4.scatter([x[0] for x in data_strat_1], [x[1] for x in data_strat_1],\n", 373 | " marker=\"o\", color=\"white\", edgecolor=\"black\", s=msize)\n", 374 | "l_train = ax4.scatter([x[0] for x in data_strat_2], [x[1] for x in data_strat_2],\n", 375 | " marker=\"o\", color=\"white\", edgecolor=\"black\", s=msize)\n", 376 | "ax4.scatter([x[0] for x in data_strat_3], [x[1] for x in data_strat_3], marker=\"o\", color=myblue, s=msize)\n", 377 | "l_test = ax4.scatter([x[0] for x in data_strat_4], [x[1] for x in data_strat_4],\n", 378 | " marker=\"o\", color=myblue, s=msize)\n", 379 | "l_not = ax4.scatter([], [], marker=\"s\", color=\"lightgrey\", s=msize) #, edgecolor=\"grey\")\n", 380 | "ax4.set_facecolor(\"lightgrey\")\n", 381 | "ax4.set_title(\"(d) Stratified split\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 382 | "\n", 383 | "for i, ax in enumerate(axes):\n", 384 | " ax.set_xlim(0,12)\n", 385 | " ax.set_ylim(0,12)\n", 386 | " # ax.set_xlim(-1,101)\n", 387 | " # 
ax.set_ylim(-1,101)\n", 388 | " ax.set_xlabel(\"x coordinate in a.u.\", fontsize=fontsize)\n", 389 | " ax.set_ylabel(\"y coordinate in a.u.\", fontsize=fontsize)\n", 390 | " \n", 391 | " for tick in ax.xaxis.get_major_ticks()[1::2]:\n", 392 | " tick.label.set_visible(False)\n", 393 | " for tick in ax.xaxis.get_major_ticks():\n", 394 | " tick.label.set_fontsize(fontsize) \n", 395 | " for tick in ax.yaxis.get_major_ticks()[1::2]:\n", 396 | " tick.label.set_visible(False)\n", 397 | " for tick in ax.yaxis.get_major_ticks():\n", 398 | " tick.label.set_fontsize(fontsize) \n", 399 | " \n", 400 | " if i != 0 and i != 2:\n", 401 | " ax.yaxis.set_visible(False)\n", 402 | " \n", 403 | " if i != 2 and i != 3:\n", 404 | " ax.xaxis.set_visible(False)\n", 405 | "\n", 406 | " \n", 407 | "leg = fig.legend(\n", 408 | " (l_train, l_test, l_not), (\"Training\", \"Test\", \"Not used\"),\n", 409 | " bbox_to_anchor=(1.25, 0.65), title=\"Subsets:\", ncol=1, fontsize=fontsize, frameon=False)\n", 410 | "frame = leg.get_frame()\n", 411 | "plt.setp(leg.get_title(), fontsize=fontsize)\n", 412 | "\n", 413 | "plt.tight_layout()\n", 414 | "plt.savefig(\"plots/split_approaches.pdf\", bbox_inches=\"tight\")" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [] 423 | } 424 | ], 425 | "metadata": { 426 | "kernelspec": { 427 | "display_name": "Python 3", 428 | "language": "python", 429 | "name": "python3" 430 | }, 431 | "language_info": { 432 | "codemirror_mode": { 433 | "name": "ipython", 434 | "version": 3 435 | }, 436 | "file_extension": ".py", 437 | "mimetype": "text/x-python", 438 | "name": "python", 439 | "nbconvert_exporter": "python", 440 | "pygments_lexer": "ipython3", 441 | "version": "3.7.5" 442 | }, 443 | "toc": { 444 | "nav_menu": { 445 | "height": "208px", 446 | "width": "227px" 447 | }, 448 | "number_sections": false, 449 | "sideBar": true, 450 | "skip_h1_title": false, 451 | "toc_cell": false, 452 | "toc_position": {}, 453 | "toc_section_display": "block", 454 | "toc_window_display": false 455 | }, 456 | "varInspector": { 457 | "cols": { 458 | "lenName": 16, 459 | "lenType": 16, 460 | "lenVar": 40 461 | }, 462 | "kernels_config": { 463 | "python": { 464 | "delete_cmd_postfix": "", 465 | "delete_cmd_prefix": "del ", 466 | "library": "var_list.py", 467 | "varRefreshCmd": "print(var_dic_list())" 468 | }, 469 | "r": { 470 | "delete_cmd_postfix": ") ", 471 | "delete_cmd_prefix": "rm(", 472 | "library": "var_list.r", 473 | "varRefreshCmd": "cat(var_dic_list()) " 474 | } 475 | }, 476 | "types_to_exclude": [ 477 | "module", 478 | "function", 479 | "builtin_function_or_method", 480 | "instance", 481 | "_Feature" 482 | ], 483 | "window_display": false 484 | } 485 | }, 486 | "nbformat": 4, 487 | "nbformat_minor": 2 488 | } 489 | -------------------------------------------------------------------------------- /notebooks/3_Supervised_Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | "Section of the book chapter: 5.1 Supervised Learning Models\n", 9 | "
\n", 10 | "\n", 11 | "# 3. Supervised learning\n", 12 | "\n", 13 | "**Table of Contents**\n", 14 | "\n", 15 | "* [3.1 Linear regression and partial least squares](#3.1-Linear-regression-and-partial-least-squares)\n", 16 | "* [3.2 Tree-based Models](#3.2-Tree-based-Models)\n", 17 | "* [3.3 Support Vector Machines](#3.3-Support-Vector-Machines)\n", 18 | "* [3.4 k-Nearest Neighbors](#3.4-k-Nearest-Neighbors)\n", 19 | "* [3.5 Artificial Neural Networks (ANN)](#3.5-Artificial-Neural-Networks,-ANN)\n", 20 | "* [3.6 SUSI: Supervised Self-organizing Maps in Python](#3.6-SUSI:-Supervised-Self-organizing-Maps-in-Python)\n", 21 | "* [3.7 Overall results](#3.7-Overall-results)\n", 22 | "\n", 23 | "**Learnings:**\n", 24 | "\n", 25 | "- how to implement different supervised machine learning models,\n", 26 | "- how to plot regression results.\n", 27 | "\n", 28 | "\n", 29 | "\n", 30 | "### Packages" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "%matplotlib inline\n", 40 | "%config InlineBackend.figure_format = 'retina'\n", 41 | "import os\n", 42 | "\n", 43 | "import numpy as np\n", 44 | "import pandas as pd\n", 45 | "import seaborn as sns\n", 46 | "import matplotlib.pyplot as plt\n", 47 | "import matplotlib as mpl\n", 48 | "import sklearn.metrics as met\n", 49 | "import datetime\n", 50 | "\n", 51 | "import utils\n", 52 | "\n", 53 | "# ignore warnings\n", 54 | "import warnings\n", 55 | "warnings.filterwarnings('ignore')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Read in Data\n", 63 | "\n", 64 | "**Dataset:** Felix M. Riese and Sina Keller, \"Hyperspectral benchmark dataset on soil moisture\", Dataset, Zenodo, 2018. [DOI:10.5281/zenodo.1227836](http://doi.org/10.5281/zenodo.1227836) and [GitHub](https://github.com/felixriese/hyperspectral-soilmoisture-dataset)\n", 65 | "\n", 66 | "**Introducing paper:** Felix M. Riese and Sina Keller, “Introducing a Framework of Self-Organizing Maps for Regression of Soil Moisture with Hyperspectral Data,” in IGARSS 2018 - 2018 IEEE International Geoscience and Remote Sensing Symposium, Valencia, Spain, 2018, pp. 6151-6154. 
[DOI:10.1109/IGARSS.2018.8517812](https://doi.org/10.1109/IGARSS.2018.8517812)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "X_train, X_test, y_train, y_test = utils.get_xy_split()\n",
76 | "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "### Plot Configurations"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "norm = mpl.colors.Normalize(vmin=np.min([np.min(y_train), np.min(y_test)]),\n",
93 | "                            vmax=np.max([np.max(y_train), np.max(y_test)]))\n",
94 | "cmap = \"cividis_r\""
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "### Results Dataframe"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "results = pd.DataFrame(columns=[\"model\", \"r2\", \"mae\", \"rmse\", \"potential\"])"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "### Metrics\n",
118 | "\n",
119 | "The following functions calculate and print these performance metrics:\n",
120 | "\n",
121 | "* Coefficient of Determination $R^2$\n",
122 | "* Mean Absolute Error (MAE)\n",
123 | "* Root Mean Squared Error (RMSE)"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "def get_regression_metrics(y_pred):\n",
133 | "    global y_test\n",
134 | "    return (\n",
135 | "        met.r2_score(y_test, y_pred),\n",
136 | "        met.mean_absolute_error(y_test, y_pred),\n",
137 | "        np.sqrt(met.mean_squared_error(y_test, y_pred)))\n",
138 | "\n",
139 | "def print_regression_metrics(y_pred, model_name, potential):\n",
140 | "    global results\n",
141 | "    \n",
142 | "    # get and print metrics\n",
143 | "    r2, mae, rmse = get_regression_metrics(y_pred)\n",
144 | "    print(\"R2 = {0:.1f}% \\nMAE = {1:.2f} \\nRMSE = {2:.2f}\".format(\n",
145 | "        r2*100, mae, rmse))\n",
146 | "    \n",
147 | "    # save metrics to dataframe\n",
148 | "    if not ((results[\"model\"]==model_name).any()):\n",
149 | "        rdict = {\n",
150 | "            \"model\": model_name,\n",
151 | "            \"r2\": r2,\n",
152 | "            \"mae\": mae,\n",
153 | "            \"rmse\": rmse,\n",
154 | "            \"potential\": potential}\n",
155 | "        results = pd.concat([results, pd.DataFrame(rdict, index=[0])], ignore_index=True)\n",
156 | "    \n",
157 | "    else:\n",
158 | "        idx = results.index[results['model'] == model_name].tolist()[0]\n",
159 | "        results.at[idx, \"r2\"] = r2\n",
160 | "        results.at[idx, \"mae\"] = mae\n",
161 | "        results.at[idx, \"rmse\"] = rmse\n",
162 | "        results.at[idx, \"potential\"] = potential"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "### Fix Random State"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # disable warning\n",
179 | "import tensorflow as tf\n",
180 | "\n",
181 | "np.random.seed(42)\n",
182 | "tf.random.set_seed(43)"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "***\n",
190 | "\n",
191 | "## 3.1 Linear regression and partial least squares\n",
192 | "\n",
193 | 
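"Both approaches are linear in the input bands: linear regression fits one coefficient per hyperspectral band directly, while partial least squares (PLS) first projects the bands onto a small number of latent components and regresses on those, which tends to be more stable for strongly correlated bands.\n",
"\n",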
"Content:\n", 194 | "\n", 195 | "- [3.1.1 Linear regression](#3.1.1-Linear-regression)\n", 196 | "- [3.1.2 Partial least squares](#3.1.2-Partial-least-squares)\n", 197 | "\n", 198 | "### 3.1.1 Linear regression\n", 199 | "Implementation: [sklearn.linear_model.LinearRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "from sklearn.linear_model import LinearRegression\n", 209 | "\n", 210 | "model_lin = LinearRegression()\n", 211 | "model_lin.fit(X_train, y_train)\n", 212 | "y_pred_lin = model_lin.predict(X_test)\n", 213 | "\n", 214 | "print_regression_metrics(y_pred_lin, \"Linear\", \"-\")\n", 215 | "utils.plot_regression_results(y_test, y_pred_lin, \"Linear\")" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "### 3.1.2 Partial least squares\n", 223 | "Implementation: [sklearn.cross_decomposition.PLSRegression](https://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "from sklearn.cross_decomposition import PLSRegression\n", 233 | "\n", 234 | "model_pls = PLSRegression(n_components=5)\n", 235 | "model_pls.fit(X_train, y_train)\n", 236 | "y_pred_pls = model_pls.predict(X_test)\n", 237 | "\n", 238 | "print_regression_metrics(y_pred_pls, \"PLS\", \"Minor\")\n", 239 | "utils.plot_regression_results(y_test, y_pred_pls, \"PLS\")" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "***\n", 247 | "\n", 248 | "## 3.2 Tree-based Models\n", 249 | "\n", 250 | "Content:\n", 251 | "\n", 252 | "- [3.2.1 Decision Tree](#3.2.1-Decision-Tree)\n", 253 | "- [3.2.2 Bagging: Random Forest & Extremly Randomized Trees](#3.2.2-Bagging:-Random-Forest-&-Extremly-Randomized-Trees)\n", 254 | "- [3.2.3 Boosting: Gradient Boosting](#3.2.3-Boosting:-Gradient-Boosting)\n", 255 | "\n", 256 | "### 3.2.1 Decision Tree\n", 257 | "\n", 258 | "**Source:** Breiman, L., Friedman, J., Olshen, R.A., Stone, C.J.: Classification and regression trees. Chapman and Hall/CRC (1984)\n", 259 | "\n", 260 | "**Algorithm:**\n", 261 | "\n", 262 | "The regression trees algorithm is defined as follows:\n", 263 | "1. Start with the root node.\n", 264 | "2. Start with the most significant feature of the training data.\n", 265 | "3. Divide the input data with (binary) a cut $c_1$ on feature $x_i$, e.g. according to the Gini index, see below.\n", 266 | "4. Divide data along the next best feature on cut $c_j$ for $j=2, 3, \\ldots$\n", 267 | "5. Stop if a condition is met, e.g. maximum number of nodes, maximum depth, maximum purity etc.\n", 268 | "6. 
Every leaf is then averaged and therefore contains one output value.\n", 269 | "\n", 270 | "The Gini index is defined as:\n", 271 | "\n", 272 | "$G = 1 - \\sum_{i=1}^n P_i^2 \\qquad \\text{with } P_i = \\frac{N_i}{N},\\label{eq:gini}$\n", 273 | "\n", 274 | "with $N$ as number of all objects and $N_i$ as number of objects of class $i$.\n", 275 | "\n", 276 | "**Implementation:** [sklearn.tree.DecisionTreeRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "from sklearn.tree import DecisionTreeRegressor\n", 286 | "\n", 287 | "model_dt = DecisionTreeRegressor()\n", 288 | "model_dt.fit(X_train, y_train)\n", 289 | "y_pred_dt = model_dt.predict(X_test)\n", 290 | "\n", 291 | "print_regression_metrics(y_pred_dt, \"Decision Tree\", \"Minor\")\n", 292 | "utils.plot_regression_results(y_test, y_pred_dt, \"Decision Tree\")" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "### 3.2.2 Bagging: Random Forest & Extremly Randomized Trees\n", 300 | "#### Random Forest\n", 301 | "Implementation: [sklearn.ensemble.RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor) " 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "from sklearn.ensemble import RandomForestRegressor\n", 311 | "\n", 312 | "model_rf = RandomForestRegressor(n_estimators=100, oob_score=True)\n", 313 | "model_rf.fit(X_train, y_train)\n", 314 | "y_pred_rf = model_rf.predict(X_test)\n", 315 | "\n", 316 | "print_regression_metrics(y_pred_rf, \"RF\", \"Minor\")\n", 317 | "utils.plot_regression_results(y_test, y_pred_rf, \"RF\")\n", 318 | "\n", 319 | "print(\"Out-of-bag estimate = {0:.1f}%\".format(model_rf.oob_score_*100))" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "#### Extremly Randomized Trees\n", 327 | "Implementation: [sklearn.ensemble.ExtraTreesRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html#sklearn.ensemble.ExtraTreesRegressor)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "from sklearn.ensemble import ExtraTreesRegressor\n", 337 | "\n", 338 | "model_et = ExtraTreesRegressor(n_estimators=100)\n", 339 | "model_et.fit(X_train, y_train)\n", 340 | "y_pred_et = model_et.predict(X_test)\n", 341 | "\n", 342 | "print_regression_metrics(y_pred_et, \"ET\", \"Minor\")\n", 343 | "utils.plot_regression_results(y_test, y_pred_et, \"ET\")" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "#### Feature Importance" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "n_features_plotted = 15\n", 360 | "\n", 361 | "importances = model_rf.feature_importances_\n", 362 | "indices = np.argsort(importances)\n", 363 | "std = np.std([tree.feature_importances_ for tree in model_rf.estimators_], axis=0)\n", 364 | "plt.figure(figsize=(15,5))\n", 365 | "plt.title(\"Feature importances\")\n", 366 | 
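"# note: indices sorts the bands by importance in ascending order, so the\n",
"# slice [125-n_features_plotted:] below selects the most important bands\n",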
"plt.bar(range(X_train.shape[1])[125-n_features_plotted:], importances[indices][125-n_features_plotted:], color=\"r\", yerr=std[indices][125-n_features_plotted:], align=\"center\")\n", 367 | "# If you want to define your own labels,\n", 368 | "# change indices to a list of labels on the following line.\n", 369 | "plt.xticks(range(X_train.shape[1])[125-n_features_plotted:], indices[:n_features_plotted], rotation=90)\n", 370 | "plt.xlim([-1 + 125-n_features_plotted, X_train.shape[1]])\n", 371 | "plt.xlabel(\"Hyperspectral band\")\n", 372 | "plt.ylabel(\"Feature importance\")\n", 373 | "plt.savefig(\"plots/featureimportance_rf.pdf\", bbox_inches=\"tight\")" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "### 3.2.3 Boosting: Gradient Boosting\n", 381 | "Implementation: [sklearn.ensemble.GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#sklearn.ensemble.GradientBoostingRegressor)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "from sklearn.ensemble import GradientBoostingRegressor\n", 391 | "\n", 392 | "model_gb = GradientBoostingRegressor()\n", 393 | "model_gb.fit(X_train, y_train)\n", 394 | "y_pred_gb = model_gb.predict(X_test)\n", 395 | "\n", 396 | "print_regression_metrics(y_pred_gb, \"GB\", \"Minor\")\n", 397 | "utils.plot_regression_results(y_test, y_pred_gb, \"GB\")" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "***\n", 405 | "\n", 406 | "## 3.3 Support Vector Machines\n", 407 | "Implementation: [sklearn.svm.SVR](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html)\n", 408 | "\n", 409 | "The SVM is tuned with a Grid Search, see [sklearn.model_selection.RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "from sklearn.svm import SVR\n", 419 | "from sklearn.model_selection import RandomizedSearchCV\n", 420 | "\n", 421 | "# 1. find hyperparameters\n", 422 | "params = {\"C\": np.logspace(-8, 8, 17), \"gamma\": np.logspace(-8, 8, 17)}\n", 423 | "rsearch = RandomizedSearchCV(\n", 424 | " estimator=SVR(),\n", 425 | " n_iter=30,\n", 426 | " cv=5,\n", 427 | " n_jobs=-1,\n", 428 | " param_distributions=params)\n", 429 | "rsearch.fit(X_train, y_train)\n", 430 | "model_svm = rsearch.best_estimator_\n", 431 | "\n", 432 | "# 2. 
predict\n", 433 | "model_svm.fit(X_train, y_train)\n", 434 | "y_pred_svm = model_svm.predict(X_test)\n", 435 | "\n", 436 | "print_regression_metrics(y_pred_svm, \"SVM\", \"Minor\")\n", 437 | "utils.plot_regression_results(y_test, y_pred_svm, \"SVM\")" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "***\n", 445 | "\n", 446 | "## 3.4 k-Nearest Neighbors\n", 447 | "\n", 448 | "Types:\n", 449 | "\n", 450 | "- [3.4.1 Without weighting](#3.4.1-Without-weighting)\n", 451 | "- [3.4.2 With distance weighting](#3.4.2-With-distance-weighting)\n", 452 | "\n", 453 | "Implementation: [sklearn.neighbors.KNeighborsRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html#sklearn.neighbors.KNeighborsRegressor)\n", 454 | "\n", 455 | "### 3.4.1 Without weighting" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "from sklearn.neighbors import KNeighborsRegressor\n", 465 | "\n", 466 | "model_knn = KNeighborsRegressor(n_neighbors=5)\n", 467 | "model_knn.fit(X_train, y_train)\n", 468 | "y_pred_knn = model_knn.predict(X_test)\n", 469 | "\n", 470 | "print_regression_metrics(y_pred_knn, \"k-NN\", \"Minor\")\n", 471 | "utils.plot_regression_results(y_test, y_pred_knn, \"kNN\")" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "### 3.4.2 With distance weighting" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "from sklearn.neighbors import KNeighborsRegressor\n", 488 | "\n", 489 | "model_knnw = KNeighborsRegressor(n_neighbors=5, weights=\"distance\")\n", 490 | "model_knnw.fit(X_train, y_train)\n", 491 | "y_pred_knnw = model_knnw.predict(X_test)\n", 492 | "\n", 493 | "print_regression_metrics(y_pred_knnw, \"k-NN (weighted)\", \"Minor\")\n", 494 | "utils.plot_regression_results(y_test, y_pred_knnw, \"kNN weighted\")" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "***\n", 502 | "\n", 503 | "## 3.5 Artificial Neural Networks, ANN\n", 504 | "\n", 505 | "Types:\n", 506 | "\n", 507 | "- [3.5.1 Fully-connected ANNs](#3.5.1-Fully-connected-ANNs)\n", 508 | "- [3.5.2 CNN with Keras and TensorFlow](#3.5.2-CNN-with-Keras-and-TensorFlow)\n", 509 | "\n", 510 | "### 3.5.1 Fully-connected ANNs\n", 511 | "#### scikit-learn\n", 512 | "Implementation: [sklearn.neural_network.MLPRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "from sklearn.neural_network import MLPRegressor\n", 522 | "\n", 523 | "model_ann = MLPRegressor(hidden_layer_sizes=(20, 20, 20), batch_size=10, max_iter=500)\n", 524 | "model_ann.fit(X_train, y_train)\n", 525 | "y_pred_ann = model_ann.predict(X_test)\n", 526 | "\n", 527 | "print_regression_metrics(y_pred_ann, \"ANN (sklearn)\", \"Major\")\n", 528 | "utils.plot_regression_results(y_test, y_pred_ann, \"ANN (sklearn)\")" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "#### Keras with TensorFlow" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "from 
497 | {
498 | "cell_type": "markdown",
499 | "metadata": {},
500 | "source": [
501 | "***\n",
502 | "\n",
503 | "## 3.5 Artificial Neural Networks, ANN\n",
504 | "\n",
505 | "Types:\n",
506 | "\n",
507 | "- [3.5.1 Fully-connected ANNs](#3.5.1-Fully-connected-ANNs)\n",
508 | "- [3.5.2 CNN with Keras and TensorFlow](#3.5.2-CNN-with-Keras-and-TensorFlow)\n",
509 | "\n",
510 | "### 3.5.1 Fully-connected ANNs\n",
511 | "#### scikit-learn\n",
512 | "Implementation: [sklearn.neural_network.MLPRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": null,
518 | "metadata": {},
519 | "outputs": [],
520 | "source": [
521 | "from sklearn.neural_network import MLPRegressor\n",
522 | "\n",
523 | "model_ann = MLPRegressor(hidden_layer_sizes=(20, 20, 20), batch_size=10, max_iter=500)\n",
524 | "model_ann.fit(X_train, y_train)\n",
525 | "y_pred_ann = model_ann.predict(X_test)\n",
526 | "\n",
527 | "print_regression_metrics(y_pred_ann, \"ANN (sklearn)\", \"Major\")\n",
528 | "utils.plot_regression_results(y_test, y_pred_ann, \"ANN (sklearn)\")"
529 | ]
530 | },
531 | {
532 | "cell_type": "markdown",
533 | "metadata": {},
534 | "source": [
535 | "#### Keras with TensorFlow"
536 | ]
537 | },
538 | {
539 | "cell_type": "code",
540 | "execution_count": null,
541 | "metadata": {},
542 | "outputs": [],
543 | "source": [
544 | "from tensorflow import keras\n",
545 | "from tensorflow.keras.models import Sequential\n",
546 | "from tensorflow.keras.layers import Dense\n",
547 | "\n",
548 | "keras.backend.clear_session()\n",
549 | "\n",
550 | "# define model\n",
551 | "model = Sequential()\n",
552 | "model.add(Dense(20, input_dim=X_train.shape[1], activation=\"relu\"))\n",
553 | "model.add(Dense(10, activation=\"relu\"))\n",
554 | "model.add(Dense(1, activation=\"linear\"))\n",
555 | "\n",
556 | "# compile and train model (the test set is only used to monitor the validation loss)\n",
557 | "model.compile(loss=\"mean_squared_error\", optimizer=\"nadam\")\n",
558 | "model.fit(X_train, y_train, epochs=1000, verbose=0, batch_size=10,\n",
559 | "          validation_data=(X_test, y_test))\n",
560 | "y_pred_annk = model.predict(X_test)\n",
561 | "\n",
562 | "print_regression_metrics(y_pred_annk, \"ANN (keras)\", \"Major\")\n",
563 | "utils.plot_regression_results(y_test, y_pred_annk, \"ANN (keras)\")"
564 | ]
565 | },
566 | {
567 | "cell_type": "markdown",
568 | "metadata": {},
569 | "source": [
570 | "### 3.5.2 CNN with Keras and TensorFlow"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": null,
576 | "metadata": {},
577 | "outputs": [],
578 | "source": [
579 | "from tensorflow import keras\n",
580 | "from tensorflow.keras.models import Sequential\n",
581 | "from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten\n",
582 | "\n",
583 | "keras.backend.clear_session()\n",
584 | "\n",
585 | "# define model\n",
586 | "model = Sequential()\n",
587 | "\n",
588 | "model.add(Conv1D(filters=8, kernel_size=3, activation=\"relu\",\n",
589 | "                 input_shape=(X_train.shape[1],1)))\n",
590 | "model.add(MaxPooling1D(pool_size=2))\n",
591 | "\n",
592 | "model.add(Conv1D(filters=16, kernel_size=3, activation=\"relu\"))\n",
593 | "model.add(MaxPooling1D(pool_size=2))\n",
594 | "\n",
595 | "model.add(Conv1D(filters=32, kernel_size=3, activation=\"relu\"))\n",
596 | "model.add(MaxPooling1D(pool_size=2))\n",
597 | "\n",
598 | "model.add(Flatten())\n",
599 | "\n",
600 | "model.add(Dense(20, activation=\"relu\"))\n",
601 | "model.add(Dense(1, activation=\"linear\"))\n",
602 | "\n",
603 | "# compile and train model; inputs are reshaped to (samples, bands, 1) for Conv1D\n",
604 | "model.compile(loss=\"mean_squared_error\", optimizer=\"nadam\")\n",
605 | "model.fit(X_train.reshape(X_train.shape[0], X_train.shape[1], 1), y_train,\n",
606 | "          epochs=500, verbose=0, batch_size=10,\n",
607 | "          validation_data=(X_test.reshape(X_test.shape[0], X_test.shape[1], 1), y_test))\n",
608 | "y_pred_cnn = model.predict(X_test.reshape(X_test.shape[0], X_test.shape[1], 1))\n",
609 | "\n",
610 | "print_regression_metrics(y_pred_cnn, \"CNN\", \"Major\")\n",
611 | "utils.plot_regression_results(y_test, y_pred_cnn, \"CNN\")"
612 | ]
613 | },
614 | {
615 | "cell_type": "markdown",
616 | "metadata": {},
617 | "source": [
618 | "***\n",
619 | "\n",
620 | "## 3.6 SUSI: Supervised Self-organizing Maps in Python\n",
621 | "Implementation: [felixriese/susi](https://github.com/felixriese/susi)"
622 | ]
623 | },
624 | {
625 | "cell_type": "code",
626 | "execution_count": null,
627 | "metadata": {},
628 | "outputs": [],
629 | "source": [
630 | "import susi\n",
631 | "\n",
632 | "model_som = susi.SOMRegressor(\n",
633 | "    n_rows=35,\n",
634 | "    n_columns=35,\n",
635 | "    n_iter_unsupervised=10000,\n",
636 | "    n_iter_supervised=10000,\n",
637 | "    n_jobs=-1)\n",
638 | "model_som.fit(X_train, y_train)\n",
639 | "y_pred_som = model_som.predict(X_test)\n",
640 | "\n",
641 | "print_regression_metrics(y_pred_som, \"SOM\", \"Minor\")\n",
642 | "utils.plot_regression_results(y_test, y_pred_som, \"SOM\")"
"utils.plot_regression_results(y_test, y_pred_som, \"SOM\")" 643 | ] 644 | }, 645 | { 646 | "cell_type": "markdown", 647 | "metadata": {}, 648 | "source": [ 649 | "***\n", 650 | "\n", 651 | "## 3.7 Overall results" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": null, 657 | "metadata": {}, 658 | "outputs": [], 659 | "source": [ 660 | "# save results to CSV\n", 661 | "dt = datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n", 662 | "results.to_csv(\"results/results_\"+dt+\".csv\")" 663 | ] 664 | }, 665 | { 666 | "cell_type": "raw", 667 | "metadata": {}, 668 | "source": [ 669 | "# load results from CSV\n", 670 | "# results = pd.read_csv(\"results/results.csv\")\n", 671 | "results = pd.read_csv(\"results/results_20190704-112011.csv\")" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "# plot horizontal bar plot for results\n", 681 | "fig, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(15,5))\n", 682 | "\n", 683 | "results.plot(x=\"model\", y=\"r2\", kind=\"barh\", ax=ax1, title=\"$R^2$\", legend=False)\n", 684 | "results.plot(x=\"model\", y=\"mae\", kind=\"barh\", ax=ax2, title=\"MAE\", legend=False)\n", 685 | "results.plot(x=\"model\", y=\"rmse\", kind=\"barh\", ax=ax3, title=\"RMSE\", legend=False)\n", 686 | "for ax in [ax1, ax2, ax3]:\n", 687 | " ax.set_ylabel(\"\")\n", 688 | "\n", 689 | "plt.tight_layout()\n", 690 | "plt.savefig(\"plots/results_bar.pdf\", bbox_inches=\"tight\")" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "# generate LaTeX table\n", 700 | "utils.write_results_to_latex_table(results)" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [] 709 | } 710 | ], 711 | "metadata": { 712 | "kernelspec": { 713 | "display_name": "Python 3 (ipykernel)", 714 | "language": "python", 715 | "name": "python3" 716 | }, 717 | "language_info": { 718 | "codemirror_mode": { 719 | "name": "ipython", 720 | "version": 3 721 | }, 722 | "file_extension": ".py", 723 | "mimetype": "text/x-python", 724 | "name": "python", 725 | "nbconvert_exporter": "python", 726 | "pygments_lexer": "ipython3", 727 | "version": "3.10.12" 728 | }, 729 | "toc": { 730 | "nav_menu": {}, 731 | "number_sections": false, 732 | "sideBar": true, 733 | "skip_h1_title": false, 734 | "toc_cell": false, 735 | "toc_position": {}, 736 | "toc_section_display": "block", 737 | "toc_window_display": false 738 | }, 739 | "varInspector": { 740 | "cols": { 741 | "lenName": 16, 742 | "lenType": 16, 743 | "lenVar": 40 744 | }, 745 | "kernels_config": { 746 | "python": { 747 | "delete_cmd_postfix": "", 748 | "delete_cmd_prefix": "del ", 749 | "library": "var_list.py", 750 | "varRefreshCmd": "print(var_dic_list())" 751 | }, 752 | "r": { 753 | "delete_cmd_postfix": ") ", 754 | "delete_cmd_prefix": "rm(", 755 | "library": "var_list.r", 756 | "varRefreshCmd": "cat(var_dic_list()) " 757 | } 758 | }, 759 | "types_to_exclude": [ 760 | "module", 761 | "function", 762 | "builtin_function_or_method", 763 | "instance", 764 | "_Feature" 765 | ], 766 | "window_display": false 767 | } 768 | }, 769 | "nbformat": 4, 770 | "nbformat_minor": 4 771 | } 772 | --------------------------------------------------------------------------------