├── notebooks
│   ├── plots
│   │   └── emptyfileforgit.txt
│   ├── results
│   │   └── emptyfileforgit.txt
│   ├── utils.py
│   ├── 5_Model_Selection_and_Evaluation.ipynb
│   ├── 1_Data.ipynb
│   └── 3_Supervised_Learning.ipynb
├── CHANGELOG.rst
├── .travis.yml
├── .gitignore
├── requirements.txt
├── bibliography.bib
├── CITATION.cff
├── .github
│   └── workflows
│       └── tests.yml
├── LICENSE
└── README.rst

/notebooks/plots/emptyfileforgit.txt:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/notebooks/results/emptyfileforgit.txt:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
1 | Change Log
2 | ==========
3 | 
4 | [1.0.1] - 
5 | ------------------------
6 | - [CHANGED] Tensorflow 1 -> 2
7 | - [REMOVED] Keras
8 | 
9 | [1.0.0] - 2019-09-19
10 | ------------------------
11 | - Initial release
12 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist: xenial
2 | language: python
3 | python:
4 |   - "3.6"
5 |   - "3.7"
6 |   - "3.8"
7 | install:
8 |   - pip install -r requirements.txt
9 | script:
10 |   - pytest --nbval --cov=.
11 | after_success:
12 |   - codecov
13 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | .DS_Store
3 | .ipynb_checkpoints
4 | __pycache__
5 | notebooks/plots/*
6 | !notebooks/plots/emptyfileforgit.txt
7 | notebooks/results/*
8 | !notebooks/results/emptyfileforgit.txt
9 | .pytest_cache
10 | notebooks/tmp/
11 | notebooks/7_AutoML.ipynb
12 | .coverage
13 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow>=2.5.0
2 | notebook>=6.0.0
3 | jupyter
4 | ipywidgets
5 | susi
6 | pandas
7 | matplotlib
8 | seaborn
9 | modAL-python==0.4.2.1
10 | bayesian-optimization
11 | umap-learn>=0.3.10
12 | tqdm
13 | patchify
14 | scipy==1.10.1
15 | 
16 | # for testing
17 | pytest>=5.1.1
18 | pytest-cov
19 | codecov
20 | nbval>=0.9.3
21 | 
--------------------------------------------------------------------------------
/bibliography.bib:
--------------------------------------------------------------------------------
1 | @incollection{riese2020supervised,
2 |     author = {Riese, Felix~M. and Keller, Sina},
3 |     title ={{Supervised, Semi-Supervised, and Unsupervised Learning for
4 |     Hyperspectral Regression}},
5 |     booktitle = {{Hyperspectral Image Analysis: Advances in Machine
6 |     Learning and Signal Processing}},
7 |     editor = {Prasad, Saurabh and Chanussot, Jocelyn},
8 |     year = {2020},
9 |     publisher = {Springer International Publishing},
10 |     address = {Cham},
11 |     chapter = {7},
12 |     pages = {187--232},
13 |     doi = {10.1007/978-3-030-38617-7_7},
14 | }
15 | 
16 | @misc{riese2019hyperspectral,
17 |     author = {Riese, Felix~M. and Keller, Sina},
18 |     title = {{Hyperspectral Regression: Code Examples}},
19 |     year = {2019},
20 |     DOI = {10.5281/zenodo.3450676},
21 |     publisher = {Zenodo},
22 |     howpublished = {\href{https://doi.org/10.5281/zenodo.3450676}{doi.org/10.5281/zenodo.3450676}}
23 | }
24 | 
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite both the article from preferred-citation and the software itself."
3 | authors:
4 |   - family-names: Riese
5 |     given-names: Felix M.
6 |     orcid: https://orcid.org/0000-0003-0596-9585
7 |   - family-names: Keller
8 |     given-names: Sina
9 |     orcid: https://orcid.org/0000-0002-7710-5316
10 | title: "Hyperspectral Regression: Code Examples"
11 | version: 1.0.0
12 | doi: "10.5281/zenodo.3450676"
13 | date-released: 2019-09-19
14 | repository-code: https://github.com/felixriese/hyperspectral-regression
15 | license: BSD-3-Clause
16 | preferred-citation:
17 |   authors:
18 |     - family-names: Riese
19 |       given-names: Felix M.
20 |     - family-names: Keller
21 |       given-names: Sina
22 |   title: "Supervised, Semi-Supervised, and Unsupervised Learning for Hyperspectral Regression"
23 |   type: book
24 |   year: 2020
25 |   doi: "10.1007/978-3-030-38617-7_7"
26 |   publisher:
27 |     name: "Springer International Publishing"
28 |     city: "Cham"
29 |   collection-title: "Hyperspectral Image Analysis: Advances in Machine Learning and Signal Processing"
30 |   start: 187
31 |   end: 232
32 | 
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 | 
3 | on:
4 |   push:
5 |     branches: [ master ]
6 |   pull_request:
7 |     branches: "*"
8 | 
9 | jobs:
10 |   build:
11 | 
12 |     runs-on: ubuntu-latest
13 |     strategy:
14 |       matrix:
15 |         python-version: ["3.8", "3.9", "3.10"]
16 | 
17 |     steps:
18 |     - uses: actions/checkout@v2
19 |     - name: Set up Python ${{ matrix.python-version }}
20 |       uses: actions/setup-python@v2
21 |       with:
22 |         python-version: ${{ matrix.python-version }}
23 |     - name: Install dependencies
24 |       run: |
25 |         python -m pip install --upgrade pip
26 |         pip install -r requirements.txt
27 |     - name: List of installed packages
28 |       run: |
29 |         pip list
30 |     - name: Test with pytest
31 |       if: ${{ matrix.python-version != '3.9' }}
32 |       run: |
33 |         pytest --nbval
34 |     - name: Test with pytest and Codecov
35 |       if: ${{ matrix.python-version == '3.9' }}
36 |       run: |
37 |         pip install pytest-cov
38 |         pytest --nbval --cov=. --cov-report=xml
39 |     - name: Upload coverage to Codecov
40 |       if: ${{ matrix.python-version == '3.9' }}
41 |       uses: codecov/codecov-action@v1
42 |       with:
43 |         verbose: true
44 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2019, Felix M. Riese and Sina Keller, Karlsruhe Institute of Technology
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://img.shields.io/github/license/felixriese/hyperspectral-regression 2 | :target: LICENSE 3 | :alt: License: BSD-3-Clause 4 | 5 | .. image:: https://mybinder.org/badge_logo.svg 6 | :target: https://mybinder.org/v2/gh/felixriese/hyperspectral-regression/master?filepath=notebooks 7 | :alt: MyBinder 8 | 9 | .. image:: https://travis-ci.com/felixriese/hyperspectral-regression.svg?branch=master 10 | :target: https://travis-ci.com/felixriese/hyperspectral-regression 11 | :alt: Travis.CI Status 12 | 13 | .. image:: https://codecov.io/gh/felixriese/hyperspectral-regression/branch/master/graph/badge.svg 14 | :target: https://codecov.io/gh/felixriese/hyperspectral-regression 15 | :alt: Codecov 16 | 17 | .. image:: https://api.codacy.com/project/badge/Grade/6808eea2d5984c7d8364f7659b40f9ea 18 | :target: https://www.codacy.com/manual/felixriese/hyperspectral-regression?utm_source=github.com&utm_medium=referral&utm_content=felixriese/hyperspectral-regression&utm_campaign=Badge_Grade 19 | :alt: Codacy Status 20 | 21 | Hyperspectral Regression: Code Examples 22 | =============================================== 23 | 24 | This repository consists of additional material and exemplary implementations for our book chapter. 25 | 26 | The code in this repository is provided via notebooks. The notebooks are structured as follows: 27 | 28 | 1. `Data `_ 29 | 2. `Features `_ 30 | 3. `Supervised Learning `_ 31 | 4. `Active Learning `_ 32 | 5. `Model Selection and Evaluation `_ 33 | 6. `Generative Adversarial Networks `_ 34 | 35 | Description 36 | ----------- 37 | 38 | 39 | 40 | :License: 41 | `3-Clause BSD license `_ 42 | 43 | :Authors: 44 | `Felix M. Riese `_, `Sina Keller `_ 45 | 46 | :Citation: 47 | see `Citation`_ 48 | 49 | :Paper: 50 | `Riese and Keller (2020) `_ 51 | 52 | :Requirements: 53 | Python 3 with these `packages `_ 54 | 55 | 56 | How to use this repository? 57 | --------------------------- 58 | 59 | 1. Install Python 3, e.g. with `Anaconda `_ 60 | 61 | 2. Install the required packages 62 | 63 | conda install --file requirements.txt 64 | 65 | 3. 
Start jupyter 66 | 67 | jupyter notebook 68 | 69 | 4. Open the notebook folder in this repository in the Jupyter browser and select the desired notebook. 70 | 71 | ---- 72 | 73 | Citation 74 | -------- 75 | 76 | The bibtex file including both references is available in `bibliography.bib 77 | `_. 78 | 79 | **Paper:** 80 | 81 | Felix M. Riese and Sina Keller, "Supervised, Semi-Supervised, and Unsupervised 82 | Learning for Hyperspectral Regression", in *Hyperspectral Image Analysis: 83 | Advances in Machine Learning and Signal Processing*, Saurabh Prasad and Jocelyn 84 | Chanussot, Eds. Cham: Springer International Publishing, 2020, ch. 7, 85 | pp. 187–232, `doi:10.1007/978-3-030-38617-7_7 `_. 86 | 87 | .. code:: bibtex 88 | 89 | @incollection{riese2020supervised, 90 | author = {Riese, Felix~M. and Keller, Sina}, 91 | title ={{Supervised, Semi-Supervised, and Unsupervised Learning for 92 | Hyperspectral Regression}}, 93 | booktitle = {{Hyperspectral Image Analysis: Advances in Machine 94 | Learning and Signal Processing}}, 95 | editor = {Prasad, Saurabh and Chanussot, Jocelyn}, 96 | year = {2020}, 97 | publisher = {Springer International Publishing}, 98 | address = {Cham}, 99 | chapter = {7}, 100 | pages = {187--232}, 101 | doi = {10.1007/978-3-030-38617-7_7}, 102 | } 103 | 104 | **Code:** 105 | 106 | Felix M. Riese and Sina Keller, "Hyperspectral Regression: Code Examples", 107 | Zenodo, `doi:10.5281/zenodo.3450676 `_, 108 | 2019. 109 | 110 | .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.3450676.svg 111 | :target: https://doi.org/10.5281/zenodo.3450676 112 | :alt: DOI 113 | 114 | .. code:: bibtex 115 | 116 | @misc{riese2019hyperspectral, 117 | author = {Riese, Felix~M. and Keller, Sina}, 118 | title = {{Hyperspectral Regression: Code Examples}}, 119 | year = {2019}, 120 | DOI = {10.5281/zenodo.3450676}, 121 | publisher = {Zenodo}, 122 | howpublished = {\href{https://doi.org/10.5281/zenodo.3450676}{doi.org/10.5281/zenodo.3450676}} 123 | } 124 | -------------------------------------------------------------------------------- /notebooks/utils.py: -------------------------------------------------------------------------------- 1 | """Package with helper functions.""" 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.model_selection import train_test_split 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | def get_xy(): 9 | """Download and format the data.""" 10 | # load dataframe 11 | path = ("https://raw.githubusercontent.com/felixriese/hyperspectral" 12 | "-soilmoisture-dataset/master/soilmoisture_dataset.csv") 13 | df = pd.read_csv(path, index_col=0) 14 | 15 | # get features (= hyperspectral bands): 16 | features = [col for col in df.columns if col.isdigit()] 17 | 18 | X = df[features].values 19 | y = df["soil_moisture"].values 20 | 21 | return X, y 22 | 23 | 24 | def get_xy_split(missing_rate=0.0): 25 | """Split data. 26 | 27 | Parameters 28 | ---------- 29 | missing_rate : float 30 | Percentage of missing data for semi-supervised learning. 
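Used as a fraction in [0, 1]: each training label is masked (set to -1) with this probability, which simulates unlabeled data for the semi-supervised and active learning cases below.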
31 | 32 | """ 33 | X, y = get_xy() 34 | 35 | X_train, X_test, y_train, y_test = train_test_split( 36 | X, y, test_size=0.5, random_state=42, shuffle=True) 37 | 38 | if missing_rate == 0.0: 39 | return X_train, X_test, y_train, y_test 40 | 41 | # semi-supervised and active case 42 | else: 43 | rng = np.random.RandomState(42) 44 | random_unlabeled_points = rng.rand(len(y_train)) < missing_rate 45 | y_train_semi = np.copy(y_train) 46 | y_train_semi[random_unlabeled_points] = -1 47 | 48 | return X_train, X_test, y_train_semi, y_test, y_train 49 | 50 | 51 | def get_xy_shifted(cut=35): 52 | """Generate dataset shift in data. 53 | 54 | Parameters 55 | ---------- 56 | cut : int 57 | Cut at which the target variable is shifted. 58 | 59 | Returns 60 | ------- 61 | X_train, X_test, y_test, y_train : np.arrays 62 | Training and test datasets with input data `X` and target variable `y`. 63 | 64 | """ 65 | X, y = get_xy() 66 | 67 | mask = y < cut 68 | X_train = X[mask] 69 | y_train = y[mask] 70 | X_test = X[~mask] 71 | y_test = y[~mask] 72 | 73 | return X_train, X_test, y_test, y_train 74 | 75 | 76 | def write_results_to_latex_table(results, filename="results"): 77 | """Generate LaTeX table with results.""" 78 | with open("results/"+filename+".tex", "w") as f: 79 | f.write("\documentclass{article}\n") 80 | f.write("\\usepackage{booktabs}\n") 81 | f.write("\\usepackage{multirow}\n") 82 | f.write("\\usepackage{siunitx}\n") 83 | f.write("\\begin{document}\n") 84 | f.write("\\begin{table}\n") 85 | f.write("\t\centering\n") 86 | f.write("\t\caption{Regression results for soil moisture.}\n") 87 | f.write("\t\\begin{tabular}{lSSSl}\n") 88 | f.write("\t\t\\toprule\n") 89 | f.write("\t\t Model &{$R^2$ in $\\%$} &{MAE} &{RMSE} & {Potential}\\\\\n") 90 | f.write("\t\t\midrule\n") 91 | for i in range(len(results["model"])): 92 | f.write("\t\t{model:10} & {r2:.1f} & {mae:.2f} & {rmse:.1f} & {potential}\\\\\n" 93 | .format(model=results["model"][i], 94 | r2=results["r2"][i]*100, 95 | mae=results["mae"][i], 96 | rmse=results["rmse"][i], 97 | potential=results["potential"][i])) 98 | f.write("\t\t\\bottomrule\n") 99 | f.write("\t\end{tabular}\n") 100 | f.write("\t\label{tab:supervised_results}\n") 101 | f.write("\end{table}\n") 102 | f.write("\end{document}\n") 103 | 104 | 105 | def plot_regression_results(truth, pred, model_name): 106 | """Plot regression results. 107 | 108 | Parameters 109 | ---------- 110 | truth : np.array 111 | Array of true values y. 112 | pred : np.array 113 | Array of predicted values y_pred. 114 | model_name : str 115 | Name of the model. 116 | 117 | """ 118 | _, ax = plt.subplots(1, 1, figsize=(5, 5)) 119 | fontsize = 15 120 | 121 | # plot data 122 | plt.scatter(truth, pred, label="Datapoints", alpha=0.3) 123 | 124 | # set min and max 125 | pmin = np.min([np.min(truth), np.min(pred)]) - 1. 126 | pmax = np.max([np.max(truth), np.max(pred)]) + 1. 
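# the +/- 1 padding above widens the axis range so that extreme points are not drawn directly on the plot border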
127 | plt.xlim(pmin, pmax) 128 | plt.ylim(pmin, pmax) 129 | 130 | # plot line 131 | plt.plot(np.linspace(pmin, pmax, 20), np.linspace(pmin, pmax, 20), 132 | linestyle="dashed", c="tab:red", label="Ideal estimation") 133 | 134 | plt.xlabel("Soil moisture (measured) in %", fontsize=fontsize) 135 | plt.ylabel("Soil moisture (estimated) in %", fontsize=fontsize) 136 | plt.legend(fontsize=fontsize*0.8) 137 | plt.title(model_name, fontsize=fontsize) 138 | for tick in ax.xaxis.get_major_ticks()[1::2]: 139 | tick.label.set_visible(False) 140 | for tick in ax.xaxis.get_major_ticks(): 141 | tick.label.set_fontsize(fontsize) 142 | for tick in ax.yaxis.get_major_ticks()[1::2]: 143 | tick.label.set_visible(False) 144 | for tick in ax.yaxis.get_major_ticks(): 145 | tick.label.set_fontsize(fontsize) 146 | plt.savefig("plots/truthestimation_"+model_name.replace( 147 | " ", "").lower()+".pdf", bbox_inches="tight") 148 | -------------------------------------------------------------------------------- /notebooks/5_Model_Selection_and_Evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | "Section of the book chapter: 5.3 Model Selection, Optimization and Evaluation\n", 9 | "
\n", 10 | "\n", 11 | "# 5. Model Selection and Evaluation\n", 12 | "\n", 13 | "**Table of Contents**\n", 14 | "\n", 15 | "* [5.1 Hyperparameter Optimization](#5.1-Hyperparameter-Optimization)\n", 16 | "* [5.2 Model Evaluation](#5.2-Model-Evaluation)\n", 17 | "\n", 18 | "**Learnings:**\n", 19 | "\n", 20 | "- how to optimize machine learning (ML) models with grid search, random search and Bayesian optimization,\n", 21 | "- how to evaluate ML models.\n", 22 | "\n", 23 | "\n", 24 | "\n", 25 | "### Packages" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "%matplotlib inline\n", 35 | "%config InlineBackend.figure_format = 'retina'\n", 36 | "\n", 37 | "# ignore warnings\n", 38 | "import warnings\n", 39 | "warnings.filterwarnings('ignore')\n", 40 | "\n", 41 | "import numpy as np\n", 42 | "import pandas as pd\n", 43 | "import seaborn as sns\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "import matplotlib as mpl\n", 46 | "from sklearn.ensemble import RandomForestRegressor\n", 47 | "\n", 48 | "import utils" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Read in Data\n", 56 | "\n", 57 | "**Dataset:** Felix M. Riese and Sina Keller, \"Hyperspectral benchmark dataset on soil moisture\", Dataset, Zenodo, 2018. [DOI:10.5281/zenodo.1227836](http://doi.org/10.5281/zenodo.1227836) and [GitHub](https://github.com/felixriese/hyperspectral-soilmoisture-dataset)\n", 58 | "\n", 59 | "**Introducing paper:** Felix M. Riese and Sina Keller, “Introducing a Framework of Self-Organizing Maps for Regression of Soil Moisture with Hyperspectral Data,” in IGARSS 2018 - 2018 IEEE International Geoscience and Remote Sensing Symposium, Valencia, Spain, 2018, pp. 6151-6154. 
[DOI:10.1109/IGARSS.2018.8517812](https://doi.org/10.1109/IGARSS.2018.8517812)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "X_train, X_test, y_train, y_test = utils.get_xy_split()\n", 69 | "\n", 70 | "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### Fix Random State" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "np.random.seed(42)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "***\n", 94 | "\n", 95 | "## 5.1 Hyperparameter Optimization\n", 96 | "\n", 97 | "Content:\n", 98 | "\n", 99 | "- [5.1.1 Grid Search](#5.1.1-Grid-Search)\n", 100 | "- [5.1.2 Randomized Search](#5.1.2-Randomized-Search)\n", 101 | "- [5.1.3 Bayesian Optimization](#5.1.3-Bayesian-Optimization)\n", 102 | "\n", 103 | "### 5.1.1 Grid Search" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# NBVAL_IGNORE_OUTPUT\n", 113 | "\n", 114 | "from sklearn.svm import SVR\n", 115 | "from sklearn.model_selection import GridSearchCV\n", 116 | "\n", 117 | "# example mode: support vector regressor\n", 118 | "model = SVR(kernel=\"rbf\")\n", 119 | "\n", 120 | "# define parameter grid to be tested\n", 121 | "params = {\n", 122 | " \"C\": np.logspace(-4, 4, 9),\n", 123 | " \"gamma\": np.logspace(-4, 4, 9)}\n", 124 | "\n", 125 | "\n", 126 | "# set up grid search and run it on the data\n", 127 | "gs = GridSearchCV(model, params)\n", 128 | "%timeit gs.fit(X_train, y_train)\n", 129 | "print(\"R2 score = {0:.2f} %\".format(gs.score(X_test, y_test)*100))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "### 5.1.2 Randomized Search" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# NBVAL_IGNORE_OUTPUT\n", 146 | "\n", 147 | "from sklearn.svm import SVR\n", 148 | "from sklearn.model_selection import RandomizedSearchCV\n", 149 | "\n", 150 | "# example mode: support vector regressor\n", 151 | "model = SVR(kernel=\"rbf\")\n", 152 | "\n", 153 | "# define parameter grid to be tested\n", 154 | "params = {\n", 155 | " \"C\": np.logspace(-4, 4, 9),\n", 156 | " \"gamma\": np.logspace(-4, 4, 9)}\n", 157 | "\n", 158 | "# set up grid search and run it on the data\n", 159 | "gsr = RandomizedSearchCV(model, params, n_iter=15, refit=True)\n", 160 | "%timeit gsr.fit(X_train, y_train)\n", 161 | "print(\"R2 score = {0:.2f} %\".format(gsr.score(X_test, y_test)*100))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "### 5.1.3 Bayesian Optimization\n", 169 | "\n", 170 | "Implementation: [github.com/fmfn/BayesianOptimization](https://github.com/fmfn/BayesianOptimization)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# NBVAL_IGNORE_OUTPUT\n", 180 | "\n", 181 | "from sklearn.svm import SVR\n", 182 | "from bayes_opt import BayesianOptimization\n", 183 | "\n", 184 | "# define function to be optimized\n", 185 | "def opt_func(C, gamma):\n", 186 | " model = SVR(C=C, gamma=gamma)\n", 187 | " return model.fit(X_train, 
y_train).score(X_test, y_test)\n", 188 | "\n", 189 | "# set bounded region of parameter space\n", 190 | "pbounds = {'C': (1e-5, 1e4), 'gamma': (1e-5, 1e4)}\n", 191 | "\n", 192 | "# define optimizer\n", 193 | "optimizer = BayesianOptimization(\n", 194 | " f=opt_func,\n", 195 | " pbounds=pbounds,\n", 196 | " random_state=1)\n", 197 | "\n", 198 | "# optimize\n", 199 | "%time optimizer.maximize(init_points=2, n_iter=15)\n", 200 | "print(\"R2 score = {0:.2f} %\".format(optimizer.max[\"target\"]*100))" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "***\n", 208 | "\n", 209 | "## 5.2 Model Evaluation\n", 210 | "\n", 211 | "Content:\n", 212 | "\n", 213 | "- [5.2.1 Generate Exemplary Data](#5.2.1-Generate-Exemplary-Data)\n", 214 | "- [5.2.2 Plot the Data](#5.2.2-Plot-the-Data)\n", 215 | "- [5.2.3 Evaluation Metrics](#5.2.3-Evaluation-Metrics)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "import sklearn.metrics as me" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### 5.2.1 Generate Exemplary Data" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "### generate example data\n", 241 | "np.random.seed(1)\n", 242 | "\n", 243 | "# define x grid\n", 244 | "x_grid = np.linspace(0, 10, 11)\n", 245 | "y_model = x_grid*0.5\n", 246 | "\n", 247 | "# define first dataset without outlier\n", 248 | "y1 = np.array([y + np.random.normal(scale=0.2) for y in y_model])\n", 249 | "\n", 250 | "# define second dataset with outlier\n", 251 | "y2 = np.copy(y1)\n", 252 | "y2[9] = 0.5\n", 253 | "\n", 254 | "# define third dataset with higher variance\n", 255 | "y3 = np.array([y + np.random.normal(scale=1.0) for y in y_model])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "### 5.2.2 Plot the Data" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "# plot example data\n", 272 | "fig, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(12,4))\n", 273 | "fontsize = 18\n", 274 | "titleweight = \"bold\"\n", 275 | "titlepad = 10\n", 276 | "\n", 277 | "scatter_label = \"Data\"\n", 278 | "scatter_alpha = 0.7\n", 279 | "scatter_s = 100\n", 280 | "ax1.scatter(x_grid, y1, label=scatter_label, alpha=scatter_alpha, s=scatter_s)\n", 281 | "ax1.set_title(\"(a) Low var.\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 282 | "\n", 283 | "ax2.scatter(x_grid, y2, label=scatter_label, alpha=scatter_alpha, s=scatter_s)\n", 284 | "ax2.set_title(\"(b) Low var. 
+ outlier\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 285 | "\n", 286 | "ax3.scatter(x_grid, y3, label=scatter_label, alpha=scatter_alpha, s=scatter_s)\n", 287 | "ax3.set_title(\"(c) Higher var.\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 288 | "\n", 289 | "for i, ax in enumerate([ax1, ax2, ax3]):\n", 290 | " i += 1\n", 291 | " \n", 292 | " # red line\n", 293 | " ax.plot(x_grid, y_model, label=\"Model\", c=\"tab:red\", linestyle=\"dashed\", linewidth=4, alpha=scatter_alpha)\n", 294 | " \n", 295 | " # x-axis cosmetics\n", 296 | " ax.set_xlabel(\"x in a.u.\", fontsize=fontsize)\n", 297 | " for tick in ax.xaxis.get_major_ticks():\n", 298 | " tick.label.set_fontsize(fontsize) \n", 299 | " \n", 300 | " # y-axis cosmetics\n", 301 | " if i != 1:\n", 302 | " ax.set_yticklabels([])\n", 303 | " else:\n", 304 | " ax.set_ylabel(\"y in a.u.\", fontsize=fontsize, rotation=90)\n", 305 | " for tick in ax.yaxis.get_major_ticks():\n", 306 | " tick.label.set_fontsize(fontsize) \n", 307 | " ax.set_xlim(-0.5, 10.5)\n", 308 | " ax.set_ylim(-0.5, 6.5)\n", 309 | " # ax.set_title(\"Example \"+str(i), fontsize=fontsize)\n", 310 | " if i == 2:\n", 311 | " ax.legend(loc=2, fontsize=fontsize*1.0, frameon=True)\n", 312 | "\n", 313 | "plt.tight_layout()\n", 314 | "plt.savefig(\"plots/metrics_plot.pdf\", bbox_inches=\"tight\")" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "### 5.2.3 Evaluation Metrics" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# calculating the metrics\n", 331 | "for i, y in enumerate([y1, y2, y3]):\n", 332 | " print(\"Example\", i+1)\n", 333 | " print(\"- MAE = {:.2f}\".format(me.mean_absolute_error(y_model, y)))\n", 334 | " print(\"- MSE = {:.2f}\".format(me.mean_squared_error(y_model, y)))\n", 335 | " print(\"- RMSE = {:.2f}\".format(np.sqrt(me.mean_squared_error(y_model, y))))\n", 336 | " print(\"- R2 = {:.2f}%\".format(me.r2_score(y_model, y)*100))\n", 337 | " print(\"-\"*20)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "raw", 342 | "metadata": {}, 343 | "source": [ 344 | "# print out for LaTeX table\n", 345 | "\n", 346 | "descriptions = {\n", 347 | " 1: \"Low variance\",\n", 348 | " 2: \"Low variance and one outlier\",\n", 349 | " 3: \"Higher variance\",}\n", 350 | "bold = [[False, False, False, False], [False, True, True, True], [True, False, False, False]]\n", 351 | "def make_bold(is_bold):\n", 352 | " if is_bold:\n", 353 | " return \"\\\\bfseries\"\n", 354 | " return \"\"\n", 355 | "\n", 356 | "for i, y in enumerate([y1, y2, y3]):\n", 357 | " print(\"{description} & {bold1} {mae:.2f} & {bold2} {mse:.2f} & {bold3} {rmse:.2f} & {bold4} {r2:.2f} \\\\\\\\\".format(\n", 358 | " description=descriptions[i+1],\n", 359 | " mae=me.mean_absolute_error(y_model, y),\n", 360 | " mse=me.mean_squared_error(y_model, y),\n", 361 | " rmse=np.sqrt(me.mean_squared_error(y_model, y)),\n", 362 | " r2=me.r2_score(y_model, y)*100,\n", 363 | " bold1=make_bold(bold[i][0]),\n", 364 | " bold2=make_bold(bold[i][1]),\n", 365 | " bold3=make_bold(bold[i][2]),\n", 366 | " bold4=make_bold(bold[i][3]),))" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 
| "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.9.1" 394 | }, 395 | "toc": { 396 | "nav_menu": {}, 397 | "number_sections": false, 398 | "sideBar": true, 399 | "skip_h1_title": false, 400 | "toc_cell": false, 401 | "toc_position": {}, 402 | "toc_section_display": "block", 403 | "toc_window_display": false 404 | }, 405 | "varInspector": { 406 | "cols": { 407 | "lenName": 16, 408 | "lenType": 16, 409 | "lenVar": 40 410 | }, 411 | "kernels_config": { 412 | "python": { 413 | "delete_cmd_postfix": "", 414 | "delete_cmd_prefix": "del ", 415 | "library": "var_list.py", 416 | "varRefreshCmd": "print(var_dic_list())" 417 | }, 418 | "r": { 419 | "delete_cmd_postfix": ") ", 420 | "delete_cmd_prefix": "rm(", 421 | "library": "var_list.r", 422 | "varRefreshCmd": "cat(var_dic_list()) " 423 | } 424 | }, 425 | "types_to_exclude": [ 426 | "module", 427 | "function", 428 | "builtin_function_or_method", 429 | "instance", 430 | "_Feature" 431 | ], 432 | "window_display": false 433 | } 434 | }, 435 | "nbformat": 4, 436 | "nbformat_minor": 2 437 | } 438 | -------------------------------------------------------------------------------- /notebooks/1_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | "Section of the book chapter: 3. Regression on Data Level\n", 9 | "
\n", 10 | "\n", 11 | "# 1. Data Level\n", 12 | "\n", 13 | "\n", 14 | "**Table of Contents**\n", 15 | "\n", 16 | "* [1.1 Data handling](#1.1-Data-handling)\n", 17 | "* [1.2 Dataset shift](#1.2-Dataset-shift)\n", 18 | "* [1.3 Dataset splitting](#1.3-Dataset-splitting)\n", 19 | "\n", 20 | "\n", 21 | "**Learnings:**\n", 22 | "\n", 23 | "- how to read in, validate and scale data,\n", 24 | "- how datashift looks like in a real world example,\n", 25 | "- how to split datasets with random, systematic, patch and stratified split.\n", 26 | "\n", 27 | "\n", 28 | "### Packages" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 1, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "%matplotlib inline\n", 38 | "%config InlineBackend.figure_format = 'retina'\n", 39 | "\n", 40 | "# ignore warnings\n", 41 | "import warnings\n", 42 | "warnings.filterwarnings('ignore')\n", 43 | "\n", 44 | "import numpy as np\n", 45 | "import pandas as pd\n", 46 | "import seaborn as sns\n", 47 | "import matplotlib.pyplot as plt\n", 48 | "import matplotlib as mpl\n", 49 | "import itertools\n", 50 | "from patchify import patchify\n", 51 | "from sklearn.ensemble import RandomForestRegressor\n", 52 | "\n", 53 | "import utils" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "### Read in Data\n", 61 | "\n", 62 | "**Dataset:** Felix M. Riese and Sina Keller, \"Hyperspectral benchmark dataset on soil moisture\", Dataset, Zenodo, 2018. [DOI:10.5281/zenodo.1227836](http://doi.org/10.5281/zenodo.1227836) and [GitHub](https://github.com/felixriese/hyperspectral-soilmoisture-dataset)\n", 63 | "\n", 64 | "**Introducing paper:** Felix M. Riese and Sina Keller, “Introducing a Framework of Self-Organizing Maps for Regression of Soil Moisture with Hyperspectral Data,” in IGARSS 2018 - 2018 IEEE International Geoscience and Remote Sensing Symposium, Valencia, Spain, 2018, pp. 6151-6154. 
[DOI:10.1109/IGARSS.2018.8517812](https://doi.org/10.1109/IGARSS.2018.8517812)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "X, y = utils.get_xy()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### Plot Configurations" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "norm = mpl.colors.Normalize(vmin=np.min(y), vmax=np.max(y))\n", 90 | "cmap = \"cividis_r\"\n", 91 | "\n", 92 | "\n", 93 | "myblue = \"#4664ab\"\n", 94 | "myblue30 = \"#c7d0e6\"\n", 95 | "myred = \"#9b1724\"" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Fix Random State" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "np.random.seed(42)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "***\n", 119 | "\n", 120 | "## 1.1 Data handling\n", 121 | "\n", 122 | "Steps:\n", 123 | "\n", 124 | "- [1.1.1 Collect the data](#1.1.1-Collect-the-data)\n", 125 | "- [1.1.2 Validate the data](#1.1.2-Validate-the-data)\n", 126 | "- [1.1.3 Prepare the data](#1.1.3-Prepare-the-data)\n", 127 | "\n", 128 | "### 1.1.1 Collect the data\n", 129 | "\n", 130 | "Columns:\n", 131 | "\n", 132 | "* `0` - `124`: 125 hyperspectral bands\n", 133 | "* `soil_moisture`: target variable of soil moisture in percent" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df = pd.DataFrame(X) \n", 143 | "df[\"soil_moisture\"] = y" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "### 1.1.2 Validate the data" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# dataset statistics\n", 160 | "df.describe()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# target variable distribution\n", 170 | "df[\"soil_moisture\"].hist()\n", 171 | "plt.show()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# correlations of different features and the target variable in a heatmap\n", 181 | "sns.heatmap(df[[0, 1, 2, 3, 4, \"soil_moisture\"]].corr(), vmin=-1., vmax=1.)\n", 182 | "plt.show()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "### 1.1.3 Prepare the data\n", 190 | "\n", 191 | "We use the [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) to scale our data. The resulting `X_scaled` is the scaled input data `X`." 
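,
"\n",
"When a train/test split is used, a common pattern is to fit the scaler on the training data only and to reuse the fitted scaler for the test data, so that no test-set statistics leak into the preprocessing. A minimal sketch with the same scikit-learn API, assuming the `X_train`/`X_test` split from Section 1.3 below:\n",
"\n",
"```python\n",
"scaler = StandardScaler()\n",
"X_train_scaled = scaler.fit_transform(X_train)  # fit on the training data only\n",
"X_test_scaled = scaler.transform(X_test)        # apply the same scaling to the test data\n",
"```"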
192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "from sklearn.preprocessing import StandardScaler\n", 201 | "\n", 202 | "scaler = StandardScaler()\n", 203 | "X_scaled = scaler.fit_transform(X)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "***\n", 211 | "\n", 212 | "## 1.2 Dataset shift\n", 213 | "\n", 214 | "Content:\n", 215 | "\n", 216 | "- [1.2.1 Generate shifted dataset](#1.2.1-Generate-shifted-dataset)\n", 217 | "- [1.2.2 Simple regression](#1.2.2-Simple-regression)\n", 218 | "\n", 219 | "### 1.2.1 Generate shifted dataset" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# get shifted data\n", 229 | "X_train, X_test, y_test, y_train = utils.get_xy_shifted()\n", 230 | "print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# plot shifted data\n", 240 | "fontsize = 15\n", 241 | "\n", 242 | "fig, ax = plt.subplots(1,1, figsize=(7,4))\n", 243 | "\n", 244 | "bins = np.arange(25., 42.5, 2.5)\n", 245 | "plt.hist(y_train, bins=bins, label=\"Training\", alpha=1.0, color=myblue)\n", 246 | "plt.hist(y_test, bins=bins, label=\"Unknown\", alpha=1.0, color=myblue30)\n", 247 | "\n", 248 | "lfact = 0.9\n", 249 | "leg = plt.legend(title=\"Datasets:\", fontsize=fontsize*lfact, frameon=False)\n", 250 | "plt.setp(leg.get_title(), fontsize=fontsize*lfact)\n", 251 | "\n", 252 | "plt.xlabel(\"Soil moisture in %\", fontsize=fontsize, labelpad=10)\n", 253 | "plt.ylabel(\"Number of datapoints\", fontsize=fontsize, labelpad=10)\n", 254 | "plt.xlim(24, 41)\n", 255 | "plt.ylim(0, 225)\n", 256 | "\n", 257 | "for tick in ax.xaxis.get_major_ticks()[1::2]:\n", 258 | " tick.label.set_visible(False)\n", 259 | "for tick in ax.xaxis.get_major_ticks():\n", 260 | " tick.label.set_fontsize(fontsize) \n", 261 | "for tick in ax.yaxis.get_major_ticks()[1::2]:\n", 262 | " tick.label.set_visible(False)\n", 263 | "for tick in ax.yaxis.get_major_ticks():\n", 264 | " tick.label.set_fontsize(fontsize) \n", 265 | " \n", 266 | "plt.savefig(\"plots/datasetshift_distributions.pdf\", bbox_inches=\"tight\")" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "### 1.2.2 Simple regression" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "model = RandomForestRegressor(n_estimators=100, n_jobs=-1)\n", 283 | "model.fit(X_train, y_train)\n", 284 | "score = model.score(X_test, y_test)\n", 285 | "print(\"R2 = {0:.2f} %\".format(score*100))" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "***\n", 293 | "\n", 294 | "## 1.3 Dataset splitting\n", 295 | "\n", 296 | "Content:\n", 297 | "\n", 298 | "- [1.3.1 Random Split](#1.3.1-Random-Split)\n", 299 | "- [1.3.2 Split plot](#1.3.2-Split-plot)\n", 300 | "\n", 301 | "### 1.3.1 Random Split" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "from sklearn.model_selection import train_test_split\n", 311 | "\n", 312 | "X_train, X_test, y_train, y_test = train_test_split(\n", 313 | " X, y, 
test_size=0.5, random_state=42, shuffle=True)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "### 1.3.2 Split plot" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(8,8))\n", 330 | "axes = [ax1, ax2, ax3, ax4]\n", 331 | "\n", 332 | "fontsize = 18\n", 333 | "titleweight = \"bold\"\n", 334 | "titlepad = 10\n", 335 | "msize = 100\n", 336 | "il = 12 # image length\n", 337 | "\n", 338 | "# random split\n", 339 | "data_rand = np.random.randint(low=1, high=il, size=(20,20))\n", 340 | "ax1.scatter(data_rand[0], data_rand[1], marker=\"o\", color=myblue, s=msize)\n", 341 | "ax1.set_title(\"(a) Random split\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 342 | "\n", 343 | "# systematic split\n", 344 | "k = 2\n", 345 | "data_sys = [(x, y) for (x, y) in itertools.product(range(il), range(il)) if ((x % k == 0) and (y % k == 0))]\n", 346 | "ax2.scatter([x[0] for x in data_sys], [x[1] for x in data_sys], marker=\"o\", color=myblue, s=msize)\n", 347 | "ax2.set_title(\"(b) Systematic split\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 348 | "\n", 349 | "# patch split\n", 350 | "data_pat = np.zeros((il, il), dtype=tuple)\n", 351 | "for i in range(il):\n", 352 | " for j in range(il):\n", 353 | " data_pat[i, j] = (i,j)\n", 354 | "patches = patchify(data_pat, (4, 4), step=4).reshape(3*3, 4*4)\n", 355 | "split_mask = np.random.rand(il) < 0.5\n", 356 | "patches_train = []\n", 357 | "patches_test = []\n", 358 | "for p in range(patches.shape[0]):\n", 359 | " if split_mask[p]:\n", 360 | " patches_train.append(patches[p])\n", 361 | " else:\n", 362 | " patches_test.append(patches[p])\n", 363 | "ax3.scatter([x[0] for p in patches_train for x in p], [x[1] for p in patches_train for x in p],\n", 364 | " marker=\"o\", color=myblue, edgecolor=myblue, s=msize)\n", 365 | "ax3.set_title(\"(c) Patch split\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 366 | "\n", 367 | "# stratified split\n", 368 | "data_strat_1 = np.random.randint(low=0, high=2, size=(10,10))\n", 369 | "data_strat_2 = np.random.randint(low=5, high=9, size=(10,10))\n", 370 | "data_strat_3 = [(np.random.randint(low=2, high=3), np.random.randint(low=5, high=9)) for _ in range(20)]\n", 371 | "data_strat_4 = [(np.random.randint(low=7, high=10), np.random.randint(low=1, high=3)) for _ in range(20)]\n", 372 | "ax4.scatter([x[0] for x in data_strat_1], [x[1] for x in data_strat_1],\n", 373 | " marker=\"o\", color=\"white\", edgecolor=\"black\", s=msize)\n", 374 | "l_train = ax4.scatter([x[0] for x in data_strat_2], [x[1] for x in data_strat_2],\n", 375 | " marker=\"o\", color=\"white\", edgecolor=\"black\", s=msize)\n", 376 | "ax4.scatter([x[0] for x in data_strat_3], [x[1] for x in data_strat_3], marker=\"o\", color=myblue, s=msize)\n", 377 | "l_test = ax4.scatter([x[0] for x in data_strat_4], [x[1] for x in data_strat_4],\n", 378 | " marker=\"o\", color=myblue, s=msize)\n", 379 | "l_not = ax4.scatter([], [], marker=\"s\", color=\"lightgrey\", s=msize) #, edgecolor=\"grey\")\n", 380 | "ax4.set_facecolor(\"lightgrey\")\n", 381 | "ax4.set_title(\"(d) Stratified split\", fontsize=fontsize, fontweight=titleweight, pad=titlepad)\n", 382 | "\n", 383 | "for i, ax in enumerate(axes):\n", 384 | " ax.set_xlim(0,12)\n", 385 | " ax.set_ylim(0,12)\n", 386 | " # ax.set_xlim(-1,101)\n", 387 | " # 
ax.set_ylim(-1,101)\n", 388 | " ax.set_xlabel(\"x coordinate in a.u.\", fontsize=fontsize)\n", 389 | " ax.set_ylabel(\"y coordinate in a.u.\", fontsize=fontsize)\n", 390 | " \n", 391 | " for tick in ax.xaxis.get_major_ticks()[1::2]:\n", 392 | " tick.label.set_visible(False)\n", 393 | " for tick in ax.xaxis.get_major_ticks():\n", 394 | " tick.label.set_fontsize(fontsize) \n", 395 | " for tick in ax.yaxis.get_major_ticks()[1::2]:\n", 396 | " tick.label.set_visible(False)\n", 397 | " for tick in ax.yaxis.get_major_ticks():\n", 398 | " tick.label.set_fontsize(fontsize) \n", 399 | " \n", 400 | " if i != 0 and i != 2:\n", 401 | " ax.yaxis.set_visible(False)\n", 402 | " \n", 403 | " if i != 2 and i != 3:\n", 404 | " ax.xaxis.set_visible(False)\n", 405 | "\n", 406 | " \n", 407 | "leg = fig.legend(\n", 408 | " (l_train, l_test, l_not), (\"Training\", \"Test\", \"Not used\"),\n", 409 | " bbox_to_anchor=(1.25, 0.65), title=\"Subsets:\", ncol=1, fontsize=fontsize, frameon=False)\n", 410 | "frame = leg.get_frame()\n", 411 | "plt.setp(leg.get_title(), fontsize=fontsize)\n", 412 | "\n", 413 | "plt.tight_layout()\n", 414 | "plt.savefig(\"plots/split_approaches.pdf\", bbox_inches=\"tight\")" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [] 423 | } 424 | ], 425 | "metadata": { 426 | "kernelspec": { 427 | "display_name": "Python 3", 428 | "language": "python", 429 | "name": "python3" 430 | }, 431 | "language_info": { 432 | "codemirror_mode": { 433 | "name": "ipython", 434 | "version": 3 435 | }, 436 | "file_extension": ".py", 437 | "mimetype": "text/x-python", 438 | "name": "python", 439 | "nbconvert_exporter": "python", 440 | "pygments_lexer": "ipython3", 441 | "version": "3.7.5" 442 | }, 443 | "toc": { 444 | "nav_menu": { 445 | "height": "208px", 446 | "width": "227px" 447 | }, 448 | "number_sections": false, 449 | "sideBar": true, 450 | "skip_h1_title": false, 451 | "toc_cell": false, 452 | "toc_position": {}, 453 | "toc_section_display": "block", 454 | "toc_window_display": false 455 | }, 456 | "varInspector": { 457 | "cols": { 458 | "lenName": 16, 459 | "lenType": 16, 460 | "lenVar": 40 461 | }, 462 | "kernels_config": { 463 | "python": { 464 | "delete_cmd_postfix": "", 465 | "delete_cmd_prefix": "del ", 466 | "library": "var_list.py", 467 | "varRefreshCmd": "print(var_dic_list())" 468 | }, 469 | "r": { 470 | "delete_cmd_postfix": ") ", 471 | "delete_cmd_prefix": "rm(", 472 | "library": "var_list.r", 473 | "varRefreshCmd": "cat(var_dic_list()) " 474 | } 475 | }, 476 | "types_to_exclude": [ 477 | "module", 478 | "function", 479 | "builtin_function_or_method", 480 | "instance", 481 | "_Feature" 482 | ], 483 | "window_display": false 484 | } 485 | }, 486 | "nbformat": 4, 487 | "nbformat_minor": 2 488 | } 489 | -------------------------------------------------------------------------------- /notebooks/3_Supervised_Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | "Section of the book chapter: 5.1 Supervised Learning Models\n", 9 | "
\n", 10 | "\n", 11 | "# 3. Supervised learning\n", 12 | "\n", 13 | "**Table of Contents**\n", 14 | "\n", 15 | "* [3.1 Linear regression and partial least squares](#3.1-Linear-regression-and-partial-least-squares)\n", 16 | "* [3.2 Tree-based Models](#3.2-Tree-based-Models)\n", 17 | "* [3.3 Support Vector Machines](#3.3-Support-Vector-Machines)\n", 18 | "* [3.4 k-Nearest Neighbors](#3.4-k-Nearest-Neighbors)\n", 19 | "* [3.5 Artificial Neural Networks (ANN)](#3.5-Artificial-Neural-Networks,-ANN)\n", 20 | "* [3.6 SUSI: Supervised Self-organizing Maps in Python](#3.6-SUSI:-Supervised-Self-organizing-Maps-in-Python)\n", 21 | "* [3.7 Overall results](#3.7-Overall-results)\n", 22 | "\n", 23 | "**Learnings:**\n", 24 | "\n", 25 | "- how to implement different supervised machine learning models,\n", 26 | "- how to plot regression results.\n", 27 | "\n", 28 | "\n", 29 | "\n", 30 | "### Packages" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "%matplotlib inline\n", 40 | "%config InlineBackend.figure_format = 'retina'\n", 41 | "import os\n", 42 | "\n", 43 | "import numpy as np\n", 44 | "import pandas as pd\n", 45 | "import seaborn as sns\n", 46 | "import matplotlib.pyplot as plt\n", 47 | "import matplotlib as mpl\n", 48 | "import sklearn.metrics as met\n", 49 | "import datetime\n", 50 | "\n", 51 | "import utils\n", 52 | "\n", 53 | "# ignore warnings\n", 54 | "import warnings\n", 55 | "warnings.filterwarnings('ignore')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Read in Data\n", 63 | "\n", 64 | "**Dataset:** Felix M. Riese and Sina Keller, \"Hyperspectral benchmark dataset on soil moisture\", Dataset, Zenodo, 2018. [DOI:10.5281/zenodo.1227836](http://doi.org/10.5281/zenodo.1227836) and [GitHub](https://github.com/felixriese/hyperspectral-soilmoisture-dataset)\n", 65 | "\n", 66 | "**Introducing paper:** Felix M. Riese and Sina Keller, “Introducing a Framework of Self-Organizing Maps for Regression of Soil Moisture with Hyperspectral Data,” in IGARSS 2018 - 2018 IEEE International Geoscience and Remote Sensing Symposium, Valencia, Spain, 2018, pp. 6151-6154. 
[DOI:10.1109/IGARSS.2018.8517812](https://doi.org/10.1109/IGARSS.2018.8517812)"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "X_train, X_test, y_train, y_test = utils.get_xy_split()\n",
76 | "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "### Plot Configurations"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "norm = mpl.colors.Normalize(vmin=np.min([np.min(y_train), np.min(y_test)]),\n",
93 | "                            vmax=np.max([np.max(y_train), np.max(y_test)]))\n",
94 | "cmap = \"cividis_r\""
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "### Results Dataframe"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "results = pd.DataFrame(columns=[\"model\", \"r2\", \"mae\", \"rmse\", \"potential\"])"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "### Metrics\n",
118 | "\n",
119 | "The following functions calculate and print these performance metrics:\n",
120 | "\n",
121 | "* Coefficient of Determination $R^2$\n",
122 | "* Mean Absolute Error (MAE)\n",
123 | "* Root Mean Squared Error (RMSE)"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "def get_regression_metrics(y_pred):\n",
133 | "    global y_test\n",
134 | "    return (\n",
135 | "        met.r2_score(y_test, y_pred),\n",
136 | "        met.mean_absolute_error(y_test, y_pred),\n",
137 | "        np.sqrt(met.mean_squared_error(y_test, y_pred)))\n",
138 | "\n",
139 | "def print_regression_metrics(y_pred, model_name, potential):\n",
140 | "    global results\n",
141 | "    \n",
142 | "    # get and print metrics\n",
143 | "    r2, mae, rmse = get_regression_metrics(y_pred)\n",
144 | "    print(\"R2 = {0:.1f}% \\nMAE = {1:.2f} \\nRMSE = {2:.2f}\".format(\n",
145 | "        r2*100, mae, rmse))\n",
146 | "    \n",
147 | "    # save metrics to dataframe\n",
148 | "    if not ((results[\"model\"]==model_name).any()):\n",
149 | "        rdict = {\n",
150 | "            \"model\": model_name,\n",
151 | "            \"r2\": r2,\n",
152 | "            \"mae\": mae,\n",
153 | "            \"rmse\": rmse,\n",
154 | "            \"potential\": potential}\n",
155 | "        results = pd.concat([results, pd.DataFrame(rdict, index=[0])], ignore_index=True)\n",
156 | "    \n",
157 | "    else:\n",
158 | "        idx = results.index[results['model'] == model_name].tolist()[0]\n",
159 | "        results.at[idx, \"r2\"] = r2\n",
160 | "        results.at[idx, \"mae\"] = mae\n",
161 | "        results.at[idx, \"rmse\"] = rmse\n",
162 | "        results.at[idx, \"potential\"] = potential"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "### Fix Random State"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # disable warning\n",
179 | "import tensorflow as tf\n",
180 | "\n",
181 | "np.random.seed(42)\n",
182 | "tf.random.set_seed(43)"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "***\n",
190 | "\n",
191 | "## 3.1 Linear regression and partial least squares\n",
192 | "\n",
193 | 
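"Both approaches are linear in the input bands: linear regression fits one coefficient per hyperspectral band directly, while partial least squares (PLS) first projects the bands onto a small number of latent components and regresses on those, which tends to be more stable for strongly correlated bands.\n",
"\n",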
"Content:\n", 194 | "\n", 195 | "- [3.1.1 Linear regression](#3.1.1-Linear-regression)\n", 196 | "- [3.1.2 Partial least squares](#3.1.2-Partial-least-squares)\n", 197 | "\n", 198 | "### 3.1.1 Linear regression\n", 199 | "Implementation: [sklearn.linear_model.LinearRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "from sklearn.linear_model import LinearRegression\n", 209 | "\n", 210 | "model_lin = LinearRegression()\n", 211 | "model_lin.fit(X_train, y_train)\n", 212 | "y_pred_lin = model_lin.predict(X_test)\n", 213 | "\n", 214 | "print_regression_metrics(y_pred_lin, \"Linear\", \"-\")\n", 215 | "utils.plot_regression_results(y_test, y_pred_lin, \"Linear\")" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "### 3.1.2 Partial least squares\n", 223 | "Implementation: [sklearn.cross_decomposition.PLSRegression](https://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.PLSRegression.html)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "from sklearn.cross_decomposition import PLSRegression\n", 233 | "\n", 234 | "model_pls = PLSRegression(n_components=5)\n", 235 | "model_pls.fit(X_train, y_train)\n", 236 | "y_pred_pls = model_pls.predict(X_test)\n", 237 | "\n", 238 | "print_regression_metrics(y_pred_pls, \"PLS\", \"Minor\")\n", 239 | "utils.plot_regression_results(y_test, y_pred_pls, \"PLS\")" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "***\n", 247 | "\n", 248 | "## 3.2 Tree-based Models\n", 249 | "\n", 250 | "Content:\n", 251 | "\n", 252 | "- [3.2.1 Decision Tree](#3.2.1-Decision-Tree)\n", 253 | "- [3.2.2 Bagging: Random Forest & Extremly Randomized Trees](#3.2.2-Bagging:-Random-Forest-&-Extremly-Randomized-Trees)\n", 254 | "- [3.2.3 Boosting: Gradient Boosting](#3.2.3-Boosting:-Gradient-Boosting)\n", 255 | "\n", 256 | "### 3.2.1 Decision Tree\n", 257 | "\n", 258 | "**Source:** Breiman, L., Friedman, J., Olshen, R.A., Stone, C.J.: Classification and regression trees. Chapman and Hall/CRC (1984)\n", 259 | "\n", 260 | "**Algorithm:**\n", 261 | "\n", 262 | "The regression trees algorithm is defined as follows:\n", 263 | "1. Start with the root node.\n", 264 | "2. Start with the most significant feature of the training data.\n", 265 | "3. Divide the input data with (binary) a cut $c_1$ on feature $x_i$, e.g. according to the Gini index, see below.\n", 266 | "4. Divide data along the next best feature on cut $c_j$ for $j=2, 3, \\ldots$\n", 267 | "5. Stop if a condition is met, e.g. maximum number of nodes, maximum depth, maximum purity etc.\n", 268 | "6. 
Every leaf is then averaged and therefore contains one output value.\n", 269 | "\n", 270 | "The Gini index is defined as:\n", 271 | "\n", 272 | "$G = 1 - \\sum_{i=1}^n P_i^2 \\qquad \\text{with } P_i = \\frac{N_i}{N},\\label{eq:gini}$\n", 273 | "\n", 274 | "with $N$ as number of all objects and $N_i$ as number of objects of class $i$.\n", 275 | "\n", 276 | "**Implementation:** [sklearn.tree.DecisionTreeRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "from sklearn.tree import DecisionTreeRegressor\n", 286 | "\n", 287 | "model_dt = DecisionTreeRegressor()\n", 288 | "model_dt.fit(X_train, y_train)\n", 289 | "y_pred_dt = model_dt.predict(X_test)\n", 290 | "\n", 291 | "print_regression_metrics(y_pred_dt, \"Decision Tree\", \"Minor\")\n", 292 | "utils.plot_regression_results(y_test, y_pred_dt, \"Decision Tree\")" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "### 3.2.2 Bagging: Random Forest & Extremly Randomized Trees\n", 300 | "#### Random Forest\n", 301 | "Implementation: [sklearn.ensemble.RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor) " 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "from sklearn.ensemble import RandomForestRegressor\n", 311 | "\n", 312 | "model_rf = RandomForestRegressor(n_estimators=100, oob_score=True)\n", 313 | "model_rf.fit(X_train, y_train)\n", 314 | "y_pred_rf = model_rf.predict(X_test)\n", 315 | "\n", 316 | "print_regression_metrics(y_pred_rf, \"RF\", \"Minor\")\n", 317 | "utils.plot_regression_results(y_test, y_pred_rf, \"RF\")\n", 318 | "\n", 319 | "print(\"Out-of-bag estimate = {0:.1f}%\".format(model_rf.oob_score_*100))" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "#### Extremly Randomized Trees\n", 327 | "Implementation: [sklearn.ensemble.ExtraTreesRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html#sklearn.ensemble.ExtraTreesRegressor)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "from sklearn.ensemble import ExtraTreesRegressor\n", 337 | "\n", 338 | "model_et = ExtraTreesRegressor(n_estimators=100)\n", 339 | "model_et.fit(X_train, y_train)\n", 340 | "y_pred_et = model_et.predict(X_test)\n", 341 | "\n", 342 | "print_regression_metrics(y_pred_et, \"ET\", \"Minor\")\n", 343 | "utils.plot_regression_results(y_test, y_pred_et, \"ET\")" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "#### Feature Importance" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "n_features_plotted = 15\n", 360 | "\n", 361 | "importances = model_rf.feature_importances_\n", 362 | "indices = np.argsort(importances)\n", 363 | "std = np.std([tree.feature_importances_ for tree in model_rf.estimators_], axis=0)\n", 364 | "plt.figure(figsize=(15,5))\n", 365 | "plt.title(\"Feature importances\")\n", 366 | 
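"# note: indices sorts the bands by importance in ascending order, so the\n",
"# slice [125-n_features_plotted:] below selects the most important bands\n",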
"plt.bar(range(X_train.shape[1])[125-n_features_plotted:], importances[indices][125-n_features_plotted:], color=\"r\", yerr=std[indices][125-n_features_plotted:], align=\"center\")\n", 367 | "# If you want to define your own labels,\n", 368 | "# change indices to a list of labels on the following line.\n", 369 | "plt.xticks(range(X_train.shape[1])[125-n_features_plotted:], indices[:n_features_plotted], rotation=90)\n", 370 | "plt.xlim([-1 + 125-n_features_plotted, X_train.shape[1]])\n", 371 | "plt.xlabel(\"Hyperspectral band\")\n", 372 | "plt.ylabel(\"Feature importance\")\n", 373 | "plt.savefig(\"plots/featureimportance_rf.pdf\", bbox_inches=\"tight\")" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "### 3.2.3 Boosting: Gradient Boosting\n", 381 | "Implementation: [sklearn.ensemble.GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#sklearn.ensemble.GradientBoostingRegressor)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "from sklearn.ensemble import GradientBoostingRegressor\n", 391 | "\n", 392 | "model_gb = GradientBoostingRegressor()\n", 393 | "model_gb.fit(X_train, y_train)\n", 394 | "y_pred_gb = model_gb.predict(X_test)\n", 395 | "\n", 396 | "print_regression_metrics(y_pred_gb, \"GB\", \"Minor\")\n", 397 | "utils.plot_regression_results(y_test, y_pred_gb, \"GB\")" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "***\n", 405 | "\n", 406 | "## 3.3 Support Vector Machines\n", 407 | "Implementation: [sklearn.svm.SVR](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html)\n", 408 | "\n", 409 | "The SVM is tuned with a Grid Search, see [sklearn.model_selection.RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "from sklearn.svm import SVR\n", 419 | "from sklearn.model_selection import RandomizedSearchCV\n", 420 | "\n", 421 | "# 1. find hyperparameters\n", 422 | "params = {\"C\": np.logspace(-8, 8, 17), \"gamma\": np.logspace(-8, 8, 17)}\n", 423 | "rsearch = RandomizedSearchCV(\n", 424 | " estimator=SVR(),\n", 425 | " n_iter=30,\n", 426 | " cv=5,\n", 427 | " n_jobs=-1,\n", 428 | " param_distributions=params)\n", 429 | "rsearch.fit(X_train, y_train)\n", 430 | "model_svm = rsearch.best_estimator_\n", 431 | "\n", 432 | "# 2. 
predict\n", 433 | "model_svm.fit(X_train, y_train)\n", 434 | "y_pred_svm = model_svm.predict(X_test)\n", 435 | "\n", 436 | "print_regression_metrics(y_pred_svm, \"SVM\", \"Minor\")\n", 437 | "utils.plot_regression_results(y_test, y_pred_svm, \"SVM\")" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "***\n", 445 | "\n", 446 | "## 3.4 k-Nearest Neighbors\n", 447 | "\n", 448 | "Types:\n", 449 | "\n", 450 | "- [3.4.1 Without weighting](#3.4.1-Without-weighting)\n", 451 | "- [3.4.2 With distance weighting](#3.4.2-With-distance-weighting)\n", 452 | "\n", 453 | "Implementation: [sklearn.neighbors.KNeighborsRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html#sklearn.neighbors.KNeighborsRegressor)\n", 454 | "\n", 455 | "### 3.4.1 Without weighting" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "from sklearn.neighbors import KNeighborsRegressor\n", 465 | "\n", 466 | "model_knn = KNeighborsRegressor(n_neighbors=5)\n", 467 | "model_knn.fit(X_train, y_train)\n", 468 | "y_pred_knn = model_knn.predict(X_test)\n", 469 | "\n", 470 | "print_regression_metrics(y_pred_knn, \"k-NN\", \"Minor\")\n", 471 | "utils.plot_regression_results(y_test, y_pred_knn, \"kNN\")" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "### 3.4.2 With distance weighting" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "from sklearn.neighbors import KNeighborsRegressor\n", 488 | "\n", 489 | "model_knnw = KNeighborsRegressor(n_neighbors=5, weights=\"distance\")\n", 490 | "model_knnw.fit(X_train, y_train)\n", 491 | "y_pred_knnw = model_knnw.predict(X_test)\n", 492 | "\n", 493 | "print_regression_metrics(y_pred_knnw, \"k-NN (weighted)\", \"Minor\")\n", 494 | "utils.plot_regression_results(y_test, y_pred_knnw, \"kNN weighted\")" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "***\n", 502 | "\n", 503 | "## 3.5 Artificial Neural Networks, ANN\n", 504 | "\n", 505 | "Types:\n", 506 | "\n", 507 | "- [3.5.1 Fully-connected ANNs](#3.5.1-Fully-connected-ANNs)\n", 508 | "- [3.5.2 CNN with Keras and TensorFlow](#3.5.2-CNN-with-Keras-and-TensorFlow)\n", 509 | "\n", 510 | "### 3.5.1 Fully-connected ANNs\n", 511 | "#### scikit-learn\n", 512 | "Implementation: [sklearn.neural_network.MLPRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "from sklearn.neural_network import MLPRegressor\n", 522 | "\n", 523 | "model_ann = MLPRegressor(hidden_layer_sizes=(20, 20, 20), batch_size=10, max_iter=500)\n", 524 | "model_ann.fit(X_train, y_train)\n", 525 | "y_pred_ann = model_ann.predict(X_test)\n", 526 | "\n", 527 | "print_regression_metrics(y_pred_ann, \"ANN (sklearn)\", \"Major\")\n", 528 | "utils.plot_regression_results(y_test, y_pred_ann, \"ANN (sklearn)\")" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "#### Keras with TensorFlow" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "from 
497 | {
498 | "cell_type": "markdown",
499 | "metadata": {},
500 | "source": [
501 | "***\n",
502 | "\n",
503 | "## 3.5 Artificial Neural Networks, ANN\n",
504 | "\n",
505 | "Types:\n",
506 | "\n",
507 | "- [3.5.1 Fully-connected ANNs](#3.5.1-Fully-connected-ANNs)\n",
508 | "- [3.5.2 CNN with Keras and TensorFlow](#3.5.2-CNN-with-Keras-and-TensorFlow)\n",
509 | "\n",
510 | "### 3.5.1 Fully-connected ANNs\n",
511 | "#### scikit-learn\n",
512 | "Implementation: [sklearn.neural_network.MLPRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": null,
518 | "metadata": {},
519 | "outputs": [],
520 | "source": [
521 | "from sklearn.neural_network import MLPRegressor\n",
522 | "\n",
523 | "model_ann = MLPRegressor(hidden_layer_sizes=(20, 20, 20), batch_size=10, max_iter=500)\n",
524 | "model_ann.fit(X_train, y_train)\n",
525 | "y_pred_ann = model_ann.predict(X_test)\n",
526 | "\n",
527 | "print_regression_metrics(y_pred_ann, \"ANN (sklearn)\", \"Major\")\n",
528 | "utils.plot_regression_results(y_test, y_pred_ann, \"ANN (sklearn)\")"
529 | ]
530 | },
531 | {
532 | "cell_type": "markdown",
533 | "metadata": {},
534 | "source": [
535 | "#### Keras with TensorFlow"
536 | ]
537 | },
538 | {
539 | "cell_type": "code",
540 | "execution_count": null,
541 | "metadata": {},
542 | "outputs": [],
543 | "source": [
544 | "from tensorflow import keras\n",
545 | "from tensorflow.keras.models import Sequential\n",
546 | "from tensorflow.keras.layers import Dense\n",
547 | "\n",
548 | "keras.backend.clear_session()\n",
549 | "\n",
550 | "# define model\n",
551 | "model = Sequential()\n",
552 | "model.add(Dense(20, input_dim=X_train.shape[1], activation=\"relu\"))\n",
553 | "model.add(Dense(10, activation=\"relu\"))\n",
554 | "model.add(Dense(1, activation=\"linear\"))\n",
555 | "\n",
556 | "# compile and train model (the test set is only used to monitor the validation loss)\n",
557 | "model.compile(loss=\"mean_squared_error\", optimizer=\"nadam\")\n",
558 | "model.fit(X_train, y_train, epochs=1000, verbose=0, batch_size=10,\n",
559 | "          validation_data=(X_test, y_test))\n",
560 | "y_pred_annk = model.predict(X_test)\n",
561 | "\n",
562 | "print_regression_metrics(y_pred_annk, \"ANN (keras)\", \"Major\")\n",
563 | "utils.plot_regression_results(y_test, y_pred_annk, \"ANN (keras)\")"
564 | ]
565 | },
566 | {
567 | "cell_type": "markdown",
568 | "metadata": {},
569 | "source": [
570 | "### 3.5.2 CNN with Keras and TensorFlow"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": null,
576 | "metadata": {},
577 | "outputs": [],
578 | "source": [
579 | "from tensorflow import keras\n",
580 | "from tensorflow.keras.models import Sequential\n",
581 | "from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten\n",
582 | "\n",
583 | "keras.backend.clear_session()\n",
584 | "\n",
585 | "# define model\n",
586 | "model = Sequential()\n",
587 | "\n",
588 | "model.add(Conv1D(filters=8, kernel_size=3, activation=\"relu\",\n",
589 | "                 input_shape=(X_train.shape[1],1)))\n",
590 | "model.add(MaxPooling1D(pool_size=2))\n",
591 | "\n",
592 | "model.add(Conv1D(filters=16, kernel_size=3, activation=\"relu\"))\n",
593 | "model.add(MaxPooling1D(pool_size=2))\n",
594 | "\n",
595 | "model.add(Conv1D(filters=32, kernel_size=3, activation=\"relu\"))\n",
596 | "model.add(MaxPooling1D(pool_size=2))\n",
597 | "\n",
598 | "model.add(Flatten())\n",
599 | "\n",
600 | "model.add(Dense(20, activation=\"relu\"))\n",
601 | "model.add(Dense(1, activation=\"linear\"))\n",
602 | "\n",
603 | "# compile and train model; inputs are reshaped to (samples, bands, 1) for Conv1D\n",
604 | "model.compile(loss=\"mean_squared_error\", optimizer=\"nadam\")\n",
605 | "model.fit(X_train.reshape(X_train.shape[0], X_train.shape[1], 1), y_train,\n",
606 | "          epochs=500, verbose=0, batch_size=10,\n",
607 | "          validation_data=(X_test.reshape(X_test.shape[0], X_test.shape[1], 1), y_test))\n",
608 | "y_pred_cnn = model.predict(X_test.reshape(X_test.shape[0], X_test.shape[1], 1))\n",
609 | "\n",
610 | "print_regression_metrics(y_pred_cnn, \"CNN\", \"Major\")\n",
611 | "utils.plot_regression_results(y_test, y_pred_cnn, \"CNN\")"
612 | ]
613 | },
614 | {
615 | "cell_type": "markdown",
616 | "metadata": {},
617 | "source": [
618 | "***\n",
619 | "\n",
620 | "## 3.6 SUSI: Supervised Self-organizing Maps in Python\n",
621 | "Implementation: [felixriese/susi](https://github.com/felixriese/susi)"
622 | ]
623 | },
624 | {
625 | "cell_type": "code",
626 | "execution_count": null,
627 | "metadata": {},
628 | "outputs": [],
629 | "source": [
630 | "import susi\n",
631 | "\n",
632 | "model_som = susi.SOMRegressor(\n",
633 | "    n_rows=35,\n",
634 | "    n_columns=35,\n",
635 | "    n_iter_unsupervised=10000,\n",
636 | "    n_iter_supervised=10000,\n",
637 | "    n_jobs=-1)\n",
638 | "model_som.fit(X_train, y_train)\n",
639 | "y_pred_som = model_som.predict(X_test)\n",
640 | "\n",
641 | "print_regression_metrics(y_pred_som, \"SOM\", \"Minor\")\n",
642 | "utils.plot_regression_results(y_test, y_pred_som, \"SOM\")"
"utils.plot_regression_results(y_test, y_pred_som, \"SOM\")" 643 | ] 644 | }, 645 | { 646 | "cell_type": "markdown", 647 | "metadata": {}, 648 | "source": [ 649 | "***\n", 650 | "\n", 651 | "## 3.7 Overall results" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": null, 657 | "metadata": {}, 658 | "outputs": [], 659 | "source": [ 660 | "# save results to CSV\n", 661 | "dt = datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n", 662 | "results.to_csv(\"results/results_\"+dt+\".csv\")" 663 | ] 664 | }, 665 | { 666 | "cell_type": "raw", 667 | "metadata": {}, 668 | "source": [ 669 | "# load results from CSV\n", 670 | "# results = pd.read_csv(\"results/results.csv\")\n", 671 | "results = pd.read_csv(\"results/results_20190704-112011.csv\")" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "# plot horizontal bar plot for results\n", 681 | "fig, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(15,5))\n", 682 | "\n", 683 | "results.plot(x=\"model\", y=\"r2\", kind=\"barh\", ax=ax1, title=\"$R^2$\", legend=False)\n", 684 | "results.plot(x=\"model\", y=\"mae\", kind=\"barh\", ax=ax2, title=\"MAE\", legend=False)\n", 685 | "results.plot(x=\"model\", y=\"rmse\", kind=\"barh\", ax=ax3, title=\"RMSE\", legend=False)\n", 686 | "for ax in [ax1, ax2, ax3]:\n", 687 | " ax.set_ylabel(\"\")\n", 688 | "\n", 689 | "plt.tight_layout()\n", 690 | "plt.savefig(\"plots/results_bar.pdf\", bbox_inches=\"tight\")" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "# generate LaTeX table\n", 700 | "utils.write_results_to_latex_table(results)" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [] 709 | } 710 | ], 711 | "metadata": { 712 | "kernelspec": { 713 | "display_name": "Python 3 (ipykernel)", 714 | "language": "python", 715 | "name": "python3" 716 | }, 717 | "language_info": { 718 | "codemirror_mode": { 719 | "name": "ipython", 720 | "version": 3 721 | }, 722 | "file_extension": ".py", 723 | "mimetype": "text/x-python", 724 | "name": "python", 725 | "nbconvert_exporter": "python", 726 | "pygments_lexer": "ipython3", 727 | "version": "3.10.12" 728 | }, 729 | "toc": { 730 | "nav_menu": {}, 731 | "number_sections": false, 732 | "sideBar": true, 733 | "skip_h1_title": false, 734 | "toc_cell": false, 735 | "toc_position": {}, 736 | "toc_section_display": "block", 737 | "toc_window_display": false 738 | }, 739 | "varInspector": { 740 | "cols": { 741 | "lenName": 16, 742 | "lenType": 16, 743 | "lenVar": 40 744 | }, 745 | "kernels_config": { 746 | "python": { 747 | "delete_cmd_postfix": "", 748 | "delete_cmd_prefix": "del ", 749 | "library": "var_list.py", 750 | "varRefreshCmd": "print(var_dic_list())" 751 | }, 752 | "r": { 753 | "delete_cmd_postfix": ") ", 754 | "delete_cmd_prefix": "rm(", 755 | "library": "var_list.r", 756 | "varRefreshCmd": "cat(var_dic_list()) " 757 | } 758 | }, 759 | "types_to_exclude": [ 760 | "module", 761 | "function", 762 | "builtin_function_or_method", 763 | "instance", 764 | "_Feature" 765 | ], 766 | "window_display": false 767 | } 768 | }, 769 | "nbformat": 4, 770 | "nbformat_minor": 4 771 | } 772 | --------------------------------------------------------------------------------