├── cimcb_logo.png ├── notebooks ├── data │ ├── MTBLS90.xlsx │ └── ST001047.xlsx ├── results │ ├── PLSDA_MTBLS90.xlsx │ ├── PLSDA_ST001047.xlsx │ ├── ANNSigSig_MTBLS90.xlsx │ └── ANNSigSig_ST001047.xlsx ├── PLSDA_MTBLS90.ipynb ├── PLSDA_ST001047.ipynb ├── ANNSigSig_MTBLS90.ipynb └── ANNSigSig_ST001047.ipynb ├── _config.yml ├── environment.yml ├── .gitignore └── README.md /cimcb_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CIMCB/MetabProjectionViz/master/cimcb_logo.png -------------------------------------------------------------------------------- /notebooks/data/MTBLS90.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CIMCB/MetabProjectionViz/master/notebooks/data/MTBLS90.xlsx -------------------------------------------------------------------------------- /notebooks/data/ST001047.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CIMCB/MetabProjectionViz/master/notebooks/data/ST001047.xlsx -------------------------------------------------------------------------------- /notebooks/results/PLSDA_MTBLS90.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CIMCB/MetabProjectionViz/master/notebooks/results/PLSDA_MTBLS90.xlsx -------------------------------------------------------------------------------- /notebooks/results/PLSDA_ST001047.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CIMCB/MetabProjectionViz/master/notebooks/results/PLSDA_ST001047.xlsx -------------------------------------------------------------------------------- /notebooks/results/ANNSigSig_MTBLS90.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CIMCB/MetabProjectionViz/master/notebooks/results/ANNSigSig_MTBLS90.xlsx -------------------------------------------------------------------------------- /notebooks/results/ANNSigSig_ST001047.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CIMCB/MetabProjectionViz/master/notebooks/results/ANNSigSig_ST001047.xlsx -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal 2 | title: MetabProjectionViz 3 | description: "Supplementary information for Mendez et al. (2019) DOI: 10.1007/s11306-020-1640-0" 4 | show_downloads: True 5 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: MetabProjectionViz 2 | channels: 3 | - defaults 4 | - conda-forge 5 | - cimcb 6 | dependencies: 7 | - python=3.7.3 8 | - jupyter=1.0.0 9 | - notebook=6.4.7 10 | - bokeh=1.3.1 11 | - numpy=1.16.3 12 | - pandas=0.24.2 13 | - openpyxl=2.6.1 14 | - theano=1.0.4 15 | - cimcb=2.1.2 16 | - xlrd=1.2.0 17 | - keras=2.2.4 18 | - Jinja2==3.0.3 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller 
builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # README.md - `SI_Mendez_etal_2020` 4 | 5 |
6 | 7 |

This repository contains the supplementary information for the journal article,"Migrating from Partial Least Squares Discriminant Analysis to Artificial Neural Networks: A Comparison of Functionally Equivalent Feature Importance and Visualisation Tools using Jupyter Notebooks.". There are two types of workflows included in this repository: a standardised visualisation and interrogation partial least squares (PLS) regression workflow, and an equivalent artificial neural network workflow.

8 | 9 |

Two previously published datasets are used as examples of the standardised PLS workflow and the proposed equivalent ANN workflow. The first, by Chan et al. (2016) is a urine NMR dataset comprised of 149 named metabolites, publicly available on Metabolomics Workbench (Study ID: ST0001047). Two classes were used: gastric cancer (n=43) vs. healthy controls (n=40). The second, by Ganna et al. (2014) and Ganna et al. (2015) is a plasma LC-MS with 189 named metabolites, publicly available on Metabolights (Study ID: MTBLS90). Samples were split into two classes by sex: males (n=485) and females (n=483).

10 | 11 |

Due to the structural equivalence with PLS, a shallow (2-layer) ANN is used in this study. Provided the success of this approach towards visualisation and interrogation in shallow ANNs, it may then be possible to adapt this further to deeper ANN architectures. This shallow (2-layer) ANN architecture has a hidden layer consisting of multiple neurons (n = 2 to 6) with a sigmoidal activation, and an output layer consisting of a single neuron with a sigmoidal activation function.

12 | 13 |

The standardised PLS workflow and the proposed equivalent ANN workflow include the following steps: hyperparameter optimisation, building and training the model, bootstrap resampling of the model, model evaluation, and model visualisation. All steps and accompanying visualisation methods are described in detail above each corresponding code cell within the workflows. These workflows were implemented using the Python programming language, and are presented as Jupyter Notebooks. There are three ways to that these can be accessed: as a static HTML file, in the cloud (using Binder), or downloaded and run on a local machine.

14 | 15 |
16 | 17 | ### *To open notebooks as static HTML files:* 18 | - [PLSDA_ST001047.html (Method: PLS-DA; Dataset: ST001047)](https://cimcb.github.io/MetabProjectionViz/html/PLSDA_ST001047.html) 19 | - [ANNSigSig_ST001047.html (Method: ANN-SS; Dataset: ST001047)](https://cimcb.github.io/MetabProjectionViz/html/ANNSigSig_ST001047.html) 20 | - [PLSDA_MTBLS90.html (Method: PLS-DA; Dataset: MTBLS90)](https://cimcb.github.io/MetabProjectionViz/html/PLSDA_MTBLS90.html) 21 | - [ANNSigSig_MTBLS90.html (Method: ANN-SS; Dataset: MTBLS90)](https://cimcb.github.io/MetabProjectionViz/html/ANNSigSig_MTBLS90.html) 22 | 23 |
24 | 25 | ### *To launch the notebooks in the cloud (using Binder):* 26 | - [PLSDA_ST001047.ipynb (Method: PLS-DA; Dataset: ST001047)](https://mybinder.org/v2/gh/cimcb/MetabProjectionViz/master?filepath=notebooks/PLSDA_ST001047.ipynb) 27 | - [ANNSigSig_ST001047.ipynb (Method: ANN-SS; Dataset: ST001047)](https://mybinder.org/v2/gh/cimcb/MetabProjectionViz/master?filepath=notebooks/ANNSigSig_ST001047.ipynb) 28 | - [PLSDA_MTBLS90.ipynb (Method: PLS-DA; Dataset: MTBLS90)](https://mybinder.org/v2/gh/cimcb/MetabProjectionViz/master?filepath=notebooks/PLSDA_MTBLS90.ipynb) 29 | - [ANNSigSig_MTBLS90.ipynb (Method: ANN-SS; Dataset: MTBLS90)](https://mybinder.org/v2/gh/cimcb/MetabProjectionViz/master?filepath=notebooks/ANNSigSig_MTBLS90.ipynb) 30 | 31 |
32 | 33 | ### *To download and run notebooks on a local machine* 34 |

This requires Python 3.x and Jupyter to be installed on your local machine. We recommend using the Anaconda Distribution, which can be download from the Anaconda Webpage (https://www.anaconda.com/distribution/). For information on installing Python and using Jupyter Notebooks, refer to the tutorial, "Toward collaborative open data science in metabolomics using Jupyter Notebooks and cloud computing" by Mendez et al. (2019).

35 | 36 | Note: If you are using Windows, you need to install git using the following: [Git for Windows](https://gitforwindows.org/) 37 | 38 | 1. Open Terminal on Linux/MacOS or Command Prompt on Windows 39 | 2. Enter the following into the console (one line at a time) 40 | 41 | ```console 42 | git clone https://github.com/cimcb/MetabProjectionViz 43 | cd MetabProjectionViz 44 | conda env create -f environment.yml 45 | conda activate MetabProjectionViz 46 | jupyter notebook 47 | ``` 48 | 49 | 50 |
51 | -------------------------------------------------------------------------------- /notebooks/PLSDA_MTBLS90.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc-hr-collapsed": false 7 | }, 8 | "source": [ 9 | "
\n", 10 | " To begin: Click anywhere in this cell and press Run on the menu bar. This executes the current cell and then highlights the next cell. There are two types of cells. A text cell and a code cell. When you Run a text cell (we are in a text cell now), you advance to the next cell without executing any code. When you Run a code cell (identified by In[ ]: to the left of the cell) you advance to the next cell after executing all the Python code within that cell. Any visual results produced by the code (text/figures) are reported directly below that cell. Press Run again. Repeat this process until the end of the notebook. NOTE: All the cells in this notebook can be automatically executed sequentially by clicking KernelRestart and Run All. Should anything crash then restart the Jupyter Kernal by clicking KernelRestart, and start again from the top.\n", 11 | " \n", 12 | "
" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "
\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "\n", 24 | "

Metabolomics Data Visualisation Workflow for PLS-DA

\n", 25 | "\n", 26 | "
\n", 27 | "
\n", 28 | "
\n", 29 | "

This Jupyter Notebook described a metabolomics data analysis and visualisation workflow for partial least squares regression (a.k.a. projection to latent structure) with a binary classification outcome.

\n", 30 | "\n", 31 | "

This computational workflow is described using a previously published NMR dataset by Ganna et al. (2014) and Ganna et al. (2015).The study compared the plasma metabolomic profile comparison across a large prospective epidemiological study of men (n=485) and women (n=483) at age 70 living in Uppsala, Sweden. For the purpose of this computational workflow, we compare only the males (Class=1) and females (Class=0) in a binary discriminant analysis. The deconvolved and annotated data from this study is deposited on Metabolights, and can be accessed directly via its Study ID: MTBLS90. The Excel file used in this workflow can be accessed via the following link: MTBLS90.xlsx.

\n", 32 | "\n", 33 | "

This computational workflow requires a dataset to be in, or converted to, a previously described standardised Excel file format (Mendez et al. 2019). This format uses the Tidy Data Framework (Wickham, 2014), where each row represents an observation (e.g. sample) and each column represents a variable (e.g. age or metabolite). Each excel file (per study) contains two sheets; a data sheet and a peak sheet. The data sheet contains the metabolite concentration together with the metadata associated for each observation (requiring the inclusion of the columns: Idx, SampleID, and Class). The peak sheet contains the additional metadata that pertains to the metabolites in the data sheet (requiring the inclusion of the columns: Idx, Name, and Label). The standardisation of this format allows for the efficient re-use of this computational workflow.

\n", 34 | "\n", 35 | "
\n", 36 | "The steps included in this data analysis and visualisation workflow are: \n", 37 | "
\n", 38 | "\n", 39 | "1. Import Packages
\n", 40 | "2. Load Data & Peak Sheet
\n", 41 | "3. Extract X & Y
\n", 42 | "4. Split Data into Train & Test Set
\n", 43 | "5. Extract, Transform, & Scale X Data with Missing Values Imputed
\n", 44 | "6. Hyperparameter Optimisation
\n", 45 | " 6.1. Plot R² & Q²
\n", 46 | " 6.2. Plot Latent Projections: Full & CV
\n", 47 | "7. Build Model & Evaluate
\n", 48 | "8. Permutation Test
\n", 49 | "9. Bootstrap Resampling of the Model
\n", 50 | "10. Model Evaluation using Bootstrap Resampling
\n", 51 | "11. Model Visualisation
\n", 52 | " 11.1. Plot Latent Projections: in-bag & out-of-bag
\n", 53 | " 11.2. Plot Weight Vectors
\n", 54 | "12. Variable Contribution Plots
\n", 55 | "13. Export Results
\n", 56 | "\n", 57 | "
" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "toc-hr-collapsed": true 64 | }, 65 | "source": [ 66 | "
\n", 67 | " \n", 68 | "\n", 69 | "

1. Import Packages

\n", 70 | "\n", 71 | "

Packages provide additional tools that extend beyond the basic functionality of the Python programming. Prior to usage, packages need to be imported into the Jupyter environment. The following packages need to be imported for this computational workflow:

\n", 72 | "\n", 73 | "
    \n", 74 | "
  • numpy: A standard package primarily used for the manipulation of arrays
  • \n", 75 | "\n", 76 | "
  • pandas: A standard package primarily used for the manipulation of data tables
  • \n", 77 | "\n", 78 | "
  • cimcb: A library of helpful functions and tools provided by the authors
  • \n", 79 | "\n", 80 | "
  • sklearn: A standard package with tools for machine learning\n", 81 | "\n", 82 | "
      \n", 83 | "
    • train_test_split: A method to split arrays into training and test subsets
    \n", 84 | "\n", 85 | "
  • \n", 86 | "\n", 87 | "
\n", 88 | "\n", 89 | "
\n", 90 | "\n", 91 | "
" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "import numpy as np\n", 101 | "import pandas as pd\n", 102 | "import cimcb as cb\n", 103 | "from sklearn.model_selection import train_test_split\n", 104 | "\n", 105 | "print('All packages successfully loaded')" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "
\n", 113 | " \n", 114 | "\n", 115 | "

Optional: Set Random Seed for Splitting Data into Training & Test sets

\n", 116 | "\n", 117 | "

To reproduce the figures in the research article, set the random seed to 8. This seed is used in the train_test_split method to reproducibly split the source data into a training and test set.

\n", 118 | "\n", 119 | "
    \n", 120 | "
  • seed_split: Seed the generator using an integer value e.g. 42 (default = None ; no seed set)

  • \n", 121 | "
\n", 122 | "
\n", 123 | "\n", 124 | "
" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "seed_split = 8\n", 134 | "# seed_split = None" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "
\n", 142 | "\n", 143 | "\n", 144 | "

2. Load Data & Peak Sheet

\n", 145 | "\n", 146 | "

This CIMCB helper function load_dataXL() loads the Data and Peak sheet from an Excel file. In addition, this helper function checks that the data is in the standardised Excel file format described above. After the initial checks, load_dataXL() outputs two individual Pandas DataFrames (i.e. tables) called DataTable and PeakTable from the Excel file MTBLS90.xlsx. This helper function requires values for the following parameters:

\n", 147 | "
    \n", 148 | "
  • filename: The name of the excel file (.xlsx file)
  • \n", 149 | "
  • DataSheet: The name of the data sheet in the file
  • \n", 150 | "
  • PeakSheet: The name of the peak sheet in the file
  • \n", 151 | "
\n", 152 | "
\n", 153 | "\n", 154 | "
" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "home = 'data/'\n", 164 | "file = 'MTBLS90.xlsx' \n", 165 | "\n", 166 | "DataTable,PeakTable = cb.utils.load_dataXL(filename=home + file, DataSheet='Data', PeakSheet='Peak')" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "
\n", 174 | "\n", 175 | "\n", 176 | "

3. Extract X & Y

\n", 177 | "\n", 178 | "

Prior to performing any statistical or machine learning modelling, it is best practice to assess the quality of the data and remove metabolites that lack reproducible measurements (Broadhurst et al. 2018). \n", 179 | "\n", 180 | "
\n", 181 | "

The following steps are needed to extract the X matrix of metabolite concentrations and associated Y vector of classification labels (“M”=1 and “F”=0):\n", 182 | " \n", 183 | "

    \n", 184 | " \n", 185 | "
  • Create a subset of DataTable called DataTable2, with samples only in the Class “M” or “F”
  • \n", 186 | " \n", 187 | "\n", 188 | "
  • Create the variable PeakList to hold the names (M1...Mn) of the metabolites to be used
  • \n", 189 | "\n", 190 | "
  • Using this PeakList, extract all corresponding columns (i.e. metabolite data) from DataTable2, and place it in matrix X
  • \n", 191 | "\n", 192 | "
  • Set Y to a list (or 1D array) of binary outcomes based on the Class column from DataTable2 (“M”=1 and “F”=0)
  • \n", 193 | "\n", 194 | "
\n", 195 | "\n", 196 | "
\n", 197 | "
" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# Extract PeakList\n", 207 | "PeakList = PeakTable['Name'] \n", 208 | "\n", 209 | "# Select Subset of Data (Class \"GC\" or \"HE\" only)\n", 210 | "DataTable2 = DataTable[(DataTable.Class == 1) | (DataTable.Class == 0)]\n", 211 | "\n", 212 | "# Extract X Data\n", 213 | "X = DataTable2[PeakList] \n", 214 | "\n", 215 | "# Create a Binary Y Vector \n", 216 | "Outcomes = DataTable2['Class'] \n", 217 | "Y = np.array(Outcomes) \n", 218 | "\n", 219 | "# Optional: Save Class Labels for Figure Legends\n", 220 | "Class = DataTable2.Sex" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "
\n", 228 | "\n", 229 | "\n", 230 | "

4. Split Data into Train & Test Set

\n", 231 | "\n", 232 | "\n", 233 | "

The train_test_split method is used to split the X and Y data into training (2/3rd) and test (1/3rd) sets using stratified random selection. Additionally, the Class data is split for use in figure legends. The seed is selected in the optional section above. For further information on this method, refer to the scikit learn documentation.\n", 234 | "\n", 235 | "
\n", 236 | "

" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "# Split Data into Train (2/3rd) and Test (1/3rd)\n", 246 | "XTrain, XTest, YTrain, YTest, ClassTrain, ClassTest = train_test_split(X,\n", 247 | " Y,\n", 248 | " Class,\n", 249 | " test_size=1/3,\n", 250 | " stratify=Y,\n", 251 | " random_state=seed_split)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "
\n", 259 | " \n", 260 | "\n", 261 | "

5. Extract, Transform, & Scale X Data with Missing Values Imputed

\n", 262 | "\n", 263 | "

The X Data (XTrain and XTest) is log transformed, mean centred, and scaled to unit variance (with missing values imputed using K-Nearest Neighbour) prior to modelling following standard protocols for metabolomics (Broadhurst and Kell, 2006).

\n", 264 | "
    \n", 265 | " \n", 266 | "
  • Log-transform the values in XTrain
  • \n", 267 | "\n", 268 | "
  • Using the helper function cb.utils.scale(), scale the log-transformed data (XTrainLog) to the unit variance (a.k.a. auto scaling), while also returning mu & sigma.
  • \n", 269 | "\n", 270 | "
  • Impute the missing values by using a k-nearest neighbour approach (with three neighbours) using the helper function cb.utils.knnimpute() to give the final matrix, XTrainKnn
  • \n", 271 | "\n", 272 | "
  • Log-transform the values in XTest
  • \n", 273 | "\n", 274 | "
  • Using the helper function cb.utils.scale(), scale the log-transformed data (XTestLog) to the unit variance (a.k.a. auto scaling) using the mu & sigma from above.\n", 275 | " \n", 276 | "
  • Impute the missing values by using a k-nearest neighbour approach (with three neighbours) using the helper function cb.utils.knnimpute() to give the final matrix, XTestKnn
  • \n", 277 | " \n", 278 | "
\n", 279 | "\n", 280 | "
" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "# Extract X Train Data \n", 290 | "# XTrainLog = np.log(XTrain) # No need to log transform \n", 291 | "XTrainScale, mu, sigma = cb.utils.scale(XTrain, method='auto', return_mu_sigma=True) \n", 292 | "XTrainKnn = cb.utils.knnimpute(XTrainScale, k=3) \n", 293 | "\n", 294 | "# Extract X Test Data\n", 295 | "# XTestLog = np.log(XTest) \n", 296 | "XTestScale = cb.utils.scale(XTest, method='auto', mu=mu, sigma=sigma) \n", 297 | "XTestKnn = cb.utils.knnimpute(XTestScale, k=3)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "
\n", 305 | " \n", 306 | "\n", 307 | "

6. Hyperparameter Optimisation

\n", 308 | "\n", 309 | "

The CIMCB helper function cb.cross_val.KFold() is used to carry out k-fold cross-validation (k=5) on a set of PLS-DA models with varying number of latent variables (1 to 6) to determine the optimal number. In k-fold cross-validation, the original dataset is randomly split into k sized folds and subsequently trained for k iterations, where the model is trained on 1 – k folds and tested on the k fold (Kohavi 1995). This helper function requires values for the following parameters:

\n", 310 | " \n", 311 | "
    \n", 312 | "
  • model: The class of model used by the function, cb.model.PLS_SIMPLS
  • \n", 313 | "
  • X: The metabolite data matrix, XTrainKnn
  • \n", 314 | "
  • Y: The binary outcome vector, YTrain
  • \n", 315 | "
  • param_dict: a dictionary, param_dict, that describes all key:value pairs to search, with the key name corresponding to the hyperparameter in the model class and the value as the list of possible values
  • \n", 316 | "
  • folds: The number of folds in the k-fold cross validation
  • \n", 317 | "
  • n_mc: The number of Monte Carlo repetitions of the k-fold CV
  • \n", 318 | "
\n", 319 | "
\n", 320 | "
" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# Parameter Dictionary\n", 330 | "param_dict = {'n_components': [1, 2, 3, 4, 5, 6]} \n", 331 | "\n", 332 | "# Initialise\n", 333 | "cv = cb.cross_val.KFold(model=cb.model.PLS_SIMPLS, \n", 334 | " X=XTrainKnn, \n", 335 | " Y=YTrain, \n", 336 | " param_dict=param_dict, \n", 337 | " folds=5,\n", 338 | " n_mc=10) \n", 339 | "\n", 340 | "# Run and Plot\n", 341 | "cv.run() " 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "
\n", 349 | " \n", 350 | "\n", 351 | "

6.1. Plot R² & Q²

\n", 352 | "\n", 353 | "

When cv.plot(metric='r2q2', method='absolute') is run, 2 plots of $R^2$ and $Q^2$ statistics are displayed: (a) the absolute difference of ($R^2 - Q^2$) vs. $Q^2$, and (b) $R^2$ and $Q^2$ against the number of latent variables. Alternatively, if method='ratio', plot (a) is the absolute difference of ($R^2 - Q^2$) / $R^2$ vs. $Q^2$. The optimal number of hyperparameters is selected based on the point of inflection in figure b, or if a clear inflection point is not present, where | ($R^2 - Q^2$) | = 0.2. Note, the $R^2$ is the mean coefficient of determination for the full dataset, and the $Q^2$ is the mean coefficient of determination for cross-validated prediction dataset over the 10 Monte Carlo repetitions. The following parameters of cv.plot() can be altered:

\n", 354 | "
    \n", 355 | "
  • metric: The metric used for the plots (default = 'r2q2'). Alternative metrics include 'auc', 'acc', 'f1score', 'prec', 'sens', and 'spec'\n", 356 | "
  • method: The types of plots displayed (default = 'absolute'). Alternative value is 'ratio'\n", 357 | "
  • ci: The confidence interval in figure b (default = 95)\n", 358 | "
  • legend: to show legend (default = True). Alternative value is False\n", 359 | "
\n", 360 | "\n", 361 | "
\n", 362 | "
" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "cv.plot(metric='r2q2',\n", 372 | " method='absolute',\n", 373 | " ci=95,\n", 374 | " legend=True)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "
\n", 382 | "\n", 383 | "\n", 384 | "

6.2. Plot Latent Projections: Full & CV

\n", 385 | " \n", 386 | "

When cv.plot_projections() is run, an n x n grid of plots are displayed, where n is the number of latent variables (LV) to interrogate. These plots include score plots, distribution plots, and receiver operating characteristic (ROC) curves.

\n", 387 | "\n", 388 | "

There are C(n,2) score plots (i.e. a score plot for every combination of 2 LVs e.g. LV1 scores vs. LV2 scores). Each score plot includes the full scores (as circles) and CV scores (as crosses) coloured by group, as well as the 95% confidence interval ellipses for the full scores (as solid lines) and CV scores (as dashed lines). Additionally, the optimal line of separation (dashed grey line) and orthogonal line (solid grey line) are shown.

\n", 389 | "\n", 390 | "

There are n distribution plots (a distribution plot for each LV scores). The distribution of the full and CV scores for each corresponding group (i.e. 4 discrete distributions overlayed for 2 groups). Each distribution is calculated using kernel density estimation, a standard non-parametric method used for estimation of a probability density function based on a set of data points (Silverman 1986).

\n", 391 | "\n", 392 | "

There are C(n,2) ROC curves (a ROC curve for every combination of 2 LVs e.g. LV1 scores vs. LV2 scores). As the ROC curves are for every combination of 2 LVs, the discrimination is calculated based on optimal separation (i.e. the grey line from the corresponding score plot). For each ROC curve plot there is a ROC curve for the full model (green), and ROC curve for the cv model with 95% confidence intervals (yellow). Additionally, the equal distribution line (dashed black line) is shown.

\n", 393 | "\n", 394 | "
    \n", 395 | "
  • components: LVs to plot (default = \"all\" ; plot all components). Alternatively, list the components to plot e.g. [1,3,4]
  • \n", 396 | "
  • plot: Data to show (default = 'ci' ; plot only 95% confidence interval ellipses). Alternative values include 'meanci', 'full', 'cv', and 'all'
  • \n", 397 | "
  • label: Add labels to groups (default = None ; refers to groups as 0/1)\n", 398 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'scatter', 'dist', 'roc', and 'none'.
  • \n", 399 | "
\n", 400 | "\n", 401 | "
\n", 402 | "
" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "cv.plot_projections(components=[1,2,3,4],\n", 412 | " plot='ci',\n", 413 | " label=ClassTrain,\n", 414 | " legend='all')" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "cv.plot_projections(components=[1,2,3,4],\n", 424 | " plot='meanci',\n", 425 | " label=ClassTrain,\n", 426 | " legend='all')" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "
\n", 434 | "\n", 435 | "\n", 436 | "

7. Build Model & Evaluate

\n", 437 | "\n", 438 | "

A PLS-DA model using cb.model.PLS_SIMPLS is created and initialised using the optimal hyperparameter values determined in step 4 (i.e. the number of latent variables). The implementation of PLS in the cb.model.PLS_SIMPLS class uses the SIMPLS algorithm (De Jong, 1993).

\n", 439 | " \n", 440 | "

Following this initialisation, the PLS-DA model is trained using the .train(X, Y) method where the X matrix is XTrainKnn and the Y vector is YTrain, returning the Y predicted value YPredTrain. This model is then tested using the .test(X, Y) method where the X matrix is XTestKnn and the Y vector is YTest, returning the Y predicted value YPredTest.

\n", 441 | "\n", 442 | "

The .evaluate() method can be used to evaluate the predictability of the model using the train and test set. There are three plots produced when this method is run including a violin plot, probability density function, and a ROC curve. The violin plots show the predicted score for the train and test (by group). The distribution plot shows the probability density function of the predicted scores for the train and test (by group). The ROC curve shows the ROC curve for the train (green) and test (yellow). The following parameter values in .evaluate() can be altered:\n", 443 | " \n", 444 | "

    \n", 445 | "
  • testset: Plot test dataset (default = None). Alternative, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n", 446 | "
  • label: Add labels to groups (default = None ; refer to groups as 0/1)\n", 447 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'roc', 'dist', 'violin', and 'none'
  • \n", 448 | "\n", 449 | "
    \n", 450 | "
" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "# Build Model\n", 460 | "model = cb.model.PLS_SIMPLS(n_components=2)\n", 461 | "YPredTrain = model.train(XTrainKnn, YTrain)\n", 462 | "YPredTest = model.test(XTestKnn)\n", 463 | "\n", 464 | "# Put YTrain and YPredTrain in a List\n", 465 | "EvalTrain = [YTrain, YPredTrain]\n", 466 | "\n", 467 | "# Put YTest and YPrestTest in a List\n", 468 | "EvalTest = [YTest, YPredTest]\n", 469 | "\n", 470 | "# Save Weights & Feature Importance \n", 471 | "model_weights = model.x_weights_ # [LV1, LV2]\n", 472 | "model_fi = model.feature_importance_ # [VIP, Coefficient]\n", 473 | "\n", 474 | "# Evaluate Model (include Test Dataset)\n", 475 | "model.evaluate(testset=EvalTest,\n", 476 | " label=ClassTrain,\n", 477 | " legend='all') " 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "
\n", 485 | "\n", 486 | "\n", 487 | "

8. Permutation Test

\n", 488 | "\n", 489 | "

After a model has been trained, the .permutation_test() method can be used to assess the reliability of the trained model (after selecting the number of latent variables). For the permutation test, the metabolite data matrix is randomised (permuted or 'shuffled') while the Y (i.e. outcome) is fixed, and the model is subsequently trained and tested on this randomised data (Szymańska et al. 2012). This process is repeated (in this case, n=100) to construct a null distribution against which to fairly assess the model. For a dataset with features with no meaningful contribution, we would expect $R^2$ and $Q^2$ values similar to those of a randomised dataset, while for a dataset with features with meaningful contribution, we would expect $R^2$ and $Q^2$ values significantly higher than those of the randomised dataset. When .permutation_test() is run, 2 plots are displayed: (a) $R^2$ and $Q^2$ against "correlation of permuted data against original data", and (b) probability density functions for $R^2$ and $Q^2$, with the $R^2$ and $Q^2$ values found for the model trained on the original data presented as ball-and-stick. The following parameter values of .permutation_test() can be altered: \n", 490 | "\n", 491 | "

    \n", 492 | "
  • metric: The metric used for the plots (default = 'r2q2'). Alternative metrics include 'auc', 'acc', 'f1score', 'prec', 'sens', and 'spec'. Multiple metrics can be plotted using a list e.g. ['r2q2', 'auc']\n", 493 | "
  • nperm: The number of permutations. (default = 100)\n", 494 | "
  • legend: To show legend (default = True). Alternative value is False\n", 495 | "
\n", 496 | "\n", 497 | "
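The idea behind the permutation test can be sketched in plain numpy (this is a conceptual illustration with made-up data, not the cimcb implementation): shuffling the pairing between data and outcome breaks any real association, and the statistic from the true pairing is compared against the resulting null distribution.

```python
import numpy as np

rng = np.random.default_rng(0)
y = np.array([0] * 20 + [1] * 20)      # binary outcome
x = rng.normal(0, 1, 40) + 1.5 * y     # single feature with a real group effect

def group_diff(x, y):
    # test statistic: absolute difference in group means
    return abs(x[y == 1].mean() - x[y == 0].mean())

observed = group_diff(x, y)
# Null distribution: shuffle the data/outcome pairing 100 times
null = np.array([group_diff(x, rng.permutation(y)) for _ in range(100)])
p_value = (np.sum(null >= observed) + 1) / (len(null) + 1)
```

The same logic applies to the model-level $R^2$/$Q^2$ statistics used by .permutation_test(): a reliable model should sit well above its own null distribution.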
\n", 498 | "
" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "model.permutation_test(metric='r2q2',\n", 508 | " nperm=100,\n", 509 | " legend=True)" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": {}, 515 | "source": [ 516 | "
\n", 517 | "\n", 518 | "\n", 519 | "

9. Bootstrap Resampling of the Model

\n", 520 | "\n", 521 | "

Bootstrap resampling is a method based on random resampling with replacement, commonly used to provide an estimate of the sampling distribution of a test statistic (Efron, 1982). In the context of this workflow, the PLS model from step 7 with its fixed hyperparameter values (i.e. number of LVs = 2) is retrained on the data resampled with replacement (in-bag) and evaluated on the unused data (out-of-bag) for 100 resamples. After the model is evaluated for each bootstrap resample, metrics including the predicted values (ypred), LV scores, LV loadings, and feature importance (VIP and coefficients) are stored and used to calculate 95% confidence intervals. To calculate the 95% confidence intervals, various methods can be used including the basic percentile method, the corrected percentile method (a.k.a. bias-corrected method), and the commonly used bias-corrected and accelerated (BCA) method. In this example, the BCA method is used with the class cb.bootstrap.BCA. Alternatively, use cb.bootstrap.Per for the percentile method, or cb.bootstrap.CPer for the corrected percentile method. To create and run the bootmodel for any method, the following parameter values need to be set:\n", 522 | " \n", 523 | "

    \n", 524 | "
  • model: A model with fixed hyperparameter values for bootstrap resampling
  • \n", 525 | "
  • bootnum: The number of bootstrap resamples (default = 100)
  • \n", 526 | "\n", 527 | "
    \n", 528 | "
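The core resampling loop can be sketched with numpy (a conceptual illustration of the basic percentile method applied to a simple mean, not the cimcb BCA implementation, which adds bias and acceleration corrections):

```python
import numpy as np

rng = np.random.default_rng(1)
data = rng.normal(10, 2, 100)   # any statistic's input data (illustrative)

bootnum = 100
stats = np.empty(bootnum)
for i in range(bootnum):
    # In-bag: resample with replacement; indices not drawn form the out-of-bag set
    idx = rng.integers(0, len(data), len(data))
    stats[i] = data[idx].mean()

# Basic percentile 95% confidence interval from the bootstrap distribution
ci_lower, ci_upper = np.percentile(stats, [2.5, 97.5])
```

In the workflow the statistic recomputed per resample is not a mean but the model's predicted values, LV scores, loadings, and feature importance metrics.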
" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [ 537 | "# Extract X Data and Train Model\n", 538 | "XBoot = DataTable2[PeakList]\n", 539 | "# XBootLog = np.log(XBoot)\n", 540 | "XBootScale = cb.utils.scale(XBoot, method='auto')\n", 541 | "XBootKnn = cb.utils.knnimpute(XBootScale, k=3)\n", 542 | "YPredBoot = model.train(XBootKnn, Y)\n", 543 | "\n", 544 | "# Build Boostrap Models\n", 545 | "bootmodel = cb.bootstrap.BCA(model, bootnum=100) \n", 546 | "bootmodel.run()" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "
\n", 554 | "\n", 555 | "\n", 556 | "

10. Model Evaluation using Bootstrap Resampling

\n", 557 | "\n", 558 | "

After the bootmodel has been run, the .evaluate() method can be used to provide an estimate of the robustness and a measure of the generalised predictability of the model. When this method is run, three plots are produced: a violin plot, a probability density function, and a ROC curve. The violin plot shows the distribution of the median predicted scores for the in-bag and out-of-bag samples (i.e. train and test) by group. The distribution plot shows the probability density function of the median predicted scores for the in-bag and out-of-bag samples (i.e. train and test) by group. The ROC plot shows the ROC curve with the median (green) and 95% CI for the in-bag (light green band), and the median (yellow) and 95% CI for the out-of-bag (light yellow band). The method used to calculate the 95% CI for the in-bag (green) is the class selected in the previous cell. In this example, the bias-corrected and accelerated method is used, as cb.bootstrap.BCA was used in the previous cell to create bootmodel. \n", 559 | " \n", 560 | "

    \n", 561 | "
  • label: Add labels to groups (default = None ; refer to groups as 0/1)\n", 562 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'roc', 'dist', 'violin', and 'none'
  • \n", 563 | "
  • trainset: Plot train dataset instead of median in-bag (default = None). Alternatively, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n", 564 | "
  • testset: Plot test dataset instead of median out-of-bag (default = None). Alternatively, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n", 565 | "
    \n", 566 | "
" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "bootmodel.evaluate(label=Class,\n", 576 | " legend='all') " 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "\n", 584 | "\n", 585 | "
\n", 586 | "\n", 587 | "\n", 588 | "

11. Model Visualisation

\n", 589 | "\n", 590 | "
\n", 591 | "
" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "
\n", 599 | " \n", 600 | "\n", 601 | "

11.1 Plot Latent Projections: in-bag & out-of-bag

\n", 602 | " \n", 603 | "

After the bootmodel has been run, the .plot_projections() method can be used to visualise the latent variable (LV) scores. When this method is run, an n x n grid of plots is displayed, where n is the number of LVs. These plots include score plots, distribution plots, and receiver operating characteristic (ROC) curves.

\n", 604 | "\n", 605 | "

There are C(n,2) score plots (i.e. a score plot for every combination of 2 LVs e.g. LV1 scores vs. LV2 scores). Each score plot includes the in-bag scores (as circles) and out-of-bag scores (as crosses) coloured by group, as well as the 95% confidence interval ellipses for the in-bag scores (as solid lines) and out-of-bag scores (as dashed lines). Additionally, the optimal line of separation (dashed grey line) and orthogonal line (solid grey line) are shown.

\n", 606 | "\n", 607 | "

There are n distribution plots (one for each LV's scores). Each shows the distributions of the in-bag and out-of-bag scores for each corresponding group (i.e. 4 discrete distributions overlaid for 2 groups). Each distribution is calculated using kernel density estimation, a standard non-parametric method for estimating a probability density function from a set of data points (Silverman 1986).

\n", 608 | "\n", 609 | "

There are C(n,2) ROC curves (a ROC curve for every combination of 2 LVs e.g. LV1 scores vs. LV2 scores). As the ROC curves are for every combination of 2 LVs, the discrimination is calculated based on optimal separation (i.e. the grey line from the corresponding score plot). Each ROC plot shows a ROC curve for the initial model's LV scores with 95% confidence intervals calculated using the in-bag LV scores (green), and a ROC curve for the out-of-bag LV scores with 95% confidence intervals. The method used to calculate the 95% CI for the in-bag (green) is the class used to create the bootmodel; in this example, the bias-corrected and accelerated method (cb.bootstrap.BCA). Additionally, the equal distribution line (dashed black line) is shown. \n", 610 | "

\n", 612 | "\n", 613 | "
    \n", 614 | "
  • plot: Data to show in plot (default = \"ci\" ; plot only 95% confidence interval ellipses). Alternative values include 'meanci', 'ib', 'oob', and 'all'
  • \n", 615 | "
  • label: Add labels to groups in scores plot (default = None ; refer to groups as 0/1).\n", 616 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'scatter', 'dist', 'roc', and 'none'
  • \n", 617 | "\n", 618 | "
    \n", 619 | "
" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "bootmodel.plot_projections(plot='ib',\n", 629 | " label=Class,\n", 630 | " legend='all')" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": {}, 637 | "outputs": [], 638 | "source": [ 639 | "bootmodel.plot_projections(plot='oob',\n", 640 | " label=Class,\n", 641 | " legend='all')" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [ 650 | "bootmodel.plot_projections(plot='ci',\n", 651 | " label=Class,\n", 652 | " legend='all')" 653 | ] 654 | }, 655 | { 656 | "cell_type": "markdown", 657 | "metadata": {}, 658 | "source": [ 659 | "
\n", 660 | " \n", 661 | "\n", 662 | "

11.2 Plot Weight Vectors

\n", 663 | "\n", 664 | "

After the bootmodel has been run, the .plot_weights() method can be used to visualise the latent variable (LV) weight vectors. When this method is run, n plots are displayed, where n is the number of LVs. The circles in each plot represent the LV weight vectors for the initial model. The 95% confidence intervals are calculated using the bootstrap method selected in step 9 (in this example, BCA). Any metabolite weight with a confidence interval crossing the zero line is considered non-significant to the latent variable. This method requires values for the following parameters:

\n", 665 | " \n", 666 | "
    \n", 667 | "
  • PeakTable: Cleaned PeakTable from step 3
  • \n", 668 | "
  • PeakList: Peaks to include in plot (default = None; include all peaks).\n", 669 | "
  • plot: Whether to plot the data or the median as circles (default = 'data'). Alternative values include 'median', and a list structured as [LV1, LV2, etc.]\n", 670 | "
  • ylabel: Name of column in PeakTable to use as the ylabel (default = 'Label')\n", 671 | "
  • sort: Whether to sort plots in absolute descending order (default = True)
  • \n", 672 | "
\n", 673 | " \n", 674 | "
\n", 675 | "
" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "metadata": { 682 | "scrolled": false 683 | }, 684 | "outputs": [], 685 | "source": [ 686 | "bootmodel.plot_weights(PeakTable,\n", 687 | " PeakList,\n", 688 | " plot='median',\n", 689 | " ylabel='Label', \n", 690 | " sort=False) " 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "
\n", 698 | "\n", 699 | "\n", 700 | "

12. Variable Contribution Plots

\n", 701 | "\n", 702 | "

After the bootmodel has been run, the .plot_featureimportance() method can be used to visualise the feature importance metrics. When this method is run, 2 plots are displayed: the Coefficient plot and the Variable Importance in Projection (VIP) plot. The circles represent the values in the initial model. The 95% confidence intervals are calculated using the bootstrap method selected in step 9 (in this example, BCA).

\n", 703 | " \n", 704 | "

The coefficients (in the Coefficient plot) contain information about the overall contribution of each metabolite. The coefficient values can be either positive or negative, and therefore contribute negatively or positively to the model. Any metabolite coefficient value with a confidence interval crossing the zero line is considered non-significant to the model.

\n", 705 | " \n", 706 | "

The values in the VIP plot contain information about the overall contribution of each metabolite. Unlike the coefficient values, the VIP is absolute, with higher values representing a higher importance to the model. Typically, metabolites with a VIP greater than 1 are considered \"important\" in the model.

\n", 707 | " \n", 708 | "

This method also exports the feature importance metrics from bootmodel as a pandas DataFrame (table), and requires values for the following parameters:

\n", 709 | " \n", 710 | "
    \n", 711 | "
  • PeakTable: Cleaned PeakTable from step 3
  • \n", 712 | "
  • PeakList: Peaks to include in plot (default = None; include all peaks).\n", 713 | "
  • plot: Whether to plot the data or the median as circles (default = 'data'). Alternative values include 'median', and a list structured as [Coef, VIP].\n", 714 | "
  • ylabel: Name of column in PeakTable to use as the ylabel (default = 'Label')\n", 715 | "
  • sort: Whether to sort plots in absolute descending order (default = True)
  • \n", 716 | "
\n", 717 | " \n", 718 | "
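The "confidence interval crossing zero" rule used for both the weights and the coefficients reduces to a simple vectorised check. A minimal numpy sketch with hypothetical CI bounds (not values from this dataset):

```python
import numpy as np

# Hypothetical bootstrap 95% CI bounds for three metabolite coefficients
coef_lower = np.array([-0.30, 0.05, -0.10])
coef_upper = np.array([-0.10, 0.25, 0.15])

# A coefficient is considered significant when its CI excludes zero,
# i.e. the whole interval is above zero or the whole interval is below zero
significant = (coef_lower > 0) | (coef_upper < 0)
```

The third metabolite's interval spans zero, so it would be flagged non-significant in the Coefficient plot.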
\n", 719 | "
" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": null, 725 | "metadata": {}, 726 | "outputs": [], 727 | "source": [ 728 | "feature_importance = bootmodel.plot_featureimportance(PeakTable,\n", 729 | " PeakList,\n", 730 | " plot='median',\n", 731 | " ylabel='Label', \n", 732 | " sort=False) " 733 | ] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "metadata": {}, 738 | "source": [ 739 | "
\n", 740 | "\n", 741 | "\n", 742 | "

13. Export Results

\n", 743 | "\n", 744 | "

The feature importance table created in step 12 can be exported using the inbuilt .to_excel() function within a pandas DataFrame. This function requires an input with the name of the file to create, and it can include directories by using the ‘ / ’ symbol. In the cell below, the table feature_importance is exported as an Excel file called 'PLSDA_MTBLS90.xlsx' in the 'results' folder.

\n", 745 | "\n", 746 | "

" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": null, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [ 755 | "export_folder = 'results/'\n", 756 | "export_file = 'PLSDA_MTBLS90.xlsx'\n", 757 | "\n", 758 | "feature_importance.to_excel(export_folder + export_file)\n", 759 | "print(\"Done!\")" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "metadata": {}, 766 | "outputs": [], 767 | "source": [] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": null, 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [] 775 | } 776 | ], 777 | "metadata": { 778 | "kernelspec": { 779 | "display_name": "Python 3", 780 | "language": "python", 781 | "name": "python3" 782 | }, 783 | "language_info": { 784 | "codemirror_mode": { 785 | "name": "ipython", 786 | "version": 3 787 | }, 788 | "file_extension": ".py", 789 | "mimetype": "text/x-python", 790 | "name": "python", 791 | "nbconvert_exporter": "python", 792 | "pygments_lexer": "ipython3", 793 | "version": "3.7.3" 794 | }, 795 | "toc": { 796 | "base_numbering": 1, 797 | "nav_menu": {}, 798 | "number_sections": false, 799 | "sideBar": false, 800 | "skip_h1_title": false, 801 | "title_cell": "Table of Contents", 802 | "title_sidebar": "Contents", 803 | "toc_cell": false, 804 | "toc_position": {}, 805 | "toc_section_display": false, 806 | "toc_window_display": false 807 | }, 808 | "toc-autonumbering": false, 809 | "toc-showmarkdowntxt": false 810 | }, 811 | "nbformat": 4, 812 | "nbformat_minor": 4 813 | } 814 | -------------------------------------------------------------------------------- /notebooks/PLSDA_ST001047.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc-hr-collapsed": false 7 | }, 8 | "source": [ 9 | "
\n", 10 | " To begin: Click anywhere in this cell and press Run on the menu bar. This executes the current cell and then highlights the next cell. There are two types of cells: a text cell and a code cell. When you Run a text cell (we are in a text cell now), you advance to the next cell without executing any code. When you Run a code cell (identified by In[ ]: to the left of the cell) you advance to the next cell after executing all the Python code within that cell. Any visual results produced by the code (text/figures) are reported directly below that cell. Press Run again. Repeat this process until the end of the notebook. NOTE: All the cells in this notebook can be automatically executed sequentially by clicking Kernel → Restart & Run All. Should anything crash then restart the Jupyter kernel by clicking Kernel → Restart, and start again from the top.\n", 11 | " \n", 12 | "
" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "
\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "\n", 24 | "

Metabolomics Data Visualisation Workflow for PLS-DA

\n", 25 | "\n", 26 | "
\n", 27 | "
\n", 28 | "
\n", 29 | "

This Jupyter Notebook describes a metabolomics data analysis and visualisation workflow for partial least squares regression (a.k.a. projection to latent structures) with a binary classification outcome.

\n", 30 | "\n", 31 | "

This computational workflow is described using a previously published NMR dataset by Chan et al. (2016). The study compared the urine metabolomic profiles of patients characterised as Gastric Cancer (GC; n=43), Benign Gastric Disease (BN; n=40), and Healthy Control (HE; n=40) using 149 named metabolites. For the purpose of this computational workflow, we compare only the GC vs HE samples in a binary discriminant analysis. The deconvolved and annotated data from this study are deposited on Metabolomics Workbench (Study ID: ST001047), and can be accessed directly via its Project DOI: 10.21228/M8B10B. The Excel file used in this workflow can be accessed via the following link: ST001047.xlsx.

\n", 32 | "\n", 33 | "

This computational workflow requires a dataset to be in, or converted to, a previously described standardised Excel file format (Mendez et al. 2019). This format uses the Tidy Data Framework (Wickham, 2014), where each row represents an observation (e.g. sample) and each column represents a variable (e.g. age or metabolite). Each Excel file (per study) contains two sheets: a data sheet and a peak sheet. The data sheet contains the metabolite concentrations together with the associated metadata for each observation (requiring the inclusion of the columns: Idx, SampleID, and Class). The peak sheet contains additional metadata pertaining to the metabolites in the data sheet (requiring the inclusion of the columns: Idx, Name, and Label). The standardisation of this format allows for the efficient re-use of this computational workflow.

\n", 34 | "\n", 35 | "
\n", 36 | "The steps included in this data analysis and visualisation workflow are: \n", 37 | "
\n", 38 | "\n", 39 | "1. Import Packages
\n", 40 | "2. Load Data & Peak Sheet
\n", 41 | "3. Extract X & Y
\n", 42 | "4. Split Data into Train & Test Set
\n", 43 | "5. Extract, Transform, & Scale X Data with Missing Values Imputed
\n", 44 | "6. Hyperparameter Optimisation
\n", 45 | " 6.1. Plot R² & Q²
\n", 46 | " 6.2. Plot Latent Projections: Full & CV
\n", 47 | "7. Build Model & Evaluate
\n", 48 | "8. Permutation Test
\n", 49 | "9. Bootstrap Resampling of the Model
\n", 50 | "10. Model Evaluation using Bootstrap Resampling
\n", 51 | "11. Model Visualisation
\n", 52 | " 11.1. Plot Latent Projections: in-bag & out-of-bag
\n", 53 | " 11.2. Plot Weight Vectors
\n", 54 | "12. Variable Contribution Plots
\n", 55 | "13. Export Results
\n", 56 | "\n", 57 | "
" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "toc-hr-collapsed": true 64 | }, 65 | "source": [ 66 | "
\n", 67 | " \n", 68 | "\n", 69 | "

1. Import Packages

\n", 70 | "\n", 71 | "

Packages provide additional tools that extend beyond the basic functionality of the Python programming language. Prior to usage, packages need to be imported into the Jupyter environment. The following packages need to be imported for this computational workflow:

\n", 72 | "\n", 73 | "
    \n", 74 | "
  • numpy: A standard package primarily used for the manipulation of arrays
  • \n", 75 | "\n", 76 | "
  • pandas: A standard package primarily used for the manipulation of data tables
  • \n", 77 | "\n", 78 | "
  • cimcb: A library of helpful functions and tools provided by the authors
  • \n", 79 | "\n", 80 | "
  • sklearn: A standard package with tools for machine learning\n", 81 | "\n", 82 | "
      \n", 83 | "
    • train_test_split: A method to split arrays into training and test subsets
    \n", 84 | "\n", 85 | "
  • \n", 86 | "\n", 87 | "
\n", 88 | "\n", 89 | "
\n", 90 | "\n", 91 | "
" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "import numpy as np\n", 101 | "import pandas as pd\n", 102 | "import cimcb as cb\n", 103 | "from sklearn.model_selection import train_test_split\n", 104 | "\n", 105 | "print('All packages successfully loaded')" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "
\n", 113 | " \n", 114 | "\n", 115 | "

Optional: Set Random Seed for Splitting Data into Training & Test sets

\n", 116 | "\n", 117 | "

To reproduce the figures in the research article, set the random seed to 40. This seed is used in the train_test_split method to reproducibly split the source data into a training and test set.

\n", 118 | "\n", 119 | "
    \n", 120 | "
  • seed_split: Seed the generator using an integer value e.g. 42 (default = None ; no seed set)

  • \n", 121 | "
\n", 122 | "
\n", 123 | "\n", 124 | "
" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "seed_split = 40\n", 134 | "# seed_split = None" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "
\n", 142 | "\n", 143 | "\n", 144 | "

2. Load Data & Peak Sheet

\n", 145 | "\n", 146 | "

The CIMCB helper function load_dataXL() loads the Data and Peak sheet from an Excel file. In addition, this helper function checks that the data is in the standardised Excel file format described above. After the initial checks, load_dataXL() outputs two individual Pandas DataFrames (i.e. tables) called DataTable and PeakTable from the Excel file ST001047.xlsx. This helper function requires values for the following parameters:

\n", 147 | "
    \n", 148 | "
  • filename: The name of the excel file (.xlsx file)
  • \n", 149 | "
  • DataSheet: The name of the data sheet in the file
  • \n", 150 | "
  • PeakSheet: The name of the peak sheet in the file
  • \n", 151 | "
\n", 152 | "
\n", 153 | "\n", 154 | "
" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "home = 'data/'\n", 164 | "file = 'ST001047.xlsx'\n", 165 | "\n", 166 | "DataTable,PeakTable = cb.utils.load_dataXL(filename=home + file, DataSheet='Data', PeakSheet='Peak')" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "
\n", 174 | "\n", 175 | "\n", 176 | "

3. Extract X & Y

\n", 177 | "\n", 178 | "

Prior to performing any statistical or machine learning modelling, it is best practice to assess the quality of the data and remove metabolites that lack reproducible measurements (Broadhurst et al. 2018). In this dataset, ST001047.xlsx, the QC-RSD and percentage of missing values have been previously calculated (refer to the peak sheet). In this Jupyter Notebook, we remove all metabolites that do not meet the following criteria:

\n", 179 | "\n", 180 | "
    \n", 181 | "
  • QC-RSD less than 20%
  • \n", 182 | "\n", 183 | "
  • Fewer than 10% of values are missing
  • \n", 184 | "
\n", 185 | "\n", 186 | "
\n", 187 | "

The following steps are needed to extract the X matrix of metabolite concentrations and associated Y vector of classification labels (“GC”=1 and “HE”=0):\n", 188 | " \n", 189 | "

    \n", 190 | " \n", 191 | "
  • Create a subset of DataTable called DataTable2, with samples only in the Class “GC” or “HE”
  • \n", 192 | " \n", 193 | "\n", 194 | "
  • Create the variable PeakList to hold the names (M1...Mn) of the metabolites to be used
  • \n", 195 | "\n", 196 | "
  • Using this PeakList, extract all corresponding columns (i.e. metabolite data) from DataTable2, and place it in matrix X
  • \n", 197 | "\n", 198 | "
  • Set Y to a list (or 1D array) of binary outcomes based on the Class column from DataTable2 (“GC”=1 and “HE”=0)
  • \n", 199 | "\n", 200 | "
\n", 201 | "\n", 202 | "
\n", 203 | "
" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "# Clean PeakTable and Extract PeakList\n", 213 | "RSD = PeakTable['QC_RSD'] \n", 214 | "PercMiss = PeakTable['Perc_missing'] \n", 215 | "PeakTableClean = PeakTable[(RSD < 20) & (PercMiss < 10)] \n", 216 | "PeakList = PeakTableClean['Name'] \n", 217 | "\n", 218 | "# Select Subset of Data (Class \"GC\" or \"HE\" only)\n", 219 | "DataTable2 = DataTable[(DataTable.Class == \"GC\") | (DataTable.Class == \"HE\")]\n", 220 | "\n", 221 | "# Extract X Data\n", 222 | "X = DataTable2[PeakList] \n", 223 | "\n", 224 | "# Create a Binary Y Vector \n", 225 | "Outcomes = DataTable2['Class'] \n", 226 | "Y = [1 if outcome == 'GC' else 0 for outcome in Outcomes] \n", 227 | "Y = np.array(Y) \n", 228 | "\n", 229 | "# Optional: Save Class Labels for Figure Legends\n", 230 | "Class = DataTable2.Class" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "
\n", 238 | "\n", 239 | "\n", 240 | "

4. Split Data into Train & Test Set

\n", 241 | "\n", 242 | "\n", 243 | "

The train_test_split method is used to split the X and Y data into training (2/3) and test (1/3) sets using stratified random selection. Additionally, the Class data is split for use in figure legends. The seed is selected in the optional section above. For further information on this method, refer to the scikit-learn documentation.\n", 244 | "\n", 245 | "
\n", 246 | "
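The stratified 2/3 : 1/3 split can be verified on toy data (the variable names mirror the workflow, but the values below are illustrative, and the Class labels are omitted for brevity):

```python
import numpy as np
from sklearn.model_selection import train_test_split

# 30 toy samples, 15 per class (illustrative, not study data)
X = np.arange(60).reshape(30, 2)
Y = np.array([0] * 15 + [1] * 15)

XTrain, XTest, YTrain, YTest = train_test_split(
    X, Y, test_size=1/3, stratify=Y, random_state=40)
```

Because stratify=Y is set, the 1:1 class balance is preserved in both subsets: the test set holds 10 samples, 5 per class.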

" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# Split Data into Train (2/3rd) and Test (1/3rd)\n", 256 | "XTrain, XTest, YTrain, YTest, ClassTrain, ClassTest = train_test_split(X,\n", 257 | " Y,\n", 258 | " Class,\n", 259 | " test_size=1/3,\n", 260 | " stratify=Y,\n", 261 | " random_state=seed_split)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "
\n", 269 | " \n", 270 | "\n", 271 | "

5. Extract, Transform, & Scale X Data with Missing Values Imputed

\n", 272 | "\n", 273 | "

The X Data (XTrain and XTest) is log transformed, mean centred, and scaled to unit variance (with missing values imputed using k-nearest neighbours) prior to modelling, following standard protocols for metabolomics (Broadhurst and Kell, 2006).

\n", 274 | "
    \n", 275 | " \n", 276 | "
  • Log-transform the values in XTrain
  • \n", 277 | "\n", 278 | "
  • Using the helper function cb.utils.scale(), scale the log-transformed data (XTrainLog) to unit variance (a.k.a. auto scaling), while also returning mu & sigma.
  • \n", 279 | "\n", 280 | "
  • Impute the missing values by using a k-nearest neighbour approach (with three neighbours) using the helper function cb.utils.knnimpute() to give the final matrix, XTrainKnn
  • \n", 281 | "\n", 282 | "
  • Log-transform the values in XTest
  • \n", 283 | "\n", 284 | "
  • Using the helper function cb.utils.scale(), scale the log-transformed data (XTestLog) to unit variance (a.k.a. auto scaling) using the mu & sigma from above.\n", 285 | " \n", 286 | "
  • Impute the missing values by using a k-nearest neighbour approach (with three neighbours) using the helper function cb.utils.knnimpute() to give the final matrix, XTestKnn
  • \n", 287 | " \n", 288 | "
\n", 289 | "\n", 290 | "
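The key point in these steps is that mu and sigma come from the training data only and are then reused on the test data. A minimal numpy sketch of that auto-scaling pattern (not the cimcb implementation; log transform and KNN imputation omitted, values illustrative):

```python
import numpy as np

XTrain = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
XTest = np.array([[2.0, 25.0]])

mu = XTrain.mean(axis=0)              # per-metabolite mean (training data)
sigma = XTrain.std(axis=0, ddof=1)    # per-metabolite SD (training data)

XTrainScale = (XTrain - mu) / sigma
XTestScale = (XTest - mu) / sigma     # NB: training statistics, not the test set's own
```

Scaling the test set with its own mean and SD would leak information about the test distribution into the preprocessing step.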
" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "# Extract X Train Data \n", 300 | "XTrainLog = np.log(XTrain) \n", 301 | "XTrainScale, mu, sigma = cb.utils.scale(XTrainLog, method='auto', return_mu_sigma=True) \n", 302 | "XTrainKnn = cb.utils.knnimpute(XTrainScale, k=3) \n", 303 | "\n", 304 | "# Extract X Test Data\n", 305 | "XTestLog = np.log(XTest) \n", 306 | "XTestScale = cb.utils.scale(XTestLog, method='auto', mu=mu, sigma=sigma) \n", 307 | "XTestKnn = cb.utils.knnimpute(XTestScale, k=3)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "
\n", 315 | " \n", 316 | "\n", 317 | "

6. Hyperparameter Optimisation

\n", 318 | "\n", 319 | "

The CIMCB helper function cb.cross_val.KFold() is used to carry out k-fold cross-validation (k=5) on a set of PLS-DA models with a varying number of latent variables (1 to 6) to determine the optimal number. In k-fold cross-validation, the original dataset is randomly split into k equally sized folds; the model is then trained and tested for k iterations, each time trained on k − 1 folds and tested on the remaining fold (Kohavi 1995). This helper function requires values for the following parameters:

\n", 320 | " \n", 321 | "
    \n", 322 | "
  • model: The class of model used by the function, cb.model.PLS_SIMPLS
  • \n", 323 | "
  • X: The metabolite data matrix, XTrainKnn
  • \n", 324 | "
  • Y: The binary outcome vector, YTrain
  • \n", 325 | "
  • param_dict: a dictionary, param_dict, that describes all key:value pairs to search, with the key name corresponding to the hyperparameter in the model class and the value as the list of possible values
  • \n", 326 | "
  • folds: The number of folds in the k-fold cross validation
  • \n", 327 | "
  • n_mc: The number of Monte Carlo repetitions of the k-fold CV
  • \n", 328 | "
\n", 329 | "
\n", 330 | "
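The fold mechanics can be checked with scikit-learn's own KFold (a sketch of the splitting scheme only, not cimcb's cb.cross_val.KFold, which additionally fits the PLS models and repeats the split n_mc times):

```python
import numpy as np
from sklearn.model_selection import KFold

n = 20
held_out = np.zeros(n, dtype=int)
for train_idx, test_idx in KFold(n_splits=5, shuffle=True,
                                 random_state=0).split(np.arange(n)):
    # A model would be trained on train_idx (k-1 folds) and tested on test_idx
    held_out[test_idx] += 1
```

Over the 5 iterations every sample is held out exactly once, which is what makes the cross-validated $Q^2$ an out-of-sample statistic.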
" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "# Parameter Dictionary\n", 340 | "param_dict = {'n_components': [1, 2, 3, 4, 5, 6]} \n", 341 | "\n", 342 | "# Initialise\n", 343 | "cv = cb.cross_val.KFold(model=cb.model.PLS_SIMPLS, \n", 344 | " X=XTrainKnn, \n", 345 | " Y=YTrain, \n", 346 | " param_dict=param_dict, \n", 347 | " folds=5,\n", 348 | " n_mc=10) \n", 349 | "\n", 350 | "# Run and Plot\n", 351 | "cv.run() " 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "
\n", 359 | " \n", 360 | "\n", 361 | "

6.1. Plot R² & Q²

\n", 362 | "\n", 363 | "

When cv.plot(metric='r2q2', method='absolute') is run, 2 plots of $R^2$ and $Q^2$ statistics are displayed: (a) the absolute difference of ($R^2 - Q^2$) vs. $Q^2$, and (b) $R^2$ and $Q^2$ against the number of latent variables. Alternatively, if method='ratio', plot (a) is the absolute difference of ($R^2 - Q^2$) / $R^2$ vs. $Q^2$. The optimal number of latent variables is selected based on the point of inflection in figure (b), or, if a clear inflection point is not present, where | ($R^2 - Q^2$) | = 0.2. Note, $R^2$ is the mean coefficient of determination for the full dataset, and $Q^2$ is the mean coefficient of determination for the cross-validated predictions over the 10 Monte Carlo repetitions. The following parameters of cv.plot() can be altered:

\n", 364 | "
    \n", 365 | "
  • metric: The metric used for the plots (default = 'r2q2'). Alternative metrics include 'auc', 'acc', 'f1score', 'prec', 'sens', and 'spec'\n", 366 | "
  • method: The types of plots displayed (default = 'absolute'). Alternative value is 'ratio'\n", 367 | "
  • ci: The confidence interval in figure b (default = 95)\n", 368 | "
  • legend: to show legend (default = True). Alternative value is False\n", 369 | "
\n", 370 | "\n", 371 | "
\n", 372 | "
" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "cv.plot(metric='r2q2',\n", 382 | " method='absolute',\n", 383 | " ci=95,\n", 384 | " legend=True)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "
\n", 392 | "\n", 393 | "\n", 394 | "

6.2. Plot Latent Projections: Full & CV

\n", 395 | " \n", 396 | "

When cv.plot_projections() is run, an n x n grid of plots is displayed, where n is the number of latent variables (LV) to interrogate. These plots include score plots, distribution plots, and receiver operating characteristic (ROC) curves.

\n", 397 | "\n", 398 | "

There are C(n,2) score plots (i.e. a score plot for every combination of 2 LVs e.g. LV1 scores vs. LV2 scores). Each score plot includes the full scores (as circles) and CV scores (as crosses) coloured by group, as well as the 95% confidence interval ellipses for the full scores (as solid lines) and CV scores (as dashed lines). Additionally, the optimal line of separation (dashed grey line) and orthogonal line (solid grey line) are shown.

\n", 399 | "\n", 400 | "

There are n distribution plots (one for the scores of each LV). Each shows the distribution of the full and CV scores for each corresponding group (i.e. 4 discrete distributions overlaid for 2 groups). Each distribution is calculated using kernel density estimation, a standard non-parametric method for estimating a probability density function from a set of data points (Silverman 1986).

\n", 401 | "\n", 402 | "

There are C(n,2) ROC curves (a ROC curve for every combination of 2 LVs e.g. LV1 scores vs. LV2 scores). As the ROC curves are for every combination of 2 LVs, the discrimination is calculated based on optimal separation (i.e. the grey line from the corresponding score plot). For each ROC curve plot there is a ROC curve for the full model (green), and a ROC curve for the CV model with 95% confidence intervals (yellow). Additionally, the equal distribution line (dashed black line) is shown.

\n", 403 | "\n", 404 | "
    \n", 405 | "
  • components: LVs to plot (default = \"all\" ; plot all components). Alternatively, list the components to plot e.g. [1,3,4]
  • \n", 406 | "
  • plot: Data to show (default = 'ci' ; plot only 95% confidence interval ellipses). Alternative values include 'meanci', 'full', 'cv', and 'all'
  • \n", 407 | "
  • label: Add labels to groups (default = None ; refers to groups as 0/1)\n", 408 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'scatter', 'dist', 'roc', and 'none'.
  • \n", 409 | "
\n", 410 | "\n", 411 | "
\n", 412 | "
" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "cv.plot_projections(components=[1,2,3],\n", 422 | " plot='ci',\n", 423 | " label=ClassTrain,\n", 424 | " legend='all')" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "
\n", 432 | "\n", 433 | "\n", 434 | "

7. Build Model & Evaluate

\n", 435 | "\n", 436 | "

A PLS-DA model using cb.model.PLS_SIMPLS is created and initialised using the optimal hyperparameter values determined in step 4 (i.e. the number of latent variables). The implementation of PLS in the cb.model.PLS_SIMPLS class uses the SIMPLS algorithm (De Jong, 1993).

\n", 437 | " \n", 438 | "

Following this initialisation, the PLS-DA model is trained using the .train(X, Y) method, where the X matrix is XTrainKnn and the Y vector is YTrain, returning the predicted values YPredTrain. The model is then applied to the test set using the .test(X) method, where the X matrix is XTestKnn, returning the predicted values YPredTest (the held-out YTest is used later for evaluation).

\n", 439 | "\n", 440 | "

The .evaluate() method can be used to evaluate the predictability of the model using the train and test set. There are three plots produced when this method is run including a violin plot, probability density function, and a ROC curve. The violin plots show the predicted score for the train and test (by group). The distribution plot shows the probability density function of the predicted scores for the train and test (by group). The ROC curve shows the ROC curve for the train (green) and test (yellow). The following parameter values in .evaluate() can be altered:\n", 441 | " \n", 442 | "

    \n", 443 | "
  • testset: Plot test dataset (default = None). Alternatively, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n", 444 | "
  • label: Add labels to groups (default = None ; refer to groups as 0/1)\n", 445 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'roc', 'dist', 'violin', and 'none'
  • \n", 446 | "\n", 447 | "
    \n", 448 | "
" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "# Build Model\n", 458 | "model = cb.model.PLS_SIMPLS(n_components=2)\n", 459 | "YPredTrain = model.train(XTrainKnn, YTrain)\n", 460 | "YPredTest = model.test(XTestKnn)\n", 461 | "\n", 462 | "# Put YTrain and YPredTrain in a List\n", 463 | "EvalTrain = [YTrain, YPredTrain]\n", 464 | "\n", 465 | "# Put YTest and YPrestTest in a List\n", 466 | "EvalTest = [YTest, YPredTest]\n", 467 | "\n", 468 | "# Save Loadings & Feature Importance \n", 469 | "model_weights = model.x_weights_ # [LV1, LV2, ...]\n", 470 | "model_fi = model.feature_importance_ # [VIP, Coefficient]\n", 471 | "\n", 472 | "# Evaluate Model (include Test Dataset)\n", 473 | "model.evaluate(testset=EvalTest,\n", 474 | " label=ClassTrain,\n", 475 | " legend='all') " 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "
\n", 483 | "\n", 484 | "\n", 485 | "

8. Permutation Test

\n", 486 | "\n", 487 | "

After a model has been trained, the .permutation_test() method can be used to assess the reliability of the trained model (after selecting the number of latent variables). For the permutation test, the metabolite data matrix is randomised (permuted or 'shuffled'), while the Y (i.e. outcome) is fixed, and the model is subsequently trained and tested on this randomised data (Szymańska et al. 2012). This process is repeated (in this case, n=100) to construct a distribution against which to fairly assess the model. For a dataset with features that have no meaningful contribution, we would expect $R^2$ and $Q^2$ similar to those of a randomised dataset, while for a dataset with features with meaningful contribution, we would expect $R^2$ and $Q^2$ significantly higher than those of the randomised dataset. When .permutation_test() is run, 2 plots are displayed: (a) $R^2$ and $Q^2$ against "correlation of permuted data against original data", and (b) probability density functions for $R^2$ and $Q^2$, with the $R^2$ and $Q^2$ values found for the model trained on the original data presented as ball-and-stick. The following parameter values of .permutation_test() can be altered: \n", 488 | "\n", 489 | "

    \n", 490 | "
  • metric: The metric used for the plots (default = 'r2q2'). Alternative metrics include 'auc', 'acc', 'f1score', 'prec', 'sens', and 'spec'. Multiple metrics can be plotted using a list e.g. ['r2q2', 'auc']\n", 491 | "
  • nperm: The number of permutations. (default = 100)\n", 492 | "
  • legend: To show legend (default = True). Alternative value is False\n", 493 | "
\n", 494 | "\n", 495 | "
\n", 496 | "
" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "model.permutation_test(metric='r2q2',\n", 506 | " nperm=100,\n", 507 | " legend=True)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "
\n", 515 | "\n", 516 | "\n", 517 | "

9. Bootstrap Resampling of the Model

\n", 518 | "\n", 519 | "

Bootstrap resampling is a resampling method based on random resampling with replacement, commonly used to provide an estimate of the sampling distribution of a test statistic (Efron, 1982). In the context of this workflow, the PLS model from step 7 with its fixed hyperparameter values (i.e. number of LVs = 2) is retrained on the data resampled with replacement (in-bag) and evaluated on the unused data (out-of-bag) for 100 resamples. After the model is evaluated for each bootstrap resample, metrics including the predicted values (ypred), LV scores, LV loadings, and feature importance (VIP and coefficients) are stored and used to calculate 95% confidence intervals. To calculate the 95% confidence intervals, various methods can be used including the basic percentile method, the corrected percentile method (a.k.a. bias-corrected method), and the commonly used bias-corrected and accelerated (BCA) method. In this example, the BCA method is used with the class cb.bootstrap.BCA. Alternatively, use cb.bootstrap.Per for the percentile method, or cb.bootstrap.CPer for the corrected percentile method. To create and run the bootmodel for any method, the following parameter values need to be set:\n", 520 | " \n", 521 | "

    \n", 522 | "
  • model: A model with fixed hyperparameter values for bootstrap resampling
  • \n", 523 | "
  • bootnum: The number of bootstrap resamples (default = 100)
  • \n", 524 | "\n", 525 | "
    \n", 526 | "
" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "# Extract X Data and Train Model\n", 536 | "XBoot = DataTable2[PeakList]\n", 537 | "XBootLog = np.log(XBoot)\n", 538 | "XBootScale = cb.utils.scale(XBootLog, method='auto')\n", 539 | "XBootKnn = cb.utils.knnimpute(XBootScale, k=3)\n", 540 | "YPredBoot = model.train(XBootKnn, Y)\n", 541 | "\n", 542 | "# Build Boostrap Models\n", 543 | "bootmodel = cb.bootstrap.BCA(model, bootnum=100) \n", 544 | "bootmodel.run()" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "
\n", 552 | "\n", 553 | "\n", 554 | "

10. Model Evaluation using Bootstrap Resampling

\n", 555 | "\n", 556 | "

After the bootmodel has been run, the .evaluate() method can be used to provide an estimate of the robustness and a measure of the generalised predictability of the model. There are three plots produced when this method is run including a violin plot, probability density function, and a ROC curve. The violin plots show the distribution of the median predicted score for the in-bag and out-of-bag (i.e. train and test) by group. The distribution plot shows the probability density function of the median predicted score for the in-bag and out-of-bag (i.e. train and test) by group. The ROC curve shows the ROC curve with the median (green) and 95% CI for the in-bag (light green band) and the median (yellow) and 95% CI for the out-of-bag (light yellow band). The method used to calculate the 95% CI for the in-bag (green) is the class selected in the previous cell. In this example, the bias-corrected and accelerated method is used as cb.bootstrap.BCA was used in the previous cell to create bootmodel. \n", 557 | " \n", 558 | "

    \n", 559 | "
  • label: Add labels to groups (default = None ; refer to groups as 0/1)\n", 560 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'roc', 'dist', 'violin', and 'none'
  • \n", 561 | "
  • trainset: Plot train dataset instead of median in-bag (default = None). Alternatively, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n", 562 | "
  • testset: Plot test dataset instead of median out-of-bag (default = None). Alternatively, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n", 563 | "
    \n", 564 | "
" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "bootmodel.evaluate(label=Class,\n", 574 | " legend='all') " 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "\n", 582 | "\n", 583 | "
\n", 584 | "\n", 585 | "\n", 586 | "

11. Model Visualisation

\n", 587 | "\n", 588 | "
\n", 589 | "
" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "
\n", 597 | " \n", 598 | "\n", 599 | "

11.1 Plot Latent Projections: in-bag & out-of-bag

\n", 600 | " \n", 601 | "

After the bootmodel has been run, the .plot_projections() method can be used to visualise the latent variable (LV) scores. When this method is run, an n x n grid of plots is displayed, where n is the number of LVs. These plots include score plots, distribution plots, and receiver operating characteristic (ROC) curves.

\n", 602 | "\n", 603 | "

There are C(n,2) score plots (i.e. a score plot for every combination of 2 LVs e.g. LV1 scores vs. LV2 scores). Each score plot includes the in-bag scores (as circles) and out-of-bag scores (as crosses) coloured by group, as well as the 95% confidence interval ellipses for the in-bag scores (as solid lines) and out-of-bag scores (as dashed lines). Additionally, the optimal line of separation (dashed grey line) and orthogonal line (solid grey line) are shown.

\n", 604 | "\n", 605 | "

There are n distribution plots (one for the scores of each LV). Each shows the distribution of the in-bag and out-of-bag scores for each corresponding group (i.e. 4 discrete distributions overlaid for 2 groups). Each distribution is calculated using kernel density estimation, a standard non-parametric method for estimating a probability density function from a set of data points (Silverman 1986).

\n", 606 | "\n", 607 | "

There are C(n,2) ROC curves (a ROC curve for every combination of 2 LVs e.g. LV1 scores vs. LV2 scores). As the ROC curves are for every combination of 2 LVs, the discrimination is calculated based on optimal separation (i.e. the grey line from the corresponding score plot). For each ROC curve plot there is a ROC curve for the initial model's LV scores with 95% confidence intervals calculated from the in-bag LV scores (green), and a ROC curve for the out-of-bag LV scores with 95% confidence intervals. The method used to calculate the 95% CI for the in-bag (green) is the class used to create the bootmodel. In this example, the bias-corrected and accelerated method is used (cb.bootstrap.BCA). Additionally, the equal distribution line (dashed black line) is shown. \n", 608 | "\n", 609 | "

\n", 610 | "\n", 611 | "
    \n", 612 | "
  • plot: Data to show in plot (default = \"ci\" ; plot only 95% confidence interval ellipses). Alternative values include 'meanci', 'ib', 'oob', and 'all'
  • \n", 613 | "
  • label: Add labels to groups in scores plot (default = None ; refer to groups as 0/1).\n", 614 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'scatter', 'dist', 'roc', and 'none'
  • \n", 615 | "\n", 616 | "
    \n", 617 | "
" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "bootmodel.plot_projections(plot='ib',\n", 627 | " label=Class,\n", 628 | " legend='all')" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "bootmodel.plot_projections(plot='oob',\n", 638 | " label=Class,\n", 639 | " legend='all')" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "bootmodel.plot_projections(plot='ci',\n", 649 | " label=Class,\n", 650 | " legend='all')" 651 | ] 652 | }, 653 | { 654 | "cell_type": "markdown", 655 | "metadata": {}, 656 | "source": [ 657 | "
\n", 658 | " \n", 659 | "\n", 660 | "

11.2 Plot Weight Vectors

\n", 661 | "\n", 662 | "

After the bootmodel has been run, the .plot_weights() method can be used to visualise the latent variable (LV) weight vectors. When this method is run, n plots are displayed, where n is the number of LVs. The circles in each plot represent the LV weight vectors for the initial model. The 95% confidence intervals are calculated using the bootstrap method chosen in step 9 (in this example, the bias-corrected and accelerated (BCA) method). Any metabolite weights with a confidence interval crossing the zero line are considered non-significant to the latent variable. This method requires values for the following parameters:

\n", 663 | " \n", 664 | "
    \n", 665 | "
  • PeakTable: Cleaned PeakTable from step 3
  • \n", 666 | "
  • PeakList: Peaks to include in plot (default = None; include all peaks).\n", 667 | "
  • plot: To plot the data or median as circles (default 'data'). Alternative values include 'median', and a list structured as [LV1, LV2, etc.]\n", 668 | "
  • ylabel: Name of column in PeakTable to use as the ylabel (default = 'Label')\n", 669 | "
  • sort: Whether to sort plots in absolute descending order (default = True)
  • \n", 670 | "
\n", 671 | " \n", 672 | "
\n", 673 | "
" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "bootmodel.plot_weights(PeakTable,\n", 683 | " PeakList,\n", 684 | " plot='median',\n", 685 | " ylabel='Label', \n", 686 | " sort=False) " 687 | ] 688 | }, 689 | { 690 | "cell_type": "markdown", 691 | "metadata": {}, 692 | "source": [ 693 | "
\n", 694 | "\n", 695 | "\n", 696 | "

12. Variable Contribution Plots

\n", 697 | "\n", 698 | "

After the bootmodel has been run, the .plot_featureimportance() method can be used to visualise the feature importance metrics. When this method is run, 2 plots are displayed: the Coefficient plot and the Variable Importance in Projection (VIP) plot. The circles represent the values in the initial model. The 95% confidence intervals are calculated using the bootstrap method chosen in step 9 (in this example, the bias-corrected and accelerated (BCA) method).

\n", 699 | " \n", 700 | "

The coefficients (in the Coefficient plot) contain information about the overall contribution of each metabolite. The coefficient values can be either positive or negative, and therefore contribute positively or negatively to the model. Any metabolite coefficient value with a confidence interval crossing the zero line is considered non-significant to the model.

\n", 701 | " \n", 702 | "

The values in the VIP plot contain information about the overall contribution of each metabolite. Unlike the coefficient values, the VIP is absolute, with the higher values representing a higher significance to the model. Typically, metabolites with a VIP greater than 1 are considered \"important\" in the model.

\n", 703 | " \n", 704 | "

This method also exports the feature importance metrics from bootmodel as a pandas DataFrame (table). It requires values for the following parameters:

\n", 705 | " \n", 706 | "
    \n", 707 | "
  • PeakTable: Cleaned PeakTable from step 3
  • \n", 708 | "
  • PeakList: Peaks to include in plot (default = None; include all peaks).\n", 709 | "
  • plot: To plot the data or median as circles (default 'data'). Alternative values include 'median', and a list structured as [Coef, VIP].\n", 710 | "
  • ylabel: Name of column in PeakTable to use as the ylabel (default = 'Label')\n", 711 | "
  • sort: Whether to sort plots in absolute descending order (default = True)
  • \n", 712 | "
\n", 713 | " \n", 714 | "
\n", 715 | "
" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "feature_importance = bootmodel.plot_featureimportance(PeakTable,\n", 725 | " PeakList,\n", 726 | " plot='median',\n", 727 | " ylabel='Label', \n", 728 | " sort=False) " 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "
\n", 736 | "\n", 737 | "\n", 738 | "

13. Export Results

\n", 739 | "\n", 740 | "

The feature importance table created in step 12 can be exported using the inbuilt .to_excel() function within a pandas DataFrame. This function requires an input with the name of the file to create, and it can include directories by using the ‘ / ’ symbol. In the cell below, the table feature_importance is exported as an Excel file called 'PLSDA_ST001047.xlsx' in the 'results' folder.

\n", 741 | "\n", 742 | "

" 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": null, 748 | "metadata": {}, 749 | "outputs": [], 750 | "source": [ 751 | "export_folder = 'results/'\n", 752 | "export_file = 'PLSDA_ST001047.xlsx'\n", 753 | "\n", 754 | "feature_importance.to_excel(export_folder + export_file)\n", 755 | "print(\"Done!\")" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": null, 761 | "metadata": {}, 762 | "outputs": [], 763 | "source": [] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [] 771 | } 772 | ], 773 | "metadata": { 774 | "kernelspec": { 775 | "display_name": "Python 3", 776 | "language": "python", 777 | "name": "python3" 778 | }, 779 | "language_info": { 780 | "codemirror_mode": { 781 | "name": "ipython", 782 | "version": 3 783 | }, 784 | "file_extension": ".py", 785 | "mimetype": "text/x-python", 786 | "name": "python", 787 | "nbconvert_exporter": "python", 788 | "pygments_lexer": "ipython3", 789 | "version": "3.7.3" 790 | }, 791 | "toc": { 792 | "base_numbering": 1, 793 | "nav_menu": {}, 794 | "number_sections": false, 795 | "sideBar": false, 796 | "skip_h1_title": false, 797 | "title_cell": "Table of Contents", 798 | "title_sidebar": "Contents", 799 | "toc_cell": false, 800 | "toc_position": {}, 801 | "toc_section_display": false, 802 | "toc_window_display": false 803 | }, 804 | "toc-autonumbering": false, 805 | "toc-showmarkdowntxt": false 806 | }, 807 | "nbformat": 4, 808 | "nbformat_minor": 4 809 | } 810 | -------------------------------------------------------------------------------- /notebooks/ANNSigSig_MTBLS90.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc-hr-collapsed": false 7 | }, 8 | "source": [ 9 | "
\n", 10 | " To begin: Click anywhere in this cell and press Run on the menu bar. This executes the current cell and then highlights the next cell. There are two types of cells. A text cell and a code cell. When you Run a text cell (we are in a text cell now), you advance to the next cell without executing any code. When you Run a code cell (identified by In[ ]: to the left of the cell) you advance to the next cell after executing all the Python code within that cell. Any visual results produced by the code (text/figures) are reported directly below that cell. Press Run again. Repeat this process until the end of the notebook. NOTE: All the cells in this notebook can be automatically executed sequentially by clicking KernelRestart and Run All. Should anything crash then restart the Jupyter Kernal by clicking KernelRestart, and start again from the top.\n", 11 | " \n", 12 | "
" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "
\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "\n", 24 | "

Metabolomics Data Visualisation Workflow for ANN-SS

\n", 25 | "\n", 26 | "
\n", 27 | "
\n", 28 | "
\n", 29 | "

This Jupyter Notebook describes a metabolomics data analysis and visualisation workflow for a two-layer artificial neural network, with layer 1 consisting of multiple neurons (n = 2 to 6) with sigmoidal activation functions, and layer 2 (the output layer) consisting of a single neuron with a sigmoidal activation function (ANN-SS), for a binary classification outcome.

\n", 30 | "\n", 31 | "

This computational workflow is described using a previously published NMR dataset by Ganna et al. (2014) and Ganna et al. (2015). The study compared the plasma metabolomic profiles of men (n=485) and women (n=483) at age 70 living in Uppsala, Sweden, in a large prospective epidemiological study. For the purpose of this computational workflow, we compare only the males (Class=1) and females (Class=0) in a binary discriminant analysis. The deconvolved and annotated data from this study is deposited on MetaboLights, and can be accessed directly via its Study ID: MTBLS90. The Excel file used in this workflow can be accessed via the following link: MTBLS90.xlsx.

\n", 32 | "\n", 33 | "

This computational workflow requires a dataset to be in, or converted to, a previously described standardised Excel file format (Mendez et al. 2019). This format uses the Tidy Data Framework (Wickham, 2014), where each row represents an observation (e.g. sample) and each column represents a variable (e.g. age or metabolite). Each Excel file (one per study) contains two sheets: a data sheet and a peak sheet. The data sheet contains the metabolite concentrations together with the metadata associated with each observation (requiring the inclusion of the columns: Idx, SampleID, and Class). The peak sheet contains the additional metadata that pertains to the metabolites in the data sheet (requiring the inclusion of the columns: Idx, Name, and Label). The standardisation of this format allows for the efficient re-use of this computational workflow.

\n", 34 | "\n", 35 | "
\n", 36 | "The steps included in this data analysis and visualisation workflow are: \n", 37 | "
\n", 38 | "\n", 39 | "1. Import Packages
\n", 40 | "2. Load Data & Peak Sheet
\n", 41 | "3. Extract X & Y
\n", 42 | "4. Split Data into Train & Test Set
\n", 43 | "5. Extract, Transform, & Scale X Data with Missing Values Imputed
\n", 44 | "6. Hyperparameter Optimisation
\n", 45 | " 6.1. Plot R² & Q²
\n", 46 | " 6.2. Plot Latent Projections: Full & CV
\n", 47 | "7. Build Model & Evaluate
\n", 48 | "8. Permutation Test
\n", 49 | "9. Bootstrap Resampling of the Model
\n", 50 | "10. Model Evaluation using Bootstrap Resampling
\n", 51 | "11. Model Visualisation
\n", 52 | " 11.1. Plot Latent Projections: in-bag & out-of-bag
\n", 53 | " 11.2. Plot Weight Vectors
\n", 54 | "12. Variable Contribution Plots
\n", 55 | "13. Export Results
\n", 56 | "\n", 57 | "
" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "toc-hr-collapsed": true 64 | }, 65 | "source": [ 66 | "
\n", 67 | " \n", 68 | "\n", 69 | "

1. Import Packages

\n", 70 | "\n", 71 | "

Packages provide additional tools that extend beyond the basic functionality of the Python programming language. Prior to usage, packages need to be imported into the Jupyter environment. The following packages need to be imported for this computational workflow:

\n", 72 | "\n", 73 | "
    \n", 74 | "
  • numpy: A standard package primarily used for the manipulation of arrays
  • \n", 75 | "\n", 76 | "
  • pandas: A standard package primarily used for the manipulation of data tables
  • \n", 77 | "\n", 78 | "
  • cimcb: A library of helpful functions and tools provided by the authors
  • \n", 79 | "\n", 80 | "
  • sklearn: A standard package with tools for machine learning\n", 81 | "\n", 82 | "
      \n", 83 | "
    • train_test_split: A method to split arrays into training and test subsets
    \n", 84 | "\n", 85 | "
  • \n", 86 | "\n", 87 | "
\n", 88 | "\n", 89 | "
\n", 90 | "\n", 91 | "
" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "import numpy as np\n", 101 | "import pandas as pd\n", 102 | "import cimcb as cb\n", 103 | "from sklearn.model_selection import train_test_split\n", 104 | "\n", 105 | "print('All packages successfully loaded')" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "
\n", 113 | " \n", 114 | "\n", 115 | "

Optional: Set Random Seed for Splitting Data into Training & Test sets

\n", 116 | "\n", 117 | "

To reproduce the figures in the research article, set the random seed to 8. This seed is used in the train_test_split method to reproducibly split the source data into a training and test set.

\n", 118 | "\n", 119 | "
    \n", 120 | "
  • seed_split: Seed the generator using an integer value e.g. 42 (default = None ; no seed set)

  • \n", 121 | "
\n", 122 | "
\n", 123 | "\n", 124 | " \n", 125 | "\n", 126 | "

Optional: Set Random Seed for Weight Initialisation

\n", 127 | "\n", 128 | "

To reproduce the figures in the research article, set the random seed to 8. When a neural network is first compiled, the weights are initialised. By default in Keras, the Glorot normal initializer (a.k.a. Xavier normal initializer) is used, where the weights are randomly drawn from a truncated normal distribution. The seed is used to set reproducible initial weights from this distribution.

\n", 129 | "\n", 130 | "
    \n", 131 | "
  • seed_init: seed the generator using an integer value e.g. 42 (default = None ; no seed set)

  • \n", 132 | "
\n", 133 | "
\n", 134 | "\n", 135 | "
" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "seed_split = 8\n", 145 | "seed_init = 8\n", 146 | "# seed_split = None\n", 147 | "# seed_init = None" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "
\n", 155 | "\n", 156 | "\n", 157 | "

2. Load Data & Peak Sheet

\n", 158 | "\n", 159 | "

The CIMCB helper function load_dataXL() loads the Data and Peak sheets from an Excel file. In addition, this helper function checks that the data is in the standardised Excel file format described above. After the initial checks, load_dataXL() outputs two individual Pandas DataFrames (i.e. tables) called DataTable and PeakTable from the Excel file MTBLS90.xlsx. This helper function requires values for the following parameters:

\n", 160 | "
    \n", 161 | "
  • filename: The name of the excel file (.xlsx file)
  • \n", 162 | "
  • DataSheet: The name of the data sheet in the file
  • \n", 163 | "
  • PeakSheet: The name of the peak sheet in the file
  • \n", 164 | "
\n", 165 | "
\n", 166 | "\n", 167 | "
" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "home = 'data/'\n", 177 | "file = 'MTBLS90.xlsx' \n", 178 | "\n", 179 | "DataTable,PeakTable = cb.utils.load_dataXL(filename=home + file, DataSheet='Data', PeakSheet='Peak')" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "
\n", 187 | "\n", 188 | "\n", 189 | "

3. Extract X & Y

\n", 190 | "\n", 191 | "

Prior to performing any statistical or machine learning modelling, it is best practice to assess the quality of the data and remove metabolites that lack reproducible measurements (Broadhurst et al. 2018). \n", 192 | "\n", 193 | "
\n", 194 | "

The following steps are needed to extract the X matrix of metabolite concentrations and associated Y vector of classification labels (“M”=1 and “F”=0):\n", 195 | " \n", 196 | "

    \n", 197 | " \n", 198 | "
  • Create a subset of DataTable called DataTable2, with samples only in the Class “M” or “F”
  • \n", 199 | " \n", 200 | "\n", 201 | "
  • Create the variable PeakList to hold the names (M1...Mn) of the metabolites to be used
  • \n", 202 | "\n", 203 | "
  • Using this PeakList, extract all corresponding columns (i.e. metabolite data) from DataTable2, and place it in matrix X
  • \n", 204 | "\n", 205 | "
  • Set Y to a list (or 1D array) of binary outcomes based on the Class column from DataTable2 (“M”=1 and “F”=0)
  • \n", 206 | "\n", 207 | "
\n", 208 | "\n", 209 | "
\n", 210 | "
" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "# Extract PeakList\n", 220 | "PeakList = PeakTable['Name'] \n", 221 | "\n", 222 | "# Select Subset of Data (Class \"GC\" or \"HE\" only)\n", 223 | "DataTable2 = DataTable[(DataTable.Class == 1) | (DataTable.Class == 0)]\n", 224 | "\n", 225 | "# Extract X Data\n", 226 | "X = DataTable2[PeakList] \n", 227 | "\n", 228 | "# Create a Binary Y Vector \n", 229 | "Outcomes = DataTable2['Class'] \n", 230 | "Y = np.array(Outcomes) \n", 231 | "\n", 232 | "# Optional: Save Class Labels (M/F) for Figure Legends\n", 233 | "Class = DataTable2.Sex" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "
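The subsetting and binary-outcome extraction steps above can be sketched with a toy DataFrame. The miniature DataTable below is hypothetical (in MTBLS90 the Class column is already coded 0/1, so a string-to-binary mapping is shown purely for illustration):

```python
import numpy as np
import pandas as pd

# Hypothetical miniature DataTable with the columns described above
DataTable = pd.DataFrame({
    "SampleID": ["s1", "s2", "s3", "s4"],
    "Class":    ["M", "F", "QC", "F"],   # rows outside "M"/"F" are dropped
    "M1": [0.4, 0.7, 0.5, 0.9],
    "M2": [1.1, 0.8, 1.0, 0.6],
})
PeakList = ["M1", "M2"]

DataTable2 = DataTable[DataTable.Class.isin(["M", "F"])]   # subset the classes
X = DataTable2[PeakList]                                   # metabolite matrix
Y = np.where(DataTable2["Class"] == "M", 1, 0)             # binary outcome vector
print(Y.tolist())  # [1, 0, 0]
```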
\n", 241 | "\n", 242 | "\n", 243 | "

4. Split Data into Train & Test Set

\n", 244 | "\n", 245 | "\n", 246 | "

The train_test_split method is used to split the X and Y data into training (2/3) and test (1/3) sets using stratified random selection. Additionally, the Class data is split for use in figure legends. The seed is selected in the optional section above. For further information on this method, refer to the scikit-learn documentation.\n", 247 | "\n", 248 | "
\n", 249 | "

" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "# Split Data into Train (2/3rd) and Test (1/3rd)\n", 259 | "XTrain, XTest, YTrain, YTest, ClassTrain, ClassTest = train_test_split(X,\n", 260 | " Y,\n", 261 | " Class,\n", 262 | " test_size=1/3,\n", 263 | " stratify=Y,\n", 264 | " random_state=seed_split)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "
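Stratified selection samples the test fraction within each class, so the 0/1 balance is preserved in both subsets. A minimal numpy sketch of the idea (illustrative only, not the scikit-learn implementation):

```python
import numpy as np

def stratified_split(y, test_frac=1/3, seed=8):
    """Sketch of stratified random selection: sample test_frac within each class
    so the class balance is preserved in both subsets."""
    rng = np.random.default_rng(seed)
    y = np.asarray(y)
    test_idx = []
    for cls in np.unique(y):
        members = np.flatnonzero(y == cls)
        n_test = round(len(members) * test_frac)
        test_idx.extend(rng.choice(members, size=n_test, replace=False))
    test_mask = np.zeros(len(y), dtype=bool)
    test_mask[test_idx] = True
    return np.flatnonzero(~test_mask), np.flatnonzero(test_mask)

y = np.array([1] * 24 + [0] * 18)       # hypothetical outcome vector
train, test = stratified_split(y)
print(y[train].mean(), y[test].mean())  # identical class proportions
```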
\n", 272 | " \n", 273 | "\n", 274 | "

5. Extract, Transform, & Scale X Data with Missing Values Imputed

\n", 275 | "\n", 276 | "

The X Data (XTrain and XTest) is log-transformed, mean centred, and scaled to unit variance (with missing values imputed using k-nearest neighbours) prior to modelling, following standard protocols for metabolomics (Broadhurst and Kell, 2006).

\n", 277 | "
    \n", 278 | " \n", 279 | "
  • Log-transform the values in XTrain
  • \n", 280 | "\n", 281 | "
  • Using the helper function cb.utils.scale(), scale the log-transformed data (XTrainLog) to unit variance (a.k.a. auto scaling), while also returning mu & sigma.
  • \n", 282 | "\n", 283 | "
  • Impute the missing values by using a k-nearest neighbour approach (with three neighbours) using the helper function cb.utils.knnimpute() to give the final matrix, XTrainKnn
  • \n", 284 | "\n", 285 | "
  • Log-transform the values in XTest
  • \n", 286 | "\n", 287 | "
  • Using the helper function cb.utils.scale(), scale the log-transformed data (XTestLog) to unit variance (a.k.a. auto scaling) using the mu & sigma from above.\n", 288 | " \n", 289 | "
  • Impute the missing values by using a k-nearest neighbour approach (with three neighbours) using the helper function cb.utils.knnimpute() to give the final matrix, XTestKnn
  • \n", 290 | " \n", 291 | "
\n", 292 | "\n", 293 | "
" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# Extract X Train Data \n", 303 | "# XTrainLog = np.log(XTrain) \n", 304 | "XTrainScale, mu, sigma = cb.utils.scale(XTrain, method='auto', return_mu_sigma=True) \n", 305 | "XTrainKnn = cb.utils.knnimpute(XTrainScale, k=3) \n", 306 | "\n", 307 | "# Extract X Test Data\n", 308 | "# XTestLog = np.log(XTest) \n", 309 | "XTestScale = cb.utils.scale(XTest, method='auto', mu=mu, sigma=sigma) \n", 310 | "XTestKnn = cb.utils.knnimpute(XTestScale, k=3)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "
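The key point of the scaling step can be shown in plain numpy: mu and sigma are estimated on the training data only and then reused for the test data. This is a hedged sketch of the idea, not the cimcb implementation:

```python
import numpy as np

def auto_scale(X, mu=None, sigma=None):
    """Unit-variance (auto) scaling; train-set mu/sigma are reused for the test set."""
    X = np.asarray(X, dtype=float)
    if mu is None:
        mu, sigma = np.nanmean(X, axis=0), np.nanstd(X, axis=0)
    return (X - mu) / sigma, mu, sigma

XTrain = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
XTest  = np.array([[2.0, 20.0]])

XTrainScale, mu, sigma = auto_scale(XTrain)
XTestScale, _, _ = auto_scale(XTest, mu, sigma)   # note: train statistics, not test
print(XTestScale)  # [[0. 0.]]
```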
\n", 318 | " \n", 319 | "\n", 320 | "

6. Hyperparameter Optimisation

\n", 321 | "\n", 322 | "

The CIMCB helper function cb.cross_val.KFold() is used to carry out k-fold cross-validation (k=5) on a set of ANN-SS models with a varying number of neurons (2 to 6) and learning rate (0.008 to 0.03) to determine the optimal hyperparameter values. In k-fold cross-validation, the original dataset is randomly split into k equally sized folds and the model is then trained for k iterations, each time training on k − 1 folds and testing on the remaining fold (Kohavi 1995). This helper function requires values for the following parameters:

\n", 323 | " \n", 324 | "
    \n", 325 | "
  • model: the class of model used by the function, cb.model.NN_SigmoidSigmoid
  • \n", 326 | "
  • X: The metabolite data matrix, XTrainKnn
  • \n", 327 | "
  • Y: The binary outcome vector, YTrain
  • \n", 328 | "
  • param_dict: a dictionary, param_dict, that describes all key:value pairs to search, with the key name corresponding to the hyperparameter in the model class and the value as the list of possible values
  • \n", 329 | "
  • folds: the number of folds in the k-fold cross validation
  • \n", 330 | "
  • n_mc: the number of Monte Carlo repetitions of the k-fold CV
  • \n", 331 | "
\n", 332 | "
\n", 333 | "
" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "scrolled": false 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "# Parameter Dictionary\n", 345 | "#lr = [0.001,0.005,0.01,0.05,0.1,1]\n", 346 | "lr = [0.008, 0.009, 0.01, 0.02, 0.03]\n", 347 | "neurons = [2, 3, 4, 5, 6]\n", 348 | "\n", 349 | "param_dict = dict(learning_rate=lr,\n", 350 | " n_neurons=neurons,\n", 351 | " epochs=1000,\n", 352 | " momentum=0.5,\n", 353 | " decay=0,\n", 354 | " loss='binary_crossentropy',\n", 355 | " seed=seed_init)\n", 356 | "\n", 357 | "# Initialise\n", 358 | "cv = cb.cross_val.KFold(model=cb.model.NN_SigmoidSigmoid, \n", 359 | " X=XTrainKnn, \n", 360 | " Y=YTrain, \n", 361 | " param_dict=param_dict, \n", 362 | " folds=5,\n", 363 | " n_mc=10) \n", 364 | "\n", 365 | "# Run \n", 366 | "cv.run() " 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "
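The fold logic described above can be sketched in numpy: each fold is held out once while the model trains on the remaining k − 1 folds (illustrative only; cb.cross_val.KFold additionally repeats this over n_mc Monte Carlo resamples):

```python
import numpy as np

def kfold_indices(n, k=5, seed=8):
    """Each fold is held out once for testing; the other k - 1 folds train the model."""
    rng = np.random.default_rng(seed)
    folds = np.array_split(rng.permutation(n), k)
    for i in range(k):
        train = np.concatenate([f for j, f in enumerate(folds) if j != i])
        yield train, folds[i]

splits = list(kfold_indices(20, k=5))
# every sample is tested exactly once across the 5 folds
tested = np.sort(np.concatenate([test for _, test in splits]))
print(len(splits), tested.tolist() == list(range(20)))  # 5 True
```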
\n", 374 | " \n", 375 | "\n", 376 | "

6.1. Plot R² & Q²

\n", 377 | "\n", 378 | "

When cv.plot(metric='r2q2', method='absolute') is run, 6 plots of $R^2$ and $Q^2$ statistics are displayed: (a) heatmap of $R^2$, (b) heatmap of $Q^2$, (c) heatmap of 1 - | ($R^2 - Q^2$) |, (d) | ($R^2 - Q^2$) | vs. $Q^2$, (e) $R^2$ and $Q^2$ against the learning rate, and (f) $R^2$ and $Q^2$ against the number of neurons. Alternatively, if method='ratio', | ($R^2 - Q^2$) / $R^2$ | is used instead of | ($R^2 - Q^2$) |. The optimal hyperparameter values are selected based on the point of inflection in figure (b), or, if a clear inflection point is not present, where | ($R^2 - Q^2$) | = 0.2. Note, the $R^2$ is the mean coefficient of determination for the full dataset, and the $Q^2$ is the mean coefficient of determination for the cross-validated predictions, over the Monte Carlo repetitions. When cv.plot(metric='auc') is run, the predictability of the model is presented as the area under the ROC curve (AUC), $AUC(full)$ & $AUC(cv)$, a non-parametric alternative to $R^2$ & $Q^2$. The following parameters of cv.plot() can be altered:

\n", 379 | "\n", 380 | "
    \n", 381 | "
  • metric: The metric used for the plots (default = 'r2q2'). Alternative metrics include 'auc', 'acc', 'f1score', 'prec', 'sens', and 'spec'\n", 382 | "
  • method: The types of plots displayed (default = 'absolute'). Alternative value is 'ratio'\n", 383 | "
  • ci: The confidence interval in figure e & f (default = 95)\n", 384 | "
\n", 385 | "\n", 386 | "
\n", 387 | "
" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "cv.plot(metric='auc', method='absolute', ci=95)\n", 397 | "cv.plot(metric='r2q2', method='absolute', ci=95)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "
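$R^2$ and $Q^2$ share the same formula, $1 - SS_{res}/SS_{tot}$; the difference is only whether the predictions come from the full model or from cross-validation. A small numpy sketch with hypothetical predictions:

```python
import numpy as np

def r_squared(y_true, y_pred):
    """Coefficient of determination: 1 - SS_res / SS_tot."""
    y_true, y_pred = np.asarray(y_true, float), np.asarray(y_pred, float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)
    return 1.0 - ss_res / ss_tot

y      = np.array([1, 1, 0, 0])
y_full = np.array([0.9, 0.8, 0.2, 0.1])   # hypothetical full-model predictions
y_cv   = np.array([0.8, 0.6, 0.4, 0.2])   # hypothetical cross-validated predictions
print(round(r_squared(y, y_full), 3), round(r_squared(y, y_cv), 3))  # 0.9 0.6
```

The CV predictions fit less well than the full-model predictions, so $Q^2$ is lower than $R^2$, exactly the gap the | ($R^2 - Q^2$) | heuristic monitors.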
\n", 405 | "\n", 406 | "\n", 407 | "

6.2. Plot Latent Projections: Full & CV

\n", 408 | " \n", 409 | "

When cv.plot_projections() is run, an n x n grid of plots is displayed, where n is the number of neurons in the hidden layer to interrogate. These plots include score plots, distribution plots, and receiver operating characteristic (ROC) curves.

\n", 410 | "\n", 411 | "

There are C(n,2) score plots (i.e. a score plot for every combination of 2 neurons e.g. neuron 1 scores vs. neuron 2 scores). Each score plot includes the full scores (as circles) and CV scores (as crosses) coloured by group, as well as the 95% confidence interval ellipses for the full scores (as solid lines) and CV scores (as dashed lines). Additionally, the optimal line of separation (dashed grey line) and orthogonal line (solid grey line) are shown.

\n", 412 | "\n", 413 | "

There are n distribution plots (a distribution plot for each neuron's scores). Each shows the distribution of the full and CV scores for each corresponding group (i.e. 4 discrete distributions overlaid for 2 groups). Each distribution is calculated using kernel density estimation, a standard non-parametric method used for estimation of a probability density function based on a set of data points (Silverman 1986).

\n", 414 | "\n", 415 | "

There are C(n,2) ROC curves (a ROC curve for every combination of 2 neurons e.g. neuron 1 scores vs. neuron 2 scores). As the ROC curves are for every combination of 2 neurons, the discrimination is calculated based on optimal separation (i.e. the grey line from the corresponding score plot). For each ROC curve plot there is a ROC curve for the full model (green), and ROC curve for the cv model with 95% confidence intervals (yellow). Additionally, the equal distribution line (dashed black line) is shown.

\n", 416 | "\n", 417 | "
    \n", 418 | "
  • **optional_arguments: optional arguments to specify model hyperparameters if they are changed in this search e.g. learning_rate=0.02 (except number of components). By default, the max value of each hyperparameter is used (unless specified).
  • \n", 419 | "
  • components: Neurons to plot (default = \"all\" ; plot all components). Alternatively, list the components to plot e.g. [1,3,4]
  • \n", 420 | "
  • plot: Data to show (default = 'ci' ; plot only 95% confidence interval ellipses). Alternative values include 'meanci', 'full', 'cv', and 'all'
  • \n", 421 | "
  • label: Add labels to groups in scores plot (default = None ; refers to groups as 0/1)\n", 422 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'scatter', 'dist', 'roc', and 'none'
  • \n", 423 | "
\n", 424 | "\n", 425 | "
\n", 426 | "
" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "cv.plot_projections(learning_rate=0.02,\n", 436 | " components=[1,2,3,4], \n", 437 | " plot=\"ci\",\n", 438 | " label=ClassTrain,\n", 439 | " legend=\"all\")" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "cv.plot_projections(learning_rate=0.02,\n", 449 | " components=[1,2,3,4], \n", 450 | " plot=\"meanci\",\n", 451 | " label=ClassTrain,\n", 452 | " legend=\"all\")" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "
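The number of pairwise plots grows as C(n,2); for the four neurons plotted above that gives six score plots and six ROC curves, as a quick check with itertools shows:

```python
from itertools import combinations

n_neurons = 4
# one score plot (and one ROC curve) per unordered pair of neurons
pairs = list(combinations(range(1, n_neurons + 1), 2))
print(len(pairs), pairs[:3])  # 6 [(1, 2), (1, 3), (1, 4)]
```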
\n", 460 | "\n", 461 | "\n", 462 | "

7. Build Model & Evaluate

\n", 463 | "\n", 464 | "

An ANN-SS model is created with cb.model.NN_SigmoidSigmoid and initialised using the optimal hyperparameter values determined in step 6. The ANN-SS implementation in the cb.model.NN_SigmoidSigmoid class uses Keras with a Theano backend.

\n", 465 | "\n", 466 | "

Following this initialisation, the ANN-SS model is trained using the .train(X, Y) method, where the X matrix is XTrainKnn and the Y vector is YTrain, returning the predicted values YPredTrain. This model is then tested using the .test(X) method, where the X matrix is XTestKnn, returning the predicted values YPredTest.

\n", 467 | "\n", 468 | "

The .evaluate() method can be used to evaluate the predictability of the model using the train and test sets. Three plots are produced when this method is run: a violin plot, a probability density function, and a ROC curve. The violin plot shows the predicted scores for the train and test sets (by group). The distribution plot shows the probability density function of the predicted scores for the train and test sets (by group). The ROC curve plot shows the ROC curve for the train (green) and test (yellow) sets. The following parameter values in .evaluate() can be altered:\n", 469 | " \n", 470 | "

    \n", 471 | "
  • testset: Plot test dataset (default = None). Alternative, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n", 472 | "
  • label: Add labels to groups (default = None ; refer to groups as 0/1)\n", 473 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'roc', 'dist', 'violin', and 'none'
  • \n", 474 | "\n", 475 | "
    \n", 476 | "
" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": { 483 | "scrolled": false 484 | }, 485 | "outputs": [], 486 | "source": [ 487 | "# Build Model\n", 488 | "model = cb.model.NN_SigmoidSigmoid(learning_rate=0.02, \n", 489 | " n_neurons=3,\n", 490 | " epochs=1000,\n", 491 | " momentum=0.5, \n", 492 | " decay=0, \n", 493 | " loss='binary_crossentropy',\n", 494 | " seed=seed_init)\n", 495 | "YPredTrain = model.train(XTrainKnn, YTrain)\n", 496 | "YPredTest = model.test(XTestKnn)\n", 497 | "\n", 498 | "# Put YTrain and YPredTrain in a List\n", 499 | "EvalTrain = [YTrain, YPredTrain]\n", 500 | "\n", 501 | "# Put YTest and YPrestTest in a List\n", 502 | "EvalTest = [YTest, YPredTest]\n", 503 | "\n", 504 | "# Save Weights & Feature Importance \n", 505 | "model_weights = model.x_weights_ # [N1, N2, ...]\n", 506 | "model_fi = model.feature_importance_ # [CW, GA]\n", 507 | "\n", 508 | "# Evaluate Model (include Test Dataset)\n", 509 | "model.evaluate(testset=EvalTest,\n", 510 | " label=ClassTrain,\n", 511 | " legend='all')" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "
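The ANN-SS architecture itself is small enough to sketch as a forward pass in numpy: a sigmoidal hidden layer producing the neuron scores, feeding a single sigmoidal output neuron. This is an illustrative sketch with random weights, not the Keras/Theano implementation used by cb.model.NN_SigmoidSigmoid:

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def ann_ss_forward(X, W1, b1, w2, b2):
    """Two-layer ANN-SS: sigmoidal hidden layer -> single sigmoidal output neuron."""
    neuron_scores = sigmoid(X @ W1 + b1)      # hidden-layer projections
    return sigmoid(neuron_scores @ w2 + b2)   # predicted probability in (0, 1)

rng = np.random.default_rng(8)
X  = rng.normal(size=(5, 10))                     # 5 samples x 10 metabolites
W1 = rng.normal(size=(10, 3)); b1 = np.zeros(3)   # 3 hidden neurons
w2 = rng.normal(size=3);       b2 = 0.0

y_pred = ann_ss_forward(X, W1, b1, w2, b2)
print(y_pred.shape)  # (5,)
```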
\n", 519 | "\n", 520 | "\n", 521 | "

8. Permutation Test

\n", 522 | "\n", 523 | "

After a model has been trained, the .permutation_test() method can be used to assess the reliability of the trained model (with its fixed hyperparameter values). For the permutation test, the metabolite data matrix is randomised (permuted or 'shuffled'), while the Y (i.e. outcome) is fixed, and the model is subsequently trained and tested on this randomised data (Szymańska et al. 2012). This process is repeated (in this case, n=100) to construct a distribution against which to fairly assess the model. For a dataset with features that have no meaningful contribution, we would expect $R^2$ and $Q^2$ values similar to those of the randomised dataset, while for a dataset with features with meaningful contribution, we would expect $R^2$ and $Q^2$ values significantly higher than those of the randomised dataset. When .permutation_test() is run, 2 plots are displayed: (a) $R^2$ and $Q^2$ against the "correlation of permuted data against original data", and (b) probability density functions for $R^2$ and $Q^2$, with the $R^2$ and $Q^2$ values found for the model trained on the original data presented as ball-and-stick. The following parameter values of .permutation_test() can be altered: \n", 524 | "\n", 525 | "

    \n", 526 | "
  • metric: The metric used for the plots (default = 'r2q2'). Alternative metrics include 'auc', 'acc', 'f1score', 'prec', 'sens', and 'spec'. Multiple metrics can be plotted using a list e.g. ['r2q2', 'auc']\n", 527 | "
  • nperm: The number of permutations. (default = 100)\n", 528 | "
  • legend: To show legend (default = True). Alternative value is False\n", 529 | "
\n", 530 | "\n", 531 | "
\n", 532 | "
" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": { 539 | "scrolled": false 540 | }, 541 | "outputs": [], 542 | "source": [ 543 | "model.permutation_test(metric=['r2q2', 'auc'],\n", 544 | " nperm=100,\n", 545 | " legend=True)" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "metadata": {}, 551 | "source": [ 552 | "
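The logic of the permutation test can be sketched in numpy: shuffle the outcome relative to the data (equivalent to permuting the rows of X), recompute the metric each time, and compare the observed value against the null distribution. The toy metric and data below are hypothetical:

```python
import numpy as np

def permutation_pvalue(metric, X, y, n_perm=100, seed=8):
    """Compare the observed metric to its distribution under shuffled outcomes."""
    rng = np.random.default_rng(seed)
    observed = metric(X, y)
    null = np.array([metric(X, rng.permutation(y)) for _ in range(n_perm)])
    return observed, (np.sum(null >= observed) + 1) / (n_perm + 1)

# Toy metric: absolute correlation between the first feature and the outcome
metric = lambda X, y: abs(np.corrcoef(X[:, 0], y)[0, 1])

rng = np.random.default_rng(1)
y = np.repeat([0, 1], 30)
X = rng.normal(size=(60, 5))
X[:, 0] += 2 * y                 # feature 1 genuinely separates the two groups

obs, p = permutation_pvalue(metric, X, y)
print(obs, p)                    # high observed metric, small p-value
```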
\n", 553 | "\n", 554 | "\n", 555 | "

9. Bootstrap Resampling of the Model

\n", 556 | "\n", 557 | "

Bootstrap resampling is a resampling method based on random resampling with replacement, commonly used to provide an estimate of the sampling distribution of a test statistic (Efron, 1982). In the context of this workflow, the ANN-SS model from step 7 with its fixed hyperparameter values (i.e. n_neurons=3 and learning_rate=0.02) is retrained on the data resampled with replacement (in-bag) and evaluated on the unused data (out-of-bag) for 100 resamples. After the model is evaluated for each bootstrap, metrics including the predicted values (ypred), neuron scores, and feature importance (Connection Weight and Garson's Algorithm) are stored and used to calculate 95% confidence intervals. To calculate the 95% confidence intervals, various methods can be used, including the basic percentile method, the corrected percentile method (a.k.a. bias-corrected method), and the commonly used bias-corrected and accelerated (BCA) method. In this example, the BCA method is used with the class cb.bootstrap.BCA. Alternatively, use cb.bootstrap.Per for the percentile method, or cb.bootstrap.CPer for the corrected percentile method. To create and run the bootmodel for any method, the following parameter values need to be set:\n", 558 | " \n", 559 | "

    \n", 560 | "
  • model: A model with fixed hyperparameter values for bootstrap resampling
  • \n", 561 | "
  • bootnum: The number of bootstrap resamples (default = 100)
  • \n", 562 | "\n", 563 | "
    \n", 564 | "
" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": { 571 | "scrolled": false 572 | }, 573 | "outputs": [], 574 | "source": [ 575 | "# Extract X Data and Train Model\n", 576 | "XBoot = DataTable2[PeakList]\n", 577 | "# XBootLog = np.log(XBoot)\n", 578 | "XBootScale = cb.utils.scale(XBoot, method='auto')\n", 579 | "XBootKnn = cb.utils.knnimpute(XBootScale, k=3)\n", 580 | "YPredBoot = model.train(XBootKnn, Y)\n", 581 | "\n", 582 | "# Build Boostrap Models\n", 583 | "bootmodel = cb.bootstrap.BCA(model, bootnum=100) \n", 584 | "bootmodel.run()" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "
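The in-bag/out-of-bag resampling and a percentile confidence interval can be sketched in numpy (illustrative only; cb.bootstrap.BCA additionally applies bias correction and acceleration, which are not shown here):

```python
import numpy as np

def bootstrap_ci(stat, data, bootnum=100, alpha=0.05, seed=8):
    """Basic percentile bootstrap: resample with replacement, take the quantiles."""
    rng = np.random.default_rng(seed)
    n = len(data)
    boot = []
    for _ in range(bootnum):
        in_bag = rng.integers(0, n, size=n)    # in-bag: drawn with replacement;
        boot.append(stat(data[in_bag]))        # the unused rows are out-of-bag
    return np.quantile(boot, [alpha / 2, 1 - alpha / 2])

data = np.random.default_rng(1).normal(loc=5.0, size=200)   # hypothetical data
lo, hi = bootstrap_ci(np.mean, data, bootnum=500)
print(lo < data.mean() < hi)  # True: the 95% CI brackets the sample mean
```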
\n", 592 | "\n", 593 | "\n", 594 | "

10. Model Evaluation using Bootstrap Resampling

\n", 595 | "\n", 596 | "

After the bootmodel has been run, the .evaluate() method can be used to provide an estimate of the robustness and a measure of the generalised predictability of the model. Three plots are produced when this method is run: a violin plot, a probability density function, and a ROC curve. The violin plot shows the distribution of the median predicted scores for the in-bag and out-of-bag (i.e. train and test) samples by group. The distribution plot shows the probability density function of the median predicted scores for the in-bag and out-of-bag samples by group. The ROC curve plot shows the ROC curve with the median (green) and 95% CI for the in-bag (light green band), and the median (yellow) and 95% CI for the out-of-bag (light yellow band). The method used to calculate the 95% CI for the in-bag (green) is the class selected in the previous cell. In this example, the bias-corrected and accelerated method is used, as cb.bootstrap.BCA was used in the previous cell to create bootmodel. \n", 597 | " \n", 598 | "

    \n", 599 | "
  • label: Add labels to groups (default = None ; refer to groups as 0/1)\n", 600 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'roc', 'dist', 'violin', and 'none'
  • \n", 601 | "
  • trainset: Plot train dataset instead of median in-bag (default = None). Alternatively, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n", 602 | "
  • testset: Plot test dataset instead of median out-of-bag (default = None). Alternatively, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n", 603 | " \n", 604 | "
    \n", 605 | "
" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": { 612 | "scrolled": false 613 | }, 614 | "outputs": [], 615 | "source": [ 616 | "bootmodel.evaluate(label=Class,\n", 617 | " legend='all') " 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "\n", 625 | "\n", 626 | "
\n", 627 | "\n", 628 | "\n", 629 | "

11. Model Visualisation

\n", 630 | "\n", 631 | "
\n", 632 | "
" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "
\n", 640 | " \n", 641 | "\n", 642 | "

11.1 Plot Latent Projections: in-bag & out-of-bag

\n", 643 | " \n", 644 | "

After the bootmodel has been run, the .plot_projections() method can be used to visualise the neuron scores (the ANN analogue of latent variable scores). When this method is run, an n x n grid of plots is displayed, where n is the number of neurons in the hidden layer. These plots include score plots, distribution plots, and receiver operating characteristic (ROC) curves.

\n", 645 | "\n", 646 | "

There are C(n,2) score plots (i.e. a score plot for every combination of 2 neurons e.g. neuron 1 scores vs. neuron 2 scores). Each score plot includes the in-bag scores (as circles) and out-of-bag scores (as crosses) coloured by group, as well as the 95% confidence interval ellipses for the in-bag scores (as solid lines) and out-of-bag scores (as dashed lines). Additionally, the optimal line of separation (dashed grey line) and orthogonal line (solid grey line) are shown.

\n", 647 | "\n", 648 | "

There are n distribution plots (a distribution plot for each neuron's scores). Each shows the distribution of the in-bag and out-of-bag scores for each corresponding group (i.e. 4 discrete distributions overlaid for 2 groups). Each distribution is calculated using kernel density estimation, a standard non-parametric method used for estimation of a probability density function based on a set of data points (Silverman 1986).

\n", 649 | "\n", 650 | "

There are C(n,2) ROC curves (a ROC curve for every combination of 2 neurons e.g. neuron 1 scores vs. neuron 2 scores). As the ROC curves are for every combination of 2 neurons, the discrimination is calculated based on optimal separation (i.e. the grey line from the corresponding score plot). For each ROC curve plot there is a ROC curve with the neuron scores for the initial model with 95% confidence intervals using the in-bag scores (green), and a ROC curve for the out-of-bag scores with 95% confidence intervals (yellow). The method used to calculate the 95% CI for the in-bag (green) is the class used to create the bootmodel. In this example, the bias-corrected and accelerated method is used (cb.bootstrap.BCA). Additionally, the equal distribution line (dashed black line) is shown. \n", 651 | "\n", 652 | "

    \n", 653 | "
  • plot: Data to show in plot (default = \"ci\" ; plot only 95% confidence interval ellipses). Alternative values include 'meanci', 'ib', 'oob', and 'all'
  • \n", 654 | "
  • label: Add labels to groups in scores plot (default = None ; refer to groups as 0/1).\n", 655 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'scatter', 'dist', 'roc', and 'none'
  • \n", 656 | "\n", 657 | "
    \n", 658 | "
" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": null, 664 | "metadata": {}, 665 | "outputs": [], 666 | "source": [ 667 | "bootmodel.plot_projections(plot='ib',\n", 668 | " label=Class,\n", 669 | " legend='all')" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "bootmodel.plot_projections(plot='oob',\n", 679 | " label=Class,\n", 680 | " legend='all')" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "metadata": {}, 687 | "outputs": [], 688 | "source": [ 689 | "bootmodel.plot_projections(plot='ci',\n", 690 | " label=Class,\n", 691 | " legend='all')" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "
\n", 699 | " \n", 700 | "\n", 701 | "

11.2 Plot Weight Vectors

\n", 702 | "\n", 703 | "

After the bootmodel has been run, the .plot_weights() method can be used to visualise the neuron weight vectors. When this method is run, n plots are displayed, where n is the number of neurons. The circles in each plot represent the neuron weight vectors for the model. The 95% confidence intervals are calculated using the bootstrap method selected in step 9 (here, bias-corrected and accelerated). Any metabolite weight with a confidence interval crossing the zero line is considered non-significant to the neuron. This method requires values for the following parameters:

\n", 704 | " \n", 705 | "
    \n", 706 | "
  • PeakTable: Cleaned PeakTable from step 3
  • \n", 707 | "
  • PeakList: Peaks to include in plot (default = None; include all samples).\n", 708 | "
  • plot: To plot the data or median as circles (default 'data'). Alternative values include 'median', and a list structured as [N1, N2, etc.]\n", 709 | "
  • ylabel: Name of column in PeakTable to use as the ylabel (default = 'Label')\n", 710 | "
  • sort: Whether to sort plots in absolute descending order (default = True)
  • \n", 711 | "
\n", 712 | " \n", 713 | "
\n", 714 | "
" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": null, 720 | "metadata": { 721 | "scrolled": false 722 | }, 723 | "outputs": [], 724 | "source": [ 725 | "bootmodel.plot_weights(PeakTable,\n", 726 | " PeakList,\n", 727 | " plot='median',\n", 728 | " ylabel='Label',\n", 729 | " sort=False)" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": {}, 735 | "source": [ 736 | "
\n", 737 | "\n", 738 | "\n", 739 | "

12. Variable Contribution Plots

\n", 740 | "\n", 741 | "

After the bootmodel has been run, the .plot_featureimportance() method can be used to visualise the feature importance metrics. When this method is run, 2 plots are displayed; Connection Weight plot and Garson's Algorithm plot. These feature importance metrics are alternatives to the coefficient and variable importance in projection (VIP) in PLS.\n", 742 | " \n", 743 | "

The values in the Connection Weight plot contain information about the overall contribution of each metabolite (Olden et al. 2004). The values can be either positive or negative, and therefore each metabolite can contribute either positively or negatively to the model. Any metabolite with a confidence interval crossing the zero line is considered non-significant to the model.

\n", 744 | " \n", 745 | "

The values in the Garson's Algorithm plot contain information about the overall contribution of each metabolite (Garson 1991). These values are absolute, with the higher values representing a higher significance to the model. Unlike in a VIP plot, there is no standard cut-off used to determine whether metabolites are considered \"important\" in the model. One method (used below) is to use the average value across the metabolites as the cut-off.

\n", 746 | " \n", 747 | "

This method also exports the feature importance metrics as a pandas DataFrame (table). It requires values for the following parameters:

\n", 748 | " \n", 749 | "
    \n", 750 | "
  • PeakTable: Cleaned PeakTable from step 3
  • \n", 751 | "
  • PeakList: Peaks to include in plot (default = None; include all samples).\n", 752 | "
  • plot: To plot the data or median as circles (default 'data'). Alternative values include 'median', and a list structured as [CW, GA].\n", 753 | "
  • ylabel: Name of column in PeakTable to use as the ylabel (default = 'Label')\n", 754 | "
  • sort: Whether to sort plots in absolute descending order (default = True)
  • \n", 755 | "
\n", 756 | " \n", 757 | "
\n", 758 | "
" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "metadata": { 765 | "scrolled": false 766 | }, 767 | "outputs": [], 768 | "source": [ 769 | "feature_importance = bootmodel.plot_featureimportance(PeakTable,\n", 770 | " PeakList,\n", 771 | " plot='median',\n", 772 | " ylabel='Label',\n", 773 | " sort=False) " 774 | ] 775 | }, 776 | { 777 | "cell_type": "markdown", 778 | "metadata": {}, 779 | "source": [ 780 | "
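Both feature importance metrics derive from the trained weights. A hedged numpy sketch of the two calculations described above (Olden's connection weight products and Garson's normalised absolute contributions), using hypothetical weight matrices rather than the cimcb internals:

```python
import numpy as np

def connection_weight(W1, w2):
    """Olden's connection weights: signed input->hidden->output products, summed."""
    return W1 @ w2

def garson(W1, w2):
    """Garson's algorithm: absolute contributions, normalised per hidden neuron,
    then summed and rescaled so the importances add to 1."""
    contrib = np.abs(W1) * np.abs(w2)      # |input weight| x |output weight|
    contrib /= contrib.sum(axis=0)         # normalise within each hidden neuron
    importance = contrib.sum(axis=1)
    return importance / importance.sum()

W1 = np.array([[ 2.0, 0.5],
               [-1.0, 0.1],
               [ 0.2, 0.1]])               # hypothetical: 3 metabolites x 2 neurons
w2 = np.array([1.0, -0.5])                 # hidden -> output weights

print(connection_weight(W1, w2))       # signed: can be positive or negative
print(round(garson(W1, w2).sum(), 6))  # 1.0 (absolute, normalised)
```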
\n", 781 | "\n", 782 | "\n", 783 | "

13. Export Results

\n", 784 | "\n", 785 | "

The feature importance table created in step 12 can be exported using the inbuilt .to_excel() function of a pandas DataFrame. This function requires the name of the file to create, which can include directories by using the ‘ / ’ symbol. In the cell below, the table feature_importance is exported as an Excel file called 'ANNSigSig_MTBLS90.xlsx' in the 'results' folder.

\n", 786 | "\n", 787 | "

" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": null, 793 | "metadata": {}, 794 | "outputs": [], 795 | "source": [ 796 | "export_folder = 'results/'\n", 797 | "export_file = 'ANNSigSig_MTBLS90.xlsx'\n", 798 | "\n", 799 | "feature_importance.to_excel(export_folder + export_file)\n", 800 | "print(\"Done!\")" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": {}, 807 | "outputs": [], 808 | "source": [] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": null, 813 | "metadata": {}, 814 | "outputs": [], 815 | "source": [] 816 | } 817 | ], 818 | "metadata": { 819 | "kernelspec": { 820 | "display_name": "Python 3", 821 | "language": "python", 822 | "name": "python3" 823 | }, 824 | "language_info": { 825 | "codemirror_mode": { 826 | "name": "ipython", 827 | "version": 3 828 | }, 829 | "file_extension": ".py", 830 | "mimetype": "text/x-python", 831 | "name": "python", 832 | "nbconvert_exporter": "python", 833 | "pygments_lexer": "ipython3", 834 | "version": "3.7.3" 835 | }, 836 | "toc": { 837 | "base_numbering": 1, 838 | "nav_menu": {}, 839 | "number_sections": false, 840 | "sideBar": false, 841 | "skip_h1_title": false, 842 | "title_cell": "Table of Contents", 843 | "title_sidebar": "Contents", 844 | "toc_cell": false, 845 | "toc_position": {}, 846 | "toc_section_display": false, 847 | "toc_window_display": false 848 | }, 849 | "toc-autonumbering": false, 850 | "toc-showmarkdowntxt": false 851 | }, 852 | "nbformat": 4, 853 | "nbformat_minor": 2 854 | } 855 | -------------------------------------------------------------------------------- /notebooks/ANNSigSig_ST001047.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc-hr-collapsed": false 7 | }, 8 | "source": [ 9 | "
\n", 10 | " To begin: Click anywhere in this cell and press Run on the menu bar. This executes the current cell and then highlights the next cell. There are two types of cells. A text cell and a code cell. When you Run a text cell (we are in a text cell now), you advance to the next cell without executing any code. When you Run a code cell (identified by In[ ]: to the left of the cell) you advance to the next cell after executing all the Python code within that cell. Any visual results produced by the code (text/figures) are reported directly below that cell. Press Run again. Repeat this process until the end of the notebook. NOTE: All the cells in this notebook can be automatically executed sequentially by clicking KernelRestart and Run All. Should anything crash then restart the Jupyter Kernal by clicking KernelRestart, and start again from the top.\n", 11 | " \n", 12 | "
" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "
\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "\n", 24 | "

Metabolomics Data Visualisation Workflow for ANN-SS

\n", 25 | "\n", 26 | "
\n", 27 | "
\n", 28 | "
\n", 29 | "

This Jupyter Notebook describes a metabolomics data analysis and visualisation workflow for a 2-layer artificial neural network with layer 1 consisting of multiple neurons (n = 2 to 6) with a sigmoidal activation function, and layer 2 (the output layer) consisting of a single neuron with a sigmoidal activation function (ANN-SS), for a binary classification outcome.

\n", 30 | "\n", 31 | "

This computational workflow is described using a previously published NMR dataset by Chan et al. (2016). The study compared the urine metabolomic profiles of patients characterised as Gastric Cancer (GC; n=43), Benign Gastric Disease (BN; n=40), and Healthy Control (HE; n=40) using 149 named metabolites. For the purpose of this computational workflow, we compare only the GC vs HE samples in a binary discriminant analysis. The deconvolved and annotated data from this study are deposited on Metabolomics Workbench (Study ID: ST001047), and can be accessed directly via its Project DOI: 10.21228/M8B10B. The Excel file used in this workflow can be accessed via the following link: ST001047.xlsx.

\n", 32 | "\n", 33 | "

This computational workflow requires a dataset to be in, or converted to, a previously described standardised Excel file format (Mendez et al. 2019). This format uses the Tidy Data Framework (Wickham, 2014), where each row represents an observation (e.g. sample) and each column represents a variable (e.g. age or metabolite). Each Excel file (per study) contains two sheets: a data sheet and a peak sheet. The data sheet contains the metabolite concentrations together with the associated metadata for each observation (requiring the inclusion of the columns: Idx, SampleID, and Class). The peak sheet contains additional metadata pertaining to the metabolites in the data sheet (requiring the inclusion of the columns: Idx, Name, and Label). The standardisation of this format allows for the efficient re-use of this computational workflow.

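To make the two-sheet layout concrete, here is a minimal mock-up in pandas. The column names follow the required Idx/SampleID/Class and Idx/Name/Label conventions described above; all values and labels are invented purely for illustration:

```python
import pandas as pd

# Illustrative mock-up of the standardised two-sheet format (values are made up)
data_sheet = pd.DataFrame({
    'Idx': [1, 2, 3],
    'SampleID': ['S1', 'S2', 'S3'],
    'Class': ['GC', 'HE', 'GC'],
    'M1': [0.12, 0.34, 0.22],          # metabolite concentrations (one column per metabolite)
    'M2': [1.05, 0.98, 1.11],
})

peak_sheet = pd.DataFrame({
    'Idx': [1, 2],
    'Name': ['M1', 'M2'],              # must match the metabolite columns in the data sheet
    'Label': ['Alanine', 'Glycine'],   # human-readable metabolite labels
})

# Tidy layout: every metabolite column in the data sheet is described by a peak-sheet row
assert set(peak_sheet['Name']).issubset(data_sheet.columns)
```

Saving both DataFrames to one workbook (one sheet each) would reproduce the expected file structure.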
\n", 34 | "\n", 35 | "
\n", 36 | "The steps included in this data analysis and visualisation workflow are: \n", 37 | "
\n", 38 | "\n", 39 | "1. Import Packages
\n", 40 | "2. Load Data & Peak Sheet
\n", 41 | "3. Extract X & Y
\n", 42 | "4. Split Data into Train & Test Set
\n", 43 | "5. Extract, Transform, & Scale X Data with Missing Values Imputed
\n", 44 | "6. Hyperparameter Optimisation
\n", 45 | " 6.1. Plot R² & Q²
\n", 46 | " 6.2. Plot Latent Projections: Full & CV
\n", 47 | "7. Build Model & Evaluate
\n", 48 | "8. Permutation Test
\n", 49 | "9. Bootstrap Resampling of the Model
\n", 50 | "10. Model Evaluation using Bootstrap Resampling
\n", 51 | "11. Model Visualisation
\n", 52 | " 11.1. Plot Latent Projections: in-bag & out-of-bag
\n", 53 | " 11.2. Plot Weight Vectors
\n", 54 | "12. Variable Contribution Plots
\n", 55 | "13. Export Results
\n", 56 | "\n", 57 | "
" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "toc-hr-collapsed": true 64 | }, 65 | "source": [ 66 | "
\n", 67 | " \n", 68 | "\n", 69 | "

1. Import Packages

\n", 70 | "\n", 71 | "

Packages provide additional tools that extend beyond the basic functionality of the Python programming language. Prior to usage, packages need to be imported into the Jupyter environment. The following packages need to be imported for this computational workflow:

\n", 72 | "\n", 73 | "
    \n", 74 | "
  • numpy: A standard package primarily used for the manipulation of arrays
  • \n", 75 | "\n", 76 | "
  • pandas: A standard package primarily used for the manipulation of data tables
  • \n", 77 | "\n", 78 | "
  • cimcb: A library of helpful functions and tools provided by the authors
  • \n", 79 | "\n", 80 | "
  • sklearn: A standard package with tools for machine learning\n", 81 | "\n", 82 | "
      \n", 83 | "
    • train_test_split: A method to split arrays into training and test subsets
    \n", 84 | "\n", 85 | "
  • \n", 86 | "\n", 87 | "
\n", 88 | "\n", 89 | "
\n", 90 | "\n", 91 | "
" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "import numpy as np\n", 101 | "import pandas as pd\n", 102 | "import cimcb as cb\n", 103 | "from sklearn.model_selection import train_test_split\n", 104 | "\n", 105 | "print('All packages successfully loaded')" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "
\n", 113 | " \n", 114 | "\n", 115 | "

Optional: Set Random Seed for Splitting Data into Training & Test sets

\n", 116 | "\n", 117 | "

To reproduce the figures in the research article, set the random seed to 100. This seed is used in the train_test_split method to reproducibly split the source data into a training and test set.

\n", 118 | "\n", 119 | "
    \n", 120 | "
  • seed_split: Seed the generator using an integer value e.g. 42 (default = None ; no seed set)

  • \n", 121 | "
\n", 122 | "
\n", 123 | "\n", 124 | " \n", 125 | "\n", 126 | "

Optional: Set Random Seed for Weight Initialisation

\n", 127 | "\n", 128 | "

To reproduce the figures in the research article, set the random seed to 4. When a neural network is first compiled, the weights are initialised. By default in Keras, the Glorot normal initializer (a.k.a. Xavier normal initializer) is used, where the weights are randomly drawn from a truncated normal distribution. The seed is used to set reproducible initial weights from this distribution.

\n", 129 | "\n", 130 | "
    \n", 131 | "
  • seed_init: seed the generator using an integer value e.g. 42 (default = None ; no seed set)

  • \n", 132 | "
\n", 133 | "
\n", 134 | "\n", 135 | "
" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "seed_split = 100\n", 145 | "seed_init = 4\n", 146 | "# seed_split = None\n", 147 | "# seed_init = None" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "
\n", 155 | "\n", 156 | "\n", 157 | "

2. Load Data & Peak Sheet

\n", 158 | "\n", 159 | "

The CIMCB helper function load_dataXL() loads the Data and Peak sheets from an Excel file. In addition, this helper function checks that the data is in the standardised Excel file format described above. After the initial checks, load_dataXL() outputs two individual Pandas DataFrames (i.e. tables), DataTable and PeakTable, from the Excel file ST001047.xlsx. This helper function requires values for the following parameters:

\n", 160 | "
    \n", 161 | "
  • filename: The name of the Excel file (.xlsx file)
  • \n", 162 | "
  • DataSheet: The name of the data sheet in the file
  • \n", 163 | "
  • PeakSheet: The name of the peak sheet in the file
  • \n", 164 | "
\n", 165 | "
\n", 166 | "\n", 167 | "
" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "home = 'data/'\n", 177 | "file = 'ST001047.xlsx'\n", 178 | "\n", 179 | "DataTable,PeakTable = cb.utils.load_dataXL(filename=home + file, DataSheet='Data', PeakSheet='Peak')" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "
\n", 187 | "\n", 188 | "\n", 189 | "

3. Extract X & Y

\n", 190 | "\n", 191 | "

Prior to performing any statistical or machine learning modelling, it is best practice to assess the quality of the data and remove metabolites that lack reproducible measurements (Broadhurst et al. 2018). In this dataset (ST001047.xlsx), the QC-RSD and percentage of missing values have been previously calculated (refer to the peak sheet). In this Jupyter Notebook, we remove all metabolites that do not meet the following criteria:

\n", 192 | "\n", 193 | "
    \n", 194 | "
  • QC-RSD less than 20%
  • \n", 195 | "\n", 196 | "
  • Fewer than 10% of values are missing
  • \n", 197 | "
\n", 198 | "\n", 199 | "
\n", 200 | "

The following steps are needed to extract the X matrix of metabolite concentrations and associated Y vector of classification labels (“GC”=1 and “HE”=0):\n", 201 | " \n", 202 | "

    \n", 203 | " \n", 204 | "
  • Create a subset of DataTable called DataTable2, with samples only in the Class “GC” or “HE”
  • \n", 205 | " \n", 206 | "\n", 207 | "
  • Create the variable PeakList to hold the names (M1...Mn) of the metabolites to be used
  • \n", 208 | "\n", 209 | "
  • Using this PeakList, extract all corresponding columns (i.e. metabolite data) from DataTable2, and place it in matrix X
  • \n", 210 | "\n", 211 | "
  • Set Y to a list (or 1D array) of binary outcomes based on the Class column from DataTable2 (“GC”=1 and “HE”=0)
  • \n", 212 | "\n", 213 | "
\n", 214 | "\n", 215 | "
\n", 216 | "
" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "# Clean PeakTable and Extract PeakList\n", 226 | "RSD = PeakTable['QC_RSD'] \n", 227 | "PercMiss = PeakTable['Perc_missing'] \n", 228 | "PeakTableClean = PeakTable[(RSD < 20) & (PercMiss < 10)] \n", 229 | "PeakList = PeakTableClean['Name'] \n", 230 | "\n", 231 | "# Select Subset of Data (Class \"GC\" or \"HE\" only)\n", 232 | "DataTable2 = DataTable[(DataTable.Class == \"GC\") | (DataTable.Class == \"HE\")]\n", 233 | "\n", 234 | "# Extract X Data\n", 235 | "X = DataTable2[PeakList] \n", 236 | "\n", 237 | "# Create a Binary Y Vector \n", 238 | "Outcomes = DataTable2['Class'] \n", 239 | "Y = [1 if outcome == 'GC' else 0 for outcome in Outcomes] \n", 240 | "Y = np.array(Y) \n", 241 | "\n", 242 | "# Optional: Save Class Labels for Figure Legends\n", 243 | "Class = DataTable2.Class" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "
\n", 251 | "\n", 252 | "\n", 253 | "

4. Split Data into Train & Test Set

\n", 254 | "\n", 255 | "\n", 256 | "

The train_test_split method is used to split the X and Y data into training (2/3rd) and test (1/3rd) sets using stratified random selection. Additionally, the Class data is split for use in figure legends. The seed is selected in the optional section above. For further information on this method, refer to the scikit learn documentation.\n", 257 | "\n", 258 | "
\n", 259 | "

" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "# Split Data into Train (2/3rd) and Test (1/3rd)\n", 269 | "XTrain, XTest, YTrain, YTest, ClassTrain, ClassTest = train_test_split(X,\n", 270 | " Y,\n", 271 | " Class,\n", 272 | " test_size=1/3,\n", 273 | " stratify=Y,\n", 274 | " random_state=seed_split)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "
\n", 282 | " \n", 283 | "\n", 284 | "

5. Extract, Transform, & Scale X Data with Missing Values Imputed

\n", 285 | "\n", 286 | "

The X Data (XTrain and XTest) is log10 transformed, mean centred, and scaled to unit variance (with missing values imputed using K-Nearest Neighbour) prior to modelling following standard protocols for metabolomics (Broadhurst and Kell, 2006).

\n", 287 | "
    \n", 288 | " \n", 289 | "
  • Log-transform the values in XTrain
  • \n", 290 | "\n", 291 | "
  • Using the helper function cb.utils.scale(), scale the log-transformed data (XTrainLog) to unit variance (a.k.a. auto scaling), while also returning mu & sigma.
  • \n", 292 | "\n", 293 | "
  • Impute the missing values by using a k-nearest neighbour approach (with three neighbours) using the helper function cb.utils.knnimpute() to give the final matrix, XTrainKnn
  • \n", 294 | "\n", 295 | "
  • Log-transform the values in XTest
  • \n", 296 | "\n", 297 | "
  • Using the helper function cb.utils.scale(), scale the log-transformed data (XTestLog) to the unit variance (a.k.a. auto scaling) using the mu & sigma from above.\n", 298 | " \n", 299 | "
  • Impute the missing values by using a k-nearest neighbour approach (with three neighbours) using the helper function cb.utils.knnimpute() to give the final matrix, XTestKnn
  • \n", 300 | " \n", 301 | "
\n", 302 | "\n", 303 | "
" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "# Extract X Train Data \n", 313 | "XTrainLog = np.log(XTrain) \n", 314 | "XTrainScale, mu, sigma = cb.utils.scale(XTrainLog, method='auto', return_mu_sigma=True) \n", 315 | "XTrainKnn = cb.utils.knnimpute(XTrainScale, k=3) \n", 316 | "\n", 317 | "# Extract X Test Data\n", 318 | "XTestLog = np.log(XTest) \n", 319 | "XTestScale = cb.utils.scale(XTestLog, method='auto', mu=mu, sigma=sigma) \n", 320 | "XTestKnn = cb.utils.knnimpute(XTestScale, k=3)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "
\n", 328 | " \n", 329 | "\n", 330 | "

6. Hyperparameter Optimisation

\n", 331 | "\n", 332 | "

The CIMCB helper function cb.cross_val.KFold() is used to carry out k-fold cross-validation (k=5) on a set of ANN-SS models with a varying number of neurons (2 to 6) and learning rate (0.01 to 0.05) to determine the optimal hyperparameter values. In k-fold cross-validation, the original dataset is randomly split into k equally sized folds and the model is then trained for k iterations, each time training on k − 1 folds and testing on the remaining fold (Kohavi 1995). This helper function requires values for the following parameters:

\n", 333 | " \n", 334 | "
    \n", 335 | "
  • model: the class of model used by the function, cb.model.NN_SigmoidSigmoid
  • \n", 336 | "
  • X: The metabolite data matrix, XTrainKnn
  • \n", 337 | "
  • Y: The binary outcome vector, YTrain
  • \n", 338 | "
  • param_dict: a dictionary, param_dict, that describes all key:value pairs to search, with the key name corresponding to the hyperparameter in the model class and the value as the list of possible values
  • \n", 339 | "
  • folds: the number of folds in the k-fold cross validation
  • \n", 340 | "
  • n_mc: the number of Monte Carlo repetitions of the k-fold CV
  • \n", 341 | "
\n", 342 | "
\n", 343 | "
" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "scrolled": false 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "# Parameter Dictionary\n", 355 | "lr = [0.01,0.02,0.03,0.04,0.05]\n", 356 | "neurons = [2, 3, 4, 5, 6]\n", 357 | "\n", 358 | "param_dict = dict(learning_rate=lr,\n", 359 | " n_neurons=neurons,\n", 360 | " epochs=400,\n", 361 | " momentum=0.5,\n", 362 | " decay=0,\n", 363 | " loss='binary_crossentropy',\n", 364 | " seed=seed_init)\n", 365 | "\n", 366 | "# Initialise\n", 367 | "cv = cb.cross_val.KFold(model=cb.model.NN_SigmoidSigmoid, \n", 368 | " X=XTrainKnn, \n", 369 | " Y=YTrain, \n", 370 | " param_dict=param_dict, \n", 371 | " folds=5,\n", 372 | " n_mc=10) \n", 373 | "\n", 374 | "# Run \n", 375 | "cv.run() " 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "
\n", 383 | " \n", 384 | "\n", 385 | "

6.1. Plot R² & Q²

\n", 386 | "\n", 387 | "

When cv.plot(metric='r2q2', method='absolute') is run, 6 plots of $R^2$ and $Q^2$ statistics are displayed: (a) heatmap of $R^2$, (b) heatmap of $Q^2$, (c) heatmap of 1 - | ($R^2 - Q^2$) |, (d) | ($R^2 - Q^2$) | vs. $Q^2$, (e) $R^2$ and $Q^2$ against the learning rate, and (f) $R^2$ and $Q^2$ against the number of neurons. Alternatively, if method='ratio', | ($R^2 - Q^2$) / $R^2$ | is used instead of | ($R^2 - Q^2$) |. The optimal hyperparameter values are selected based on the point of inflection in figure b, or if a clear inflection point is not present, where | ($R^2 - Q^2$) | = 0.2. Note, the $R^2$ is the mean coefficient of determination for the full dataset, and the $Q^2$ is the mean coefficient of determination for the cross-validated predictions over the Monte Carlo repetitions of the k-fold CV (here, 10). When cv.plot(metric='auc') is run, the predictability of the model is presented as area under the ROC curve (AUC), $AUC(full)$ & $AUC(cv)$, a non-parametric alternative to $R^2$ & $Q^2$. The following parameters of cv.plot() can be altered:

\n", 388 | "\n", 389 | "
    \n", 390 | "
  • metric: The metric used for the plots (default = 'r2q2'). Alternative metrics include 'auc', 'acc', 'f1score', 'prec', 'sens', and 'spec'\n", 391 | "
  • method: The types of plots displayed (default = 'absolute'). Alternative value is 'ratio'\n", 392 | "
  • ci: The confidence interval in figure e & f (default = 95)\n", 393 | "
\n", 394 | "\n", 395 | "
\n", 396 | "
" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "cv.plot(metric='auc', method='absolute', ci=95)\n", 406 | "cv.plot(metric='r2q2', method='absolute', ci=95)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "
\n", 414 | "\n", 415 | "\n", 416 | "

6.2. Plot Latent Projections: Full & CV

\n", 417 | " \n", 418 | "

When cv.plot_projections() is run, an n x n grid of plots is displayed, where n is the number of neurons in the hidden layer to interrogate. These plots include score plots, distribution plots, and receiver operating characteristic (ROC) curves.

\n", 419 | "\n", 420 | "

There are C(n,2) score plots (i.e. a score plot for every combination of 2 neurons e.g. neuron 1 scores vs. neuron 2 scores). Each score plot includes the full scores (as circles) and CV scores (as crosses) coloured by group, as well as the 95% confidence interval ellipses for the full scores (as solid lines) and CV scores (as dashed lines). Additionally, the optimal line of separation (dashed grey line) and orthogonal line (solid grey line) are shown.

\n", 421 | "\n", 422 | "

There are n distribution plots (a distribution plot for each neuron's scores), showing the distribution of the full and CV scores for each corresponding group (i.e. 4 discrete distributions overlaid for 2 groups). Each distribution is calculated using kernel density estimation, a standard non-parametric method for estimating a probability density function from a set of data points (Silverman 1986).

\n", 423 | "\n", 424 | "

There are C(n,2) ROC curves (a ROC curve for every combination of 2 neurons e.g. neuron 1 scores vs. neuron 2 scores). As the ROC curves are for every combination of 2 neurons, the discrimination is calculated based on optimal separation (i.e. the grey line from the corresponding score plot). For each ROC curve plot there is a ROC curve for the full model (green), and ROC curve for the cv model with 95% confidence intervals (yellow). Additionally, the equal distribution line (dashed black line) is shown.

\n", 425 | "\n", 426 | "
    \n", 427 | "
  • **optional_arguments: optional arguments to specify model hyperparameters if they are changed in this search e.g. learning_rate=0.02 (except number of components). By default, the max value of each hyperparameter is used (unless specified).
  • \n", 428 | "
  • components: Neurons to plot (default = \"all\" ; plot all components). Alternatively, list the components to plot e.g. [1,3,4]
  • \n", 429 | "
  • plot: Data to show (default = 'ci' ; plot only 95% confidence interval ellipses). Alternative values include 'meanci', 'full', 'cv', and 'all'
  • \n", 430 | "
  • label: Add labels to groups in scores plot (default = None ; refers to groups as 0/1)\n", 431 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'scatter', 'dist', 'roc', and 'none'
  • \n", 432 | "
\n", 433 | "\n", 434 | "
\n", 435 | "
" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "cv.plot_projections(learning_rate=0.03,\n", 445 | " components=[1,2,3], \n", 446 | " plot=\"ci\",\n", 447 | " label=ClassTrain,\n", 448 | " legend=\"all\")" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "
\n", 456 | "\n", 457 | "\n", 458 | "

7. Build Model & Evaluate

\n", 459 | "\n", 460 | "

An ANN-SS model using cb.model.NN_SigmoidSigmoid is created and initialised using the optimal hyperparameter values determined in step 6. The implementation of ANN-SS in the cb.model.NN_SigmoidSigmoid class uses Keras with a Theano backend.

\n", 461 | "\n", 462 | "

Following this initialisation, the ANN-SS model is trained using the .train(X, Y) method where the X matrix is XTrainKnn and the Y vector is YTrain, returning the Y predicted value YPredTrain. This model is then tested using the .test(X) method where the X matrix is XTestKnn, returning the Y predicted value YPredTest.

\n", 463 | "\n", 464 | "

The .evaluate() method can be used to evaluate the predictability of the model using the train and test set. There are three plots produced when this method is run including a violin plot, probability density function, and a ROC curve. The violin plots show the predicted score for the train and test (by group). The distribution plot shows the probability density function of the predicted scores for the train and test (by group). The ROC curve shows the ROC curve for the train (green) and test (yellow). The following parameter values in .evaluate() can be altered:\n", 465 | " \n", 466 | "

    \n", 467 | "
  • testset: Plot test dataset (default = None). Alternatively, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n",
  • label: Add labels to groups (default = None ; refer to groups as 0/1)\n", 469 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'roc', 'dist', 'violin', and 'none'
  • \n", 470 | "\n", 471 | "
    \n", 472 | "
" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": { 479 | "scrolled": false 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "# Build Model\n", 484 | "model = cb.model.NN_SigmoidSigmoid(learning_rate=0.03, \n", 485 | " n_neurons=2,\n", 486 | " epochs=400,\n", 487 | " momentum=0.5, \n", 488 | " decay=0, \n", 489 | " loss='binary_crossentropy',\n", 490 | " seed=seed_init)\n", 491 | "YPredTrain = model.train(XTrainKnn, YTrain)\n", 492 | "YPredTest = model.test(XTestKnn)\n", 493 | "\n", 494 | "# Put YTrain and YPredTrain in a List\n", 495 | "EvalTrain = [YTrain, YPredTrain]\n", 496 | "\n", 497 | "# Put YTest and YPrestTest in a List\n", 498 | "EvalTest = [YTest, YPredTest]\n", 499 | "\n", 500 | "# Save Weights & Feature Importance \n", 501 | "model_weights = model.x_weights_ # [N1, N2, ...]\n", 502 | "model_fi = model.feature_importance_ # [CW, GA]\n", 503 | "\n", 504 | "# Evaluate Model (include Test Dataset)\n", 505 | "model.evaluate(testset=EvalTest,\n", 506 | " label=ClassTrain,\n", 507 | " legend='all') " 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "
\n", 515 | "\n", 516 | "\n", 517 | "

8. Permutation Test

\n", 518 | "\n", 519 | "

After a model has been trained, the .permutation_test() method can be used to assess the reliability of the trained model (after fixing the hyperparameter values). For the permutation test, the metabolite data matrix is randomised (permuted or 'shuffled'), while the Y (i.e. outcome) is fixed, and the model is subsequently trained and tested on this randomised data (Szymańska et al. 2012). This process is repeated (in this case, n=100) to construct a distribution against which to fairly assess the model. For a dataset with features that have no meaningful contribution, we would expect a similar $R^2$ and $Q^2$ to a randomised dataset, while for a dataset with features with meaningful contribution, we would expect an $R^2$ and $Q^2$ significantly higher than those of the randomised dataset. When .permutation_test() is run, 2 plots are displayed: (a) $R^2$ and $Q^2$ against \"correlation of permuted data against original data\", and (b) probability density functions for $R^2$ and $Q^2$, with the $R^2$ and $Q^2$ values found for the model trained on original data presented as ball-and-stick. The following parameter values of .permutation_test() can be altered: \n",

    \n", 522 | "
  • metric: The metric used for the plots (default = 'r2q2'). Alternative metrics include 'auc', 'acc', 'f1score', 'prec', 'sens', and 'spec'. Multiple metrics can be plotted using a list e.g. ['r2q2', 'auc']\n",
  • nperm: The number of permutations. (default = 100)\n", 524 | "
  • legend: To show legend (default = True). Alternative value is False\n", 525 | "
\n", 526 | "\n", 527 | "
\n", 528 | "
" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": { 535 | "scrolled": false 536 | }, 537 | "outputs": [], 538 | "source": [ 539 | "model.permutation_test(metric=['r2q2', 'auc'],\n", 540 | " nperm=100,\n", 541 | " legend=True)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "
\n", 549 | "\n", 550 | "\n", 551 | "

9. Bootstrap Resampling of the Model

\n", 552 | "\n", 553 | "

Bootstrap resampling is a resampling method based on random resampling with replacement, commonly used to provide an estimate of the sampling distribution of a test statistic (Efron, 1982). In the context of this workflow, the ANN-SS model from step 7 with its fixed hyperparameter values (i.e. 2 neurons and a learning rate of 0.03) is retrained on the data resampled with replacement (in-bag) and evaluated on the unused data (out-of-bag) for 100 resamples. After the model is evaluated for each bootstrap, metrics including the predicted values (ypred), neuron scores, weights, and feature importance (connection weights and Garson's algorithm) are stored and used to calculate 95% confidence intervals. To calculate the 95% confidence intervals, various methods can be used including the basic percentile method, the corrected percentile method (a.k.a. bias-corrected method), and the commonly used bias-corrected and accelerated (BCA) method. In this example, the BCA method is used with the class cb.bootstrap.BCA. Alternatively, use cb.bootstrap.Per for the percentile method, or cb.bootstrap.CPer for the corrected percentile method. To create and run the bootmodel for any method, the following parameter values need to be set:\n",

    \n", 556 | "
  • model: A model with fixed hyperparameter values for bootstrap resampling
  • \n", 557 | "
  • bootnum: The number of bootstrap resamples (default = 100)
  • \n", 558 | "\n", 559 | "
    \n", 560 | "
" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": { 567 | "scrolled": false 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "# Extract X Data and Train Model\n", 572 | "XBoot = DataTable2[PeakList]\n", 573 | "XBootLog = np.log(XBoot)\n", 574 | "XBootScale = cb.utils.scale(XBootLog, method='auto')\n", 575 | "XBootKnn = cb.utils.knnimpute(XBootScale, k=3)\n", 576 | "YPredBoot = model.train(XBootKnn, Y)\n", 577 | "\n", 578 | "# Build Boostrap Models\n", 579 | "bootmodel = cb.bootstrap.BCA(model, bootnum=100) \n", 580 | "bootmodel.run()" 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "
\n", 588 | "\n", 589 | "\n", 590 | "

10. Model Evaluation using Bootstrap Resampling

\n", 591 | "\n", 592 | "

After the bootmodel has been run, the .evaluate() method can be used to provide an estimate of the robustness and a measure of the generalised predictability of the model. There are three plots produced when this method is run including a violin plot, probability density function, and a ROC curve. The violin plots show the distribution of the median predicted score for the in-bag and out-of-bag (i.e. train and test) by group. The distribution plot shows the probability density function of the median predicted score for the in-bag and out-of-bag (i.e. train and test) by group. The ROC curve shows the ROC curve with the median (green) and 95% CI for the in-bag (light green band) and the median (yellow) and 95% CI for the out-of-bag (light yellow band). The method used to calculate the 95% CI for the in-bag (green) is the class selected in the previous cell. In this example, the bias-corrected and accelerated method is used as cb.bootstrap.BCA was used in the previous cell to create bootmodel. \n", 593 | " \n", 594 | "

    \n", 595 | "
  • label: Add labels to groups (default = None ; refer to groups as 0/1)\n", 596 | "
  • legend: Show legends for plots (default = 'all'). Alternative values are 'roc', 'dist', 'violin', and 'none'
  • \n", 597 | "
  • trainset: Plot train dataset instead of median in-bag (default = None). Alternatively, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n", 598 | "
  • testset: Plot test dataset instead of median out-of-bag (default = None). Alternatively, add YTrue and YPredicted as a list e.g. [YTrue, YPredicted].\n", 599 | "
    \n", 600 | "
" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "metadata": { 607 | "scrolled": false 608 | }, 609 | "outputs": [], 610 | "source": [ 611 | "bootmodel.evaluate(label=Class,\n", 612 | " legend='all') " 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": {}, 618 | "source": [ 619 | "\n", 620 | "\n", 621 | "
\n", 622 | "\n", 623 | "\n", 624 | "

11. Model Visualisation

\n", 625 | "\n", 626 | "
\n", 627 | "
" 628 | ] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "metadata": {}, 633 | "source": [ 634 | "
\n", 635 | " \n", 636 | "\n", 637 | "

11.1 Plot Latent Projections: in-bag & out-of-bag

\n", 638 | " \n", 639 | "

After the bootmodel has been run, the .plot_projections() method can be used to visualise the neuron scores. When this method is run, an n x n grid of plots is displayed, where n is the number of neurons in the hidden layer. These plots include score plots, distribution plots, and receiver operating characteristic (ROC) curves.

\n", 640 | "\n", 641 | "

There are C(n,2) score plots (i.e. a score plot for every combination of 2 neurons e.g. neuron 1 scores vs. neuron 2 scores). Each score plot includes the in-bag scores (as circles) and out-of-bag scores (as crosses) coloured by group, as well as the 95% confidence interval ellipses for the in-bag scores (as solid lines) and out-of-bag scores (as dashed lines). Additionally, the optimal line of separation (dashed grey line) and orthogonal line (solid grey line) are shown.

\n", 642 | "\n", 643 | "

There are n distribution plots (a distribution plot for each neuron's scores), showing the distribution of the in-bag and out-of-bag scores for each corresponding group (i.e. 4 discrete distributions overlaid for 2 groups). Each distribution is calculated using kernel density estimation, a standard non-parametric method for estimating a probability density function from a set of data points (Silverman 1986).

\n", 644 | "\n", 645 | "

There are C(n,2) ROC curves (a ROC curve for every combination of 2 neurons e.g. neuron 1 scores vs. neuron 2 scores). As the ROC curves are for every combination of 2 neurons, the discrimination is calculated based on optimal separation (i.e. the grey line from the corresponding score plot). For each ROC curve plot there is a ROC curve with the neuron scores for the initial model with the 95% confidence intervals using the in-bag neuron scores (green), and a ROC curve for the out-of-bag neuron scores with 95% confidence intervals. The method used to calculate the 95% CI for the in-bag (green) is the class used to create the bootmodel. In this example, the bias-corrected and accelerated method is used (cb.bootstrap.BCA). Additionally, the equal distribution line (dashed black line) is shown. \n",

  • plot: Data to show in plot (default = "ci"; plot only 95% confidence interval ellipses). Alternative values include 'meanci', 'ib', 'oob', and 'all'
  • label: Add labels to groups in scores plot (default = None; refer to groups as 0/1)
  • legend: Show legends for plots (default = 'all'). Alternative values are 'scatter', 'dist', 'roc', and 'none'

```python
bootmodel.plot_projections(plot='ib',
                           label=Class,
                           legend='all')
```

```python
bootmodel.plot_projections(plot='oob',
                           label=Class,
                           legend='all')
```

```python
bootmodel.plot_projections(plot='ci',
                           label=Class,
                           legend='all')
```

11.2 Plot Weight Vectors


After the bootmodel has been run, the .plot_weights() method can be used to visualise the neuron weight vectors. When this method is run, n plots are displayed, where n is the number of neurons. The circles in each plot represent the neuron weight vectors for the model. The 95% confidence intervals are calculated using the bias-corrected (BC) bootstrap method from step 6. Any metabolite weight with a confidence interval crossing the zero line is considered non-significant for that neuron. This method requires values for the following parameters:

  • PeakTable: Cleaned PeakTable from step 3
  • PeakList: Peaks to include in plot (default = None; include all peaks)
  • plot: Plot the data or the median as circles (default = 'data'). Alternative values include 'median', and a list structured as [N1, N2, etc.]
  • ylabel: Name of column in PeakTable to use as the ylabel (default = 'Label')
  • sort: Whether to sort plots in absolute descending order (default = True)

```python
bootmodel.plot_weights(PeakTable,
                       PeakList,
                       plot='median',
                       ylabel='Label',
                       sort=False)
```
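The CI-crossing-zero significance rule can be illustrated with a simple percentile interval over simulated bootstrap draws (the notebook itself uses bias-corrected intervals, which additionally adjust for bias in the bootstrap distribution):

```python
import numpy as np

# Hypothetical bootstrap draws: rows = bootstrap resamples, columns = metabolites
rng = np.random.default_rng(2)
boot_weights = rng.normal(loc=[0.8, 0.02, -0.5], scale=0.2, size=(500, 3))

# Simple percentile 95% CI for each metabolite's weight
lower, upper = np.percentile(boot_weights, [2.5, 97.5], axis=0)

# A metabolite is non-significant when its CI crosses the zero line
significant = (lower > 0) | (upper < 0)
print(significant)  # the middle metabolite's CI straddles zero
```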

12. Variable Contribution Plots


After the bootmodel has been run, the .plot_featureimportance() method can be used to visualise feature importance metrics. When this method is run, two plots are displayed: a Connection Weight plot and a Garson's Algorithm plot. These feature importance metrics are alternatives to the coefficient and variable importance in projection (VIP) plots used in PLS.

The values in the Connection Weight plot describe the overall contribution of each metabolite (Olden et al. 2004). The values can be either positive or negative, so a metabolite can contribute negatively or positively to the model. Any metabolite whose confidence interval crosses the zero line is considered non-significant to the model.
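Olden's connection weight measure reduces to signed sums of input-to-hidden and hidden-to-output weight products. A minimal sketch with hypothetical weights for a 4-metabolite, 2-neuron, 1-output network:

```python
import numpy as np

# Hypothetical trained weights: 4 metabolites -> 2 hidden neurons -> 1 output
w_in = np.array([[0.9, -0.2],
                 [0.1,  0.7],
                 [-0.6, 0.4],
                 [0.05, 0.0]])   # input -> hidden (4 x 2)
w_out = np.array([0.8, -0.5])    # hidden -> output (2,)

# Olden et al. (2004): for each metabolite, sum the products of its
# input->hidden weight and the hidden->output weight across neurons.
# The sign is kept, so contributions can be positive or negative.
connection_weight = w_in @ w_out
print(connection_weight)  # → [ 0.82 -0.27 -0.68  0.04]
```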


The values in the Garson's Algorithm plot describe the overall contribution of each metabolite (Garson 1991). These values are absolute, with higher values representing greater importance to the model. Unlike a VIP plot, there is no standard cut-off for determining whether metabolites are considered "important" in the model. One method (used below) is to use the average value across the metabolites as the cut-off.
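Garson's algorithm can be sketched with hypothetical weights for a 4-metabolite, 2-neuron, 1-output network: absolute weight products are normalised within each hidden neuron, summed per metabolite, and rescaled to sum to 1, with the mean importance as the cut-off described above:

```python
import numpy as np

# Hypothetical trained weights: 4 metabolites -> 2 hidden neurons -> 1 output
w_in = np.array([[0.9, -0.2],
                 [0.1,  0.7],
                 [-0.6, 0.4],
                 [0.05, 0.0]])   # input -> hidden (4 x 2)
w_out = np.array([0.8, -0.5])    # hidden -> output (2,)

# Garson (1991): absolute weight products |w_ij| * |v_j|,
# normalised per hidden neuron, summed per metabolite, rescaled to sum to 1
contrib = np.abs(w_in) * np.abs(w_out)
contrib = contrib / contrib.sum(axis=0, keepdims=True)
importance = contrib.sum(axis=1)
importance = importance / importance.sum()

# One common cut-off is the average importance across metabolites
cutoff = importance.mean()
print(importance > cutoff)
```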


In addition to the plots, this method returns the feature importance metrics as a pandas DataFrame (table). It also requires values for the following parameters:

  • PeakTable: Cleaned PeakTable from step 3
  • PeakList: Peaks to include in plot (default = None; include all peaks)
  • plot: Plot the data or the median as circles (default = 'data'). Alternative values include 'median', and a list structured as [CW, GA]
  • ylabel: Name of column in PeakTable to use as the ylabel (default = 'Label')
  • sort: Whether to sort plots in absolute descending order (default = True)

```python
feature_importance = bootmodel.plot_featureimportance(PeakTable,
                                                      PeakList,
                                                      plot='median',
                                                      ylabel='Label',
                                                      sort=False)
```

13. Export Results


The feature importance table created in step 12 can be exported using the inbuilt .to_excel() method of a pandas DataFrame. This method takes the name of the file to create, which can include directories by using the '/' symbol. In the cell below, the table feature_importance is exported as an Excel file called 'ANNSigSig_ST001047.xlsx' in the 'results' folder.


```python
export_folder = 'results/'
export_file = 'ANNSigSig_ST001047.xlsx'

feature_importance.to_excel(export_folder + export_file)
print("Done!")
```