├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── LICENSE ├── README.md ├── cheatsheet.md ├── examples └── modelcomparison.json ├── linter_profile.yaml ├── mcflylogo.png ├── notebooks ├── experiments │ ├── Actitracker_preprocess.ipynb │ ├── Actitracker_train.ipynb │ ├── EEG_alcoholic_preprocessing.ipynb │ ├── EEG_alcoholic_train.ipynb │ ├── Preprocess_PAMAP.ipynb │ ├── Preprocess_PAMAP2.ipynb │ ├── dataset_PEMS_prepare.ipynb │ ├── dataset_phoneme_prepare.ipynb │ ├── dataset_rackets_prepare.ipynb │ ├── deeplearning_eecology.ipynb │ ├── deeplearning_guinneabissau.ipynb │ ├── experiment_PAMAP.ipynb │ ├── experiment_PAMAP2.ipynb │ ├── experiment_PAMAP2_9fold.ipynb │ ├── experiment_PAMAP2_9fold_small.ipynb │ ├── experiment_extra_datasets.ipynb │ ├── experiment_extra_datasets_4model_types.ipynb │ ├── experiment_skipconnections.ipynb │ └── preproces_Guinea-Bisseau_Nigeria.ipynb └── tutorial │ ├── model │ └── model.h5 │ ├── tutorial.ipynb │ ├── tutorial_quick.ipynb │ └── workshop.ipynb ├── requirements.txt ├── scripts ├── Actitracker_train.py ├── EEG_alcoholic_train.py ├── experiment_PAMAP.py ├── experiment_PAMAP2_9fold.py └── pamap2.py ├── tests ├── __init__.py ├── test_tutorial_pamap2.py └── test_tutorial_weather.py └── utils ├── __init__.py ├── tutorial_pamap2.py ├── tutorial_racketsports.py ├── tutorial_vu.py └── tutorial_weather.py /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: CI Build 5 | 6 | on: 7 | workflow_dispatch: 8 | push: 9 | branches: 10 | - main 11 | pull_request: 12 | branches: 13 | - main 14 | schedule: 15 | - cron: '0 0 1 * *' 16 | 17 | jobs: 18 | first_check: 19 | name: first code check / python-3.10 / ubuntu-latest 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: '3.10' 27 | - name: Python info 28 | run: | 29 | which python3 30 | python3 --version 31 | - name: Install dependencies 32 | run: | 33 | python3 -m pip install --upgrade pip setuptools wheel 34 | python3 -m pip install mcfly prospector pytest pandas 35 | - name: Check style against standards using prospector (only warn for now, but never fail) 36 | shell: bash -l {0} 37 | run: prospector --profile linter_profile -o grouped -o pylint:pylint-report.txt --zero-exit 38 | - name: Run unit tests 39 | run: pytest -v 40 | 41 | basic_checks: 42 | name: Run tests across OS and versions / python-${{ matrix.python-version }} / ${{ matrix.os }} 43 | runs-on: ${{ matrix.os }} 44 | needs: first_check 45 | strategy: 46 | fail-fast: false 47 | matrix: 48 | os: ['ubuntu-latest', 'macos-latest', 'windows-latest'] 49 | python-version: ['3.7', '3.8', '3.9', '3.10'] 50 | exclude: 51 | # already tested in first_check job 52 | - python-version: '3.10' 53 | os: ubuntu-latest 54 | steps: 55 | - uses: actions/checkout@v3 56 | - name: Set up Python ${{ matrix.python-version }} 57 | uses: actions/setup-python@v3 58 | with: 59 | python-version: ${{ matrix.python-version }} 60 | - name: Python info 61 | run: | 62 | which python 63 | python --version 64 | - name: Install dependencies 65 | run: | 66 | python3 -m pip install --upgrade pip setuptools wheel 67 | python3 -m pip install mcfly prospector 
pytest pandas 68 | - name: Run unit tests 69 | run: pytest -v 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | notebooks/tutorial/data/ 2 | notebooks/tutorial/data.zip 3 | */.ipynb_checkpoints/ 4 | */__pycache__/ 5 | *.pyc 6 | notebooks/*/.ipynb_checkpoints/ 7 | env 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
![mcfly logo](mcflylogo.png) 2 | 3 |
4 | 5 | [![CI Build](https://github.com/NLeSC/mcfly-tutorial/workflows/CI%20Build/badge.svg)](https://github.com/NLeSC/mcfly-tutorial/actions) 6 | 7 | This repository contains notebooks that show how to use the [mcfly](https://github.com/NLeSC/mcfly) software. Mcfly is a deep learning tool for time series classification. 8 | 9 | ## Tutorials 10 | Currently, we offer two tutorials here. 11 | Our main tutorial can be found in the notebook [notebooks/tutorial/tutorial.ipynb](https://github.com/NLeSC/mcfly-tutorial/blob/master/notebooks/tutorial/tutorial.ipynb). This tutorial will let you train deep learning models with mcfly on the [PAMAP2 dataset for activity recognition](https://archive.ics.uci.edu/ml/datasets/PAMAP2+Physical+Activity+Monitoring). 12 | 13 | A comparable, slightly quicker tutorial can be found in the notebook [notebooks/tutorial/tutorial_quick.ipynb](https://github.com/NLeSC/mcfly-tutorial/blob/master/notebooks/tutorial/tutorial_quick.ipynb). This tutorial will let you train deep learning models with mcfly on the [RacketSports dataset for activity recognition](http://www.timeseriesclassification.com/description.php?Dataset=RacketSports). 14 | 15 | Prerequisites: 16 | - Python 3.7 and above 17 | - The following Python packages have to be installed (also specified in the requirements.txt file): 18 | - mcfly 19 | - jupyter 20 | - pandas 21 | - matplotlib 22 | - scipy 23 | - numpy 24 | 25 | ## Installation 26 | 27 | ```shell 28 | python3 -m venv env 29 | . env/bin/activate 30 | pip install --upgrade pip setuptools 31 | pip install -r requirements.txt 32 | ``` 33 | 34 | ## Running the notebooks 35 | The tutorials can be run using Jupyter Notebook. From the tutorial root folder run: 36 | 37 | `jupyter notebook` 38 | 39 | There are two versions of the tutorial. The [standard tutorial](https://github.com/NLeSC/mcfly-tutorial/blob/master/notebooks/tutorial/tutorial.ipynb) is for self-learning. There is also a [version for workshops](https://github.com/NLeSC/mcfly-tutorial/blob/master/notebooks/tutorial/workshop.ipynb), which is intended to be used with the aid of an instructor. 40 | -------------------------------------------------------------------------------- /cheatsheet.md: -------------------------------------------------------------------------------- 1 | # mcfly cheatsheet 2 | 3 | This document can be found at https://github.com/NLeSC/mcfly-tutorial/blob/master/cheatsheet.md 4 | 5 | Detailed documentation can be found in the mcfly [wiki](https://github.com/NLeSC/mcfly/wiki/Home---mcfly). 6 | 7 | Notebook tutorials can be found in the mcfly-tutorial [repository](https://github.com/NLeSC/mcfly-tutorial). 8 | 9 | ### Jargon terms 10 | * [**accuracy**](https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers): proportion of correctly classified samples among all samples in a dataset. 11 | * **convolutional filter**: a set of weights that are applied to neighbouring data points. 12 | * [**convolutional layer**](http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/): type of network layer in which a convolutional filter is slid over the input. 13 | * **CNN**: Convolutional Neural Network, a deep learning network that includes convolutional layers, often combined with dense or fully connected layers. 14 | * [**LSTM layer**](http://colah.github.io/posts/2015-08-Understanding-LSTMs/): Long Short-Term Memory layer. This is a special type of recurrent layer that takes a sequence as input and outputs a sequence. 
15 | * **DeepConvLSTM**: A deep learning network that includes both convolutional layers and LSTM layers. 16 | * **epoch**: One full pass through a dataset (all datapoints are seen once) in the process of training the weights of a network. 17 | * **loss**: An indicator of overall classification error. More errors mean greater loss. In mcfly we use [categorical cross entropy](http://cs231n.github.io/linear-classify/#softmax). 18 | * [**gradient descent**](http://cs231n.github.io/optimization-1/): Algorithm used to find the locally optimal weights for the nodes in the network. The algorithm iteratively improves the weights in order to minimize classification loss. The search space can be interpreted as a landscape where the lowest point is the optimum, hence the term 'descent'. In each step of the gradient descent algorithm, the weights are adjusted with a step in the direction of the gradient ('slope'). 19 | * **hyperparameters**: In mcfly, the hyperparameters are the architectural choices of the model (number of layers, LSTM or convolutional layers, etc.) and the learning rate and regularization rate. 20 | * **layer**: A deep learning network consists of multiple layers. The more layers, the deeper your network. 21 | * **learning rate**: The step size to take in the gradient descent algorithm. 22 | * **regularization rate**: How strongly the [L2 regularization](http://cs231n.github.io/neural-networks-2/#reg) is applied to avoid overfitting on the training data. 23 | * **[validation set](https://en.wikipedia.org/wiki/Test_set#Validation_set)**: Part of the data that is kept apart to evaluate the performance of your model and to choose hyperparameters. 24 | 25 | 26 | 27 | 28 | ### Input data: 29 | *X_train* => Nr samples **x** Nr timesteps **x** Nr channels 30 | 31 | *y_train_binary* => Nr samples **x** Nr classes 32 | 33 | ### Generate models: 34 | Generate one or multiple untrained Keras models with random hyperparameters. 35 | 36 | ``` 37 | num_classes = y_train_binary.shape[1] 38 | models = modelgen.generate_models(X_train.shape, number_of_classes=num_classes, number_of_models=2) 39 | ``` 40 | 41 | ### Train multiple models: 42 | Tries out a number of models on a subsample of the data and outputs the best-found architecture and hyperparameters. 
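Before calling `train_models_on_samples` you need one-hot encoded labels and a held-out validation set. The sketch below is illustrative only and not part of mcfly itself; it assumes a 3-D array `X` of shape Nr samples **x** Nr timesteps **x** Nr channels, a 1-D array `y` of integer class labels, and that scikit-learn and TensorFlow/Keras are installed:

```
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Hold out 20% of the samples as a validation set, preserving class proportions (stratify)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

# One-hot encode the integer labels: Nr samples x Nr classes
y_train_binary = to_categorical(y_train)
y_val_binary = to_categorical(y_val)
```

The call to `train_models_on_samples` itself is shown below.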
43 | ``` 44 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples( 45 | X_train, y_train_binary, X_val, y_val_binary, 46 | models, nr_epochs=5, subset_size=300, 47 | verbose=True, outputfile=outputfile) 48 | ``` 49 | ### Select best model 50 | ``` 51 | best_model_index = np.argmax(val_accuracies) 52 | best_model, best_params, best_model_types = models[best_model_index] 53 | ``` 54 | 55 | ### Train one specific model (this is done with Keras function fit): 56 | ``` 57 | best_model.fit(X_train, y_train_binary, 58 | nb_epoch=25, validation_data=(X_val, y_val_binary)) 59 | ``` 60 | -------------------------------------------------------------------------------- /linter_profile.yaml: -------------------------------------------------------------------------------- 1 | output-format: json 2 | 3 | strictness: medium 4 | test-warnings: true 5 | doc-warnings: false 6 | 7 | pylint: 8 | disable: 9 | - wrong-import-position 10 | - redefined-builtin 11 | - bare-except 12 | - unused-argument 13 | - dangerous-default-value 14 | - too-many-branches 15 | - too-many-arguments 16 | - too-many-locals 17 | - protected-access 18 | 19 | pep8: 20 | disable: 21 | - E722 22 | -------------------------------------------------------------------------------- /mcflylogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NLeSC/mcfly-tutorial/4b1548058c158d0efef41bfb6c7b2caa575a8858/mcflylogo.png -------------------------------------------------------------------------------- /notebooks/experiments/Actitracker_train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "Using TensorFlow backend.\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import sys\n", 20 | "import os\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import json\n", 24 | "# mcfly\n", 25 | "from mcfly import modelgen, find_architecture, storage" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "data_path = '/media/sf_VBox_Shared/timeseries/actitiracker/WISDM_at_v2.0/'\n", 37 | "preprocessed_path = os.path.join(data_path, 'preprocessed')\n", 38 | "result_path = os.path.join(data_path, 'models')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "X_train = np.load(os.path.join(preprocessed_path, 'X_train.npy'))\n", 50 | "X_val = np.load(os.path.join(preprocessed_path, 'X_val.npy'))\n", 51 | "X_test = np.load(os.path.join(preprocessed_path, 'X_test.npy'))\n", 52 | "y_train = np.load(os.path.join(preprocessed_path, 'y_train.npy'))\n", 53 | "y_val = np.load(os.path.join(preprocessed_path, 'y_val.npy'))\n", 54 | "y_test = np.load(os.path.join(preprocessed_path, 'y_test.npy'))" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "with open(os.path.join(preprocessed_path, 'labels.json')) as f:\n", 66 | " labels = json.load(f)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## Generate models" 74 | 
] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 5, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "num_classes = y_train.shape[1]\n", 85 | "\n", 86 | "models = modelgen.generate_models(X_train.shape,\n", 87 | " number_of_classes=num_classes,\n", 88 | " number_of_models = 2) #10)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 6, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "Walking 0.445402\n", 102 | "LyingDown 0.055904\n", 103 | "Standing 0.082027\n", 104 | "Sitting 0.281609\n", 105 | "Jogging 0.103971\n", 106 | "Stairs 0.031087\n", 107 | "dtype: float64" 108 | ] 109 | }, 110 | "execution_count": 6, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "#what is the fraction of classes in the validation set?\n", 117 | "pd.Series(y_val.mean(axis=0), index=labels)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 7, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "if not os.path.exists(result_path):\n", 129 | " os.makedirs(result_path)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "outputfile = os.path.join(resultpath, 'modelcomparison.json')\n", 141 | "histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train,\n", 142 | " X_val[:10], y_val[:10],\n", 143 | " models,nr_epochs=1, #5,\n", 144 | " subset_size=128, # 512,\n", 145 | " verbose=True,\n", 146 | " batch_size=32,\n", 147 | " outputpath=outputfile,\n", 148 | " early_stopping=True)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 14, 154 | "metadata": { 155 | "collapsed": false, 156 | "scrolled": true 157 | }, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "Details of the training process were stored in /media/sf_VBox_Shared/timeseries/actitiracker/WISDM_at_v2.0/models_test/models.json\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "print('Details of the training process were stored in ',outputfile)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 15, 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "Model type and parameters of the best model:\n", 183 | "DeepConvLSTM\n", 184 | "{'regularization_rate': 0.00574537358824132, 'lstm_dims': [48, 52, 42, 38, 56], 'filters': [100, 87, 31, 82, 70], 'learning_rate': 0.0011995620624020058}\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "best_model_index = np.argmax(val_accuracies)\n", 190 | "best_model, best_params, best_model_types = models[best_model_index]\n", 191 | "print('Model type and parameters of the best model:')\n", 192 | "print(best_model_types)\n", 193 | "print(best_params)\n" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "collapsed": false, 201 | "scrolled": true 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "nr_epochs = 3\n", 206 | "datasize = 128 # X_train.shape[0]\n", 207 | "history = best_model.fit(X_train[:datasize,:,:], y_train[:datasize,:],\n", 208 | " epochs=nr_epochs, validation_data=(X_val, y_val))" 209 | ] 210 | }, 
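{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Optional sketch: plot the training curve from the `history` object returned by fit() above.\n",
  "# Assumes matplotlib is available; the accuracy key name ('acc' vs 'accuracy') depends on the Keras version,\n",
  "# so it is looked up defensively.\n",
  "import matplotlib.pyplot as plt\n",
  "acc_key = 'acc' if 'acc' in history.history else 'accuracy'\n",
  "plt.plot(history.history[acc_key], label='train')\n",
  "plt.plot(history.history['val_' + acc_key], label='validation')\n",
  "plt.xlabel('epoch')\n",
  "plt.ylabel('accuracy')\n",
  "plt.legend()\n",
  "plt.show()"
 ]
},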
211 | { 212 | "cell_type": "code", 213 | "execution_count": 20, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "best_model.save(os.path.join(result_path, 'best_model.h5'))" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "from keras.models import load_model\n", 231 | "best_model = load_model(os.path.join(result_path, 'best_model.h5'))" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 13, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "1641/1641 [==============================] - 110s \n", 246 | "Score of best model: [3.3572144339827048, 0.57586837305229366]\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "## Test on Testset\n", 252 | "score_test = best_model.evaluate(X_test, y_test, verbose=True)\n", 253 | "print('Score of best model: ' + str(score_test))" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "probs = model.predict_proba(X_test)\n", 265 | "predicted = probs.argmax(axis=1)\n", 266 | "y_index = y_val_binary.argmax(axis=1)\n", 267 | "confusion_matrix = pd.crosstab(pd.Series(y_index), pd.Series(predicted))\n", 268 | "confusion_matrix.index = [labels[i] for i in confusion_matrix.index]\n", 269 | "confusion_matrix.columns = [labels[i] for i in confusion_matrix.columns]\n", 270 | "confusion_matrix.reindex(columns=[l for l in labels], fill_value=0)\n", 271 | "confusion_matrix" 272 | ] 273 | } 274 | ], 275 | "metadata": { 276 | "anaconda-cloud": {}, 277 | "kernelspec": { 278 | "display_name": "Python [conda env:mcfly]", 279 | "language": "python", 280 | "name": "conda-env-mcfly-py" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.5.2" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 0 297 | } 298 | -------------------------------------------------------------------------------- /notebooks/experiments/EEG_alcoholic_preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "from io import StringIO\n", 14 | "import os" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "def read_datafile(fn):\n", 26 | " with open(fn, 'r') as f:\n", 27 | " header_1 = f.readline()\n", 28 | " subject = header_1.split('.')[0][2:]\n", 29 | " header_2 = f.readline()\n", 30 | " header_3 = f.readline()\n", 31 | " header_4 = f.readline()\n", 32 | " stimulus = ' '.join(header_4.split(' ')[1:3])\n", 33 | " header_5 = f.readline()\n", 34 | " rest = f.read()\n", 35 | " if(len(rest)>0):\n", 36 | " data_trial = pd.read_csv(StringIO(rest), sep=' ', header=None)\n", 37 | " data_trial.columns = ['trial', 'sensor', 'sample', 'value']\n", 
38 | " data_trial['subject'] = subject\n", 39 | " data_trial['stimulus'] = stimulus\n", 40 | " else:\n", 41 | " data_trial = None\n", 42 | " return data_trial" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "data_dir = '/media/sf_VBox_Shared/timeseries/UCI_EEG_alcoholic/data/'" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 6, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | " " 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "%%prun\n", 73 | "dat = pd.DataFrame(columns=['subject', 'stimulus', 'trial', 'sensor', 'sample', 'value'])\n", 74 | "for fn in os.listdir(data_dir)[:100]:\n", 75 | " full_fn = os.path.join(data_dir, fn)\n", 76 | " if os.path.isfile(full_fn):\n", 77 | " try:\n", 78 | " data_trial = read_datafile(full_fn)\n", 79 | " if data_trial is not None:\n", 80 | " dat = dat.append(data_trial)\n", 81 | " except Exception as err:\n", 82 | " print(fn, err)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 19, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "(2119, 2)" 96 | ] 97 | }, 98 | "execution_count": 19, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "dat[['subject', 'trial']].drop_duplicates().shape" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "co2c1000367.rd.065 No columns to parse from file\n", 119 | "co2c1000367.rd.089 No columns to parse from file\n", 120 | "co2c1000367.rd.090 No columns to parse from file\n", 121 | "co2c1000367.rd.105 No columns to parse from file\n", 122 | "co2c1000367.rd.113 No columns to parse from file\n", 123 | "co2c1000367.rd.114 No columns to parse from file\n", 124 | "co2c1000367.rd.116 No columns to parse from file\n", 125 | "co2c1000367.rd.117 No columns to parse from file\n", 126 | "co2c1000367.rd.004 No columns to parse from file\n", 127 | "co2c1000367.rd.005 No columns to parse from file\n", 128 | "co2c1000367.rd.006 No columns to parse from file\n", 129 | "co2c1000367.rd.023 No columns to parse from file\n", 130 | "co2c1000367.rd.029 No columns to parse from file\n", 131 | "co2c1000367.rd.037 No columns to parse from file\n", 132 | "co2c1000367.rd.042 No columns to parse from file\n", 133 | "co2c1000367.rd.053 No columns to parse from file\n", 134 | "co2c1000367.rd.054 No columns to parse from file\n", 135 | "318.12917041778564 s\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "#%%prun\n", 141 | "import time\n", 142 | "\n", 143 | "Xes = []\n", 144 | "sensors = []\n", 145 | "labels = []\n", 146 | "headers = []\n", 147 | "\n", 148 | "t = time.time()\n", 149 | "for fn in os.listdir(data_dir):\n", 150 | "#for fn in np.random.choice(os.listdir(data_dir), 100):\n", 151 | " full_fn = os.path.join(data_dir, fn)\n", 152 | " if os.path.isfile(full_fn):\n", 153 | " try:\n", 154 | " data_trial = pd.read_csv(full_fn, sep=' ', header=None, comment='#')\n", 155 | " if data_trial is not None:\n", 156 | " data_trial.columns = ['trial', 'sensor', 'sample', 'value']\n", 157 | " pivoted = data_trial.pivot_table(index='sample', columns='sensor', values='value')\n", 158 | " 
Xes.append(pivoted.as_matrix())\n", 159 | " labels.append(fn[3])\n", 160 | " sensors.append(pivoted.columns)\n", 161 | " with open(full_fn, 'r') as f:\n", 162 | " header = [f.readline() for i in range(5)]\n", 163 | " headers.append(header)\n", 164 | " except Exception as err:\n", 165 | " print(fn, err)\n", 166 | "print((time.time()-t), 's')" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 12, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "# S1 obj 5477\n", 180 | "# S2 match 2757\n", 181 | "# S2 nomatch 2728\n", 182 | "# S2 match err 60\n", 183 | "# S2 nomatch err 35\n", 184 | "dtype: int64" 185 | ] 186 | }, 187 | "execution_count": 12, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "stimuli = [h[3].split(',')[0].strip() for h in headers]\n", 194 | "pd.Series(stimuli).value_counts()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 40, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "subjects = [h[0].split(' ')[1].strip() for h in headers]\n", 206 | "trials = [h[3].split('trial ')[-1].strip() for h in headers]" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 41, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "(11057, 3)\n" 221 | ] 222 | }, 223 | { 224 | "data": { 225 | "text/html": [ 226 | "
[HTML table rendering stripped; the equivalent text/plain output follows]
" 270 | ], 271 | "text/plain": [ 272 | " stimuli subject trial\n", 273 | "0 # S2 nomatch co3c0000402.rd 13\n", 274 | "1 # S1 obj co3c0000402.rd 14\n", 275 | "2 # S2 nomatch co3c0000402.rd 15\n", 276 | "3 # S1 obj co3c0000402.rd 16\n", 277 | "4 # S2 match co3c0000402.rd 17" 278 | ] 279 | }, 280 | "execution_count": 41, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "metadata = pd.DataFrame({'subject': subjects, 'trial': trials, 'stimuli': stimuli})\n", 287 | "print(metadata.shape)\n", 288 | "metadata.head()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 42, 294 | "metadata": { 295 | "collapsed": false 296 | }, 297 | "outputs": [ 298 | { 299 | "data": { 300 | "text/plain": [ 301 | "(11057, 256, 64)" 302 | ] 303 | }, 304 | "execution_count": 42, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "# The shape should be: (num_samples, num_timesteps, num_channels)\n", 311 | "Xa = np.array(Xes)\n", 312 | "Xa.shape" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 43, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/html": [ 325 | "
[HTML table rendering stripped; the equivalent text/plain output follows]
" 382 | ], 383 | "text/plain": [ 384 | " 0 1 2 3 4 5 6 7 8 9 ... 54 55 56 57 58 59 \\\n", 385 | "0 AF1 AF2 AF7 AF8 AFZ C1 C2 C3 C4 C5 ... PO8 POZ PZ T7 T8 TP7 \n", 386 | "\n", 387 | " 60 61 62 63 \n", 388 | "0 TP8 X Y nd \n", 389 | "\n", 390 | "[1 rows x 64 columns]" 391 | ] 392 | }, 393 | "execution_count": 43, 394 | "metadata": {}, 395 | "output_type": "execute_result" 396 | } 397 | ], 398 | "source": [ 399 | "pd.DataFrame(sensors).drop_duplicates()" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 44, 405 | "metadata": { 406 | "collapsed": false 407 | }, 408 | "outputs": [ 409 | { 410 | "data": { 411 | "text/plain": [ 412 | "a 7033\n", 413 | "c 4024\n", 414 | "dtype: int64" 415 | ] 416 | }, 417 | "execution_count": 44, 418 | "metadata": {}, 419 | "output_type": "execute_result" 420 | } 421 | ], 422 | "source": [ 423 | "# How many subjects do we have for each label?\n", 424 | "pd.Series(labels).value_counts()" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 45, 430 | "metadata": { 431 | "collapsed": false 432 | }, 433 | "outputs": [ 434 | { 435 | "data": { 436 | "text/plain": [ 437 | "10962" 438 | ] 439 | }, 440 | "execution_count": 45, 441 | "metadata": {}, 442 | "output_type": "execute_result" 443 | } 444 | ], 445 | "source": [ 446 | "# filter for errors\n", 447 | "no_error = ~metadata.stimuli.str.contains('err')\n", 448 | "sum(no_error)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 46, 454 | "metadata": { 455 | "collapsed": false 456 | }, 457 | "outputs": [ 458 | { 459 | "data": { 460 | "text/plain": [ 461 | "(10962, 256, 64)" 462 | ] 463 | }, 464 | "execution_count": 46, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "Xa_filtered = Xa[no_error]\n", 471 | "Xa_filtered.shape" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 53, 477 | "metadata": { 478 | "collapsed": false 479 | }, 480 | "outputs": [ 481 | { 482 | "data": { 483 | "text/plain": [ 484 | "0" 485 | ] 486 | }, 487 | "execution_count": 53, 488 | "metadata": {}, 489 | "output_type": "execute_result" 490 | } 491 | ], 492 | "source": [ 493 | "# Do we have NaN values?\n", 494 | "np.isnan(Xa_filtered).sum()" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 47, 500 | "metadata": { 501 | "collapsed": true 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "metadata_filtered = metadata[no_error]" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 64, 511 | "metadata": { 512 | "collapsed": false 513 | }, 514 | "outputs": [], 515 | "source": [ 516 | "preprocessed_path = '/media/sf_VBox_Shared/timeseries/UCI_EEG_alcoholic/preprocessed/'\n", 517 | "np.save(os.path.join(preprocessed_path, 'X.npy'), arr=Xa_filtered)\n", 518 | "metadata_filtered.to_csv(os.path.join(preprocessed_path, 'metadata.csv'), index=False)" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 56, 524 | "metadata": { 525 | "collapsed": false 526 | }, 527 | "outputs": [ 528 | { 529 | "name": "stdout", 530 | "output_type": "stream", 531 | "text": [ 532 | "8769 1096 1097\n" 533 | ] 534 | } 535 | ], 536 | "source": [ 537 | "# Create train and test set\n", 538 | "n = Xa_filtered.shape[0]\n", 539 | "n_train = int(0.8*n)\n", 540 | "n_val = int(0.1*n)\n", 541 | "n_test = n - n_train - n_val\n", 542 | "print(n_train, n_val, n_test)\n", 543 | "\n", 544 | "ind_perm = np.random.permutation(n)\n", 545 | "ind_train = 
ind_perm[:n_train]\n", 546 | "ind_val = ind_perm[n_train:n_train+n_val]\n", 547 | "ind_test = ind_perm[-n_test:]" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 65, 553 | "metadata": { 554 | "collapsed": false 555 | }, 556 | "outputs": [], 557 | "source": [ 558 | "np.save(os.path.join(preprocessed_path, 'X_train.npy'), arr=Xa_filtered[ind_train])\n", 559 | "np.save(os.path.join(preprocessed_path, 'X_val.npy'), arr=Xa_filtered[ind_val])\n", 560 | "np.save(os.path.join(preprocessed_path, 'X_test.npy'), arr=Xa_filtered[ind_test])" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 71, 566 | "metadata": { 567 | "collapsed": false 568 | }, 569 | "outputs": [ 570 | { 571 | "data": { 572 | "text/plain": [ 573 | "(10962, 2)" 574 | ] 575 | }, 576 | "execution_count": 71, 577 | "metadata": {}, 578 | "output_type": "execute_result" 579 | } 580 | ], 581 | "source": [ 582 | "# make binary labels\n", 583 | "y = np.zeros((len(labels), 2))\n", 584 | "y[:, 0] = [1*(l=='a') for l in labels]\n", 585 | "y[:, 1] = [1*(l=='c') for l in labels]\n", 586 | "y_filtered = y[no_error]\n", 587 | "y_filtered.shape" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 72, 593 | "metadata": { 594 | "collapsed": false 595 | }, 596 | "outputs": [], 597 | "source": [ 598 | "np.save(os.path.join(preprocessed_path, 'y.npy'), arr=y_filtered)\n", 599 | "np.save(os.path.join(preprocessed_path, 'y_train.npy'), arr=y_filtered[ind_train])\n", 600 | "np.save(os.path.join(preprocessed_path, 'y_val.npy'), arr=y_filtered[ind_val])\n", 601 | "np.save(os.path.join(preprocessed_path, 'y_test.npy'), arr=y_filtered[ind_test])" 602 | ] 603 | } 604 | ], 605 | "metadata": { 606 | "anaconda-cloud": {}, 607 | "kernelspec": { 608 | "display_name": "Python [conda env:mcfly]", 609 | "language": "python", 610 | "name": "conda-env-mcfly-py" 611 | }, 612 | "language_info": { 613 | "codemirror_mode": { 614 | "name": "ipython", 615 | "version": 3 616 | }, 617 | "file_extension": ".py", 618 | "mimetype": "text/x-python", 619 | "name": "python", 620 | "nbconvert_exporter": "python", 621 | "pygments_lexer": "ipython3", 622 | "version": "3.5.2" 623 | } 624 | }, 625 | "nbformat": 4, 626 | "nbformat_minor": 0 627 | } 628 | -------------------------------------------------------------------------------- /notebooks/experiments/EEG_alcoholic_train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys\n", 12 | "import os\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "# mcfly\n", 16 | "from mcfly import modelgen, find_architecture, storage" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 3, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "data_path = '/media/sf_VBox_Shared/timeseries/UCI_EEG_alcoholic/'\n", 28 | "preprocessed_path = os.path.join(data_path, 'preprocessed')\n", 29 | "result_path = os.path.join(data_path, 'models')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 4, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "\n", 41 | "X_train = np.load(os.path.join(preprocessed_path, 'X_train.npy'))\n", 42 | "X_val = np.load(os.path.join(preprocessed_path, 'X_val.npy'))\n", 43 | "X_test = 
np.load(os.path.join(preprocessed_path, 'X_test.npy'))\n", 44 | "y_train = np.load(os.path.join(preprocessed_path, 'y_train.npy'))\n", 45 | "y_val = np.load(os.path.join(preprocessed_path, 'y_val.npy'))\n", 46 | "y_test = np.load(os.path.join(preprocessed_path, 'y_test.npy'))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Generate models" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "num_classes = y_train.shape[1]\n", 65 | "\n", 66 | "models = modelgen.generate_models(X_train.shape,\n", 67 | " number_of_classes=num_classes,\n", 68 | " number_of_models = 10)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 6, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "array([ 0.63868613, 0.36131387])" 82 | ] 83 | }, 84 | "execution_count": 6, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "#what is the fraction of a vs c in the validation set?\n", 91 | "y_val.mean(axis=0)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 7, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "if not os.path.exists(result_path):\n", 103 | " os.makedirs(result_path)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "Training model 0 DeepConvLSTM\n", 118 | "Train on 512 samples, validate on 1096 samples\n", 119 | "Epoch 1/5\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "outputfile = os.path.join(result_path, 'modelcomparison.json')\n", 125 | "histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train,\n", 126 | " X_val, y_val,\n", 127 | " models,nr_epochs=5,\n", 128 | " subset_size=512,\n", 129 | " verbose=True,\n", 130 | " batch_size=32,\n", 131 | " outputfile=outputfile,\n", 132 | " early_stopping=True)\n", 133 | "print('Details of the training process were stored in ',outputfile)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "anaconda-cloud": {}, 148 | "kernelspec": { 149 | "display_name": "Python [conda env:mcfly]", 150 | "language": "python", 151 | "name": "conda-env-mcfly-py" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.5.2" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 0 168 | } 169 | -------------------------------------------------------------------------------- /notebooks/experiments/Preprocess_PAMAP2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "Using Theano backend.\n" 15 | ] 16 | } 17 | ], 18 | "source": 
[ 19 | "%load_ext autoreload\n", 20 | "%autoreload 2\n", 21 | "import sys\n", 22 | "import os\n", 23 | "sys.path.insert(0, os.path.abspath('../..'))\n", 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "# mcfly\n", 27 | "from mcfly import tutorial_pamap2, modelgen, find_architecture, storage" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# Specify in which directory you want to store the data:\n", 39 | "directory_to_extract_to = \"/media/sf_VBox_Shared/timeseries/\"" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "['timestamp', 'activityID', 'heartrate', 'hand_temperature', 'hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z', 'hand_acc_6g_x', 'hand_acc_6g_y', 'hand_acc_6g_z', 'hand_gyroscope_x', 'hand_gyroscope_y', 'hand_gyroscope_z', 'hand_magnometer_x', 'hand_magnometer_y', 'hand_magnometer_z', 'hand_orientation_0', 'hand_orientation_1', 'hand_orientation_2', 'hand_orientation_3', 'chest_temperature', 'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z', 'chest_acc_6g_x', 'chest_acc_6g_y', 'chest_acc_6g_z', 'chest_gyroscope_x', 'chest_gyroscope_y', 'chest_gyroscope_z', 'chest_magnometer_x', 'chest_magnometer_y', 'chest_magnometer_z', 'chest_orientation_0', 'chest_orientation_1', 'chest_orientation_2', 'chest_orientation_3', 'ankle_temperature', 'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z', 'ankle_acc_6g_x', 'ankle_acc_6g_y', 'ankle_acc_6g_z', 'ankle_gyroscope_x', 'ankle_gyroscope_y', 'ankle_gyroscope_z', 'ankle_magnometer_x', 'ankle_magnometer_y', 'ankle_magnometer_z', 'ankle_orientation_0', 'ankle_orientation_1', 'ankle_orientation_2', 'ankle_orientation_3']\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "header = tutorial_pamap2.get_header()\n", 59 | "print(header)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "19 possible activities:\n", 67 | "\n", 68 | "– 1 lying – 2 sitting – 3 standing – 4 walking – 5 running – 6 cycling – 7 Nordic walking – 9 watching TV – 10 computer work – 11 car driving – 12 ascending stairs – 13 descending stairs – 16 vacuum cleaning – 17 ironing – 18 folding laundry – 19 house cleaning – 20 playing soccer – 24 rope jumping – 0 other (transient activities" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Included activities:\n", 76 | "\n", 77 | "(1-lie, 2-sit, 3-stand, 4-walk, 5-run, 6-cycle, 7-Nordic walk, 17-iron, 16-vacuum clean, 24-rope jump, 12-ascend and 13-descend stairs)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 11, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "12\n", 92 | "[9, 10, 11, 18, 19, 20, 0]\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "all_activities = [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 24, 0]\n", 98 | "include_activities = [1, 2, 3, 4, 5, 6, 7, 12, 13, 16, 17, 24]\n", 99 | "print(len(include_activities))\n", 100 | "exclude_activities = [n for n in all_activities if n not in include_activities]\n", 101 | "print(exclude_activities)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": { 108 | "collapsed": true 109 | }, 
110 | "outputs": [], 111 | "source": [ 112 | "columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',\n", 113 | " 'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',\n", 114 | " 'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z']\n", 115 | "outputdir = \"cleaned_12activities_9vars\"" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 8, 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "Automatic pdb calling has been turned ON\n", 130 | "Data previously downloaded and stored in /media/sf_VBox_Shared/timeseries/PAMAP2/\n", 131 | "Start pre-processing all 9 files...\n", 132 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_0 y_0\n", 133 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_1 y_1\n", 134 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_2 y_2\n", 135 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_3 y_3\n", 136 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_4 y_4\n", 137 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_5 y_5\n", 138 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_6 y_6\n", 139 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_7 y_7\n", 140 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_8 y_8\n", 141 | "Processed data succesfully stored in /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars\n" 142 | ] 143 | }, 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "'/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars'" 148 | ] 149 | }, 150 | "execution_count": 8, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "%pdb 1\n", 157 | "tutorial_pamap2.fetch_and_preprocess(directory_to_extract_to, \n", 158 | " columns_to_use=columns_to_use, \n", 159 | " output_dir=outputdir, \n", 160 | " exclude_activities=exclude_activities,\n", 161 | " fold=True)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 10, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "Data previously downloaded and stored in /media/sf_VBox_Shared/timeseries/PAMAP2/\n", 176 | "Start pre-processing all 9 files...\n", 177 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_0 y_0\n", 178 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_1 y_1\n", 179 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_2 y_2\n", 180 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_3 y_3\n", 181 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_4 y_4\n", 182 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_5 y_5\n", 183 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_6 y_6\n", 184 | "Stored 
/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_7 y_7\n", 185 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_8 y_8\n", 186 | "Processed data succesfully stored in /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars\n" 187 | ] 188 | }, 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "'/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars'" 193 | ] 194 | }, 195 | "execution_count": 10, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "columns_to_use = header[2:]\n", 202 | "outputdir = \"cleaned_12activities_allvars\"\n", 203 | "tutorial_pamap2.fetch_and_preprocess(directory_to_extract_to, \n", 204 | " columns_to_use=columns_to_use, \n", 205 | " output_dir=outputdir, \n", 206 | " exclude_activities=exclude_activities,\n", 207 | " fold=True)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [] 218 | } 219 | ], 220 | "metadata": { 221 | "anaconda-cloud": {}, 222 | "kernelspec": { 223 | "display_name": "Python [conda env:mcfly]", 224 | "language": "python", 225 | "name": "conda-env-mcfly-py" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.5.2" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 0 242 | } 243 | -------------------------------------------------------------------------------- /notebooks/experiments/dataset_PEMS_prepare.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Dataset: PEMS\n", 9 | "\n", 10 | "http://www.timeseriesclassification.com/description.php?Dataset=PEMS-SF \n", 11 | "https://archive.ics.uci.edu/ml/datasets/PEMS-SF\n", 12 | "\n", 13 | "### Info from data source:\n", 14 | "Source: California Department of Transportation, www.pems.dot.ca.gov\n", 15 | "Creator: Marco Cuturi, Kyoto University, mcuturi '@' i.kyoto-u.ac.jp\n", 16 | "\n", 17 | "Data Set Information:\n", 18 | "\n", 19 | "15 months worth of daily data from the California Department of Transportation PEMS website. The data describes the occupancy\n", 20 | "rate, between 0 and 1, of different car lanes of San Francisco bay area freeways. The measurements cover the period from Jan. 1st 2008 to Mar. 30th 2009 and are sampled every 10 minutes. We consider each day in this database as a single time series of dimension 963 (the number of sensors which functioned consistently throughout the studied period) and length 6 x 24=144. We remove public holidays from the dataset, as well\n", 21 | "as two days with anomalies (March 8th 2009 and March 9th 2008) where all sensors were muted between 2:00 and 3:00 AM.\n", 22 | "This results in a database of 440 time series.\n", 23 | "\n", 24 | "The task is to classify each observed day as the correct day of the week, from Monday to Sunday, e.g. 
label it with an integer in {1,2,3,4,5,6,7}.\n", 25 | "Each attribute describes the measurement of the occupancy rate (between 0 and 1) of a captor location as recorded by a measuring station, at a given timestamp in time during the day. The ID of each station is given in the stations_list text file. For more information on the location (GPS, Highway, Direction) of each station please refer to the PEMS website. There are 963 (stations) x 144 (timestamps) = 138.672 attributes for each record.\n", 26 | "\n", 27 | "Relevant Papers:\n", 28 | "[1] M. Cuturi, Fast Global Alignment Kernels, Proceedings of the Intern. Conference on Machine Learning 2011.\n", 29 | "\n", 30 | "\n", 31 | "### Size:\n", 32 | "+ Training samples: 267\n", 33 | "+ Test sampels: 173\n", 34 | "+ Dimension: 144 timepoints x 963 channels\n", 35 | "+ Classes: 7\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import numpy as np\n", 45 | "import os\n", 46 | "import sys\n", 47 | "import pandas as pd\n", 48 | "\n", 49 | "CODE = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\mcfly\\\\mcfly'\n", 50 | "DATA = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\PEMS-SF'\n", 51 | "sys.path.append(CODE)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "file_train = os.path.join(DATA, 'PEMS-SF_TRAIN.arff')\n", 61 | "file_test = os.path.join(DATA, 'PEMS-SF_TEST.arff')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "def load_arff(filename):\n", 71 | " start = 0\n", 72 | "\n", 73 | " data = []\n", 74 | " labels = []\n", 75 | " start_line = 0\n", 76 | " with open(filename) as fp:\n", 77 | " line = fp.readline()\n", 78 | " count = 0\n", 79 | " while line:\n", 80 | " if start == 1:\n", 81 | " label = line.split(\"',\")[-1]\n", 82 | " labels.append(label.replace('\\n', ''))\n", 83 | " line = line.split(\"',\")[0] \n", 84 | " lines = line.split('\\\\n')\n", 85 | " data_line = []\n", 86 | " for l in lines:\n", 87 | " data_line_sub = []\n", 88 | " #for entry in l.split(','):\n", 89 | " #data_line_sub.append(entry.replace(\"'\", \"\"))\n", 90 | " #data_line.append(data_line_sub)\n", 91 | " data_line.append([x.replace(\"'\", \"\") for x in l.split(',')])\n", 92 | " data.append(data_line)\n", 93 | "\n", 94 | " if line.startswith('@data'):\n", 95 | " start_line = count\n", 96 | " #print(\"Actual data start in line\", start_line)\n", 97 | " start = 1\n", 98 | "\n", 99 | " line = fp.readline()\n", 100 | " count += 1\n", 101 | " \n", 102 | " return np.swapaxes(np.array(data).astype(float), 1,2), labels\n", 103 | "\n", 104 | "X_train, y_train = load_arff(file_train)\n", 105 | "X_test0, y_test0 = load_arff(file_test)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "X_train.shape (267, 144, 963)\n", 118 | "267\n", 119 | "X_test.shape (173, 144, 963)\n", 120 | "173\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "print(\"X_train.shape\", X_train.shape)\n", 126 | "print(len(y_train))\n", 127 | "\n", 128 | "print(\"X_test.shape\", X_test0.shape)\n", 129 | "print(len(y_test0))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 5, 135 | "metadata": {}, 136 | 
"outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "numpy.float64" 141 | ] 142 | }, 143 | "execution_count": 5, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "type(X_train[0,0,0])" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "array([0.0134, 0.0129, 0.0122, 0.0105, 0.0103, 0.0095, 0.0086, 0.0084,\n", 161 | " 0.0079, 0.0075, 0.0075, 0.0076, 0.0073, 0.0073, 0.007 , 0.0074,\n", 162 | " 0.0074, 0.0072, 0.0071, 0.0078, 0.0078, 0.0101, 0.0109, 0.0111,\n", 163 | " 0.0113, 0.0126, 0.0161, 0.0175, 0.0238, 0.0247, 0.0275, 0.0314,\n", 164 | " 0.0397, 0.0532, 0.0568, 0.0593, 0.0589, 0.0721, 0.0765, 0.0893,\n", 165 | " 0.0947, 0.0951, 0.094 , 0.0987, 0.1094, 0.1108, 0.1159, 0.1143,\n", 166 | " 0.1076, 0.1083, 0.1078, 0.1052, 0.1051, 0.0975, 0.0931, 0.0879,\n", 167 | " 0.086 , 0.0861, 0.0857, 0.0834, 0.0754, 0.0745, 0.0736, 0.0731,\n", 168 | " 0.0742, 0.0725, 0.0691, 0.0704, 0.0711, 0.072 , 0.0713, 0.0699,\n", 169 | " 0.0683, 0.0703, 0.0707, 0.0714, 0.0719, 0.0718, 0.0683, 0.0703,\n", 170 | " 0.071 , 0.0703, 0.0723, 0.0706, 0.0698, 0.072 , 0.0736, 0.0744,\n", 171 | " 0.0774, 0.0743, 0.0731, 0.079 , 0.079 , 0.077 , 0.0814, 0.0794,\n", 172 | " 0.0759, 0.0791, 0.0769, 0.0765, 0.0823, 0.081 , 0.0813, 0.0865,\n", 173 | " 0.0892, 0.0834, 0.083 , 0.0789, 0.0755, 0.0747, 0.0723, 0.0657,\n", 174 | " 0.0659, 0.0619, 0.0554, 0.0543, 0.0509, 0.0493, 0.046 , 0.0446,\n", 175 | " 0.0413, 0.0419, 0.0417, 0.0391, 0.0383, 0.0374, 0.0376, 0.0399,\n", 176 | " 0.0406, 0.038 , 0.0374, 0.0359, 0.0336, 0.0335, 0.03 , 0.0294,\n", 177 | " 0.0274, 0.0254, 0.0219, 0.0218, 0.0203, 0.0179, 0.0179, 0.0146])" 178 | ] 179 | }, 180 | "execution_count": 6, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "X_train[0,:,10]" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "### Split test into test and validation:" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 7, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "2.0 25\n", 206 | "3.0 26\n", 207 | "1.0 30\n", 208 | "4.0 23\n", 209 | "7.0 20\n", 210 | "5.0 22\n", 211 | "6.0 27\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "y_val = []\n", 217 | "y_test = []\n", 218 | "IDs_val = []\n", 219 | "IDs_test = []\n", 220 | "\n", 221 | "np.random.seed(1)\n", 222 | "for label in list(set(y_test0)):\n", 223 | " idx = np.where(np.array(y_test0) == label)[0]\n", 224 | " idx1 = np.random.choice(idx, len(idx)//2, replace=False)\n", 225 | " idx2 = list(set(idx) - set(idx1))\n", 226 | " IDs_val.extend(idx1)\n", 227 | " IDs_test.extend(idx2)\n", 228 | " y_val.extend(len(idx1) * [label])\n", 229 | " y_test.extend(len(idx2) * [label])\n", 230 | "\n", 231 | " print(label, y_test0.count(label))\n", 232 | " \n", 233 | "X_test = X_test0[IDs_test,:,:]\n", 234 | "X_val = X_test0[IDs_val,:,:]" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 9, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "(88, 144, 963) (85, 144, 963)\n", 247 | "88 85\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "print(X_test.shape, X_val.shape)\n", 253 | "print(len(y_test), len(y_val))" 254 | ] 255 | }, 256 | { 257 | "cell_type": 
"markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## Save pre-processed data as numpy files" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 10, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "dataset_name = 'PEMS_'\n", 270 | "\n", 271 | "output_path = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\processed'\n", 272 | "np.save(os.path.join(output_path, dataset_name + 'X_train.npy'), X_train)\n", 273 | "np.save(os.path.join(output_path, dataset_name + 'X_val.npy'), X_val)\n", 274 | "np.save(os.path.join(output_path, dataset_name + 'X_test.npy'), X_test)\n", 275 | "np.save(os.path.join(output_path, dataset_name + 'y_train.npy'), y_train)\n", 276 | "np.save(os.path.join(output_path, dataset_name + 'y_val.npy'), y_val)\n", 277 | "np.save(os.path.join(output_path, dataset_name + 'y_test.npy'), y_test)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "## Or: Create new split of data ?" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [] 300 | } 301 | ], 302 | "metadata": { 303 | "kernelspec": { 304 | "display_name": "Python 3", 305 | "language": "python", 306 | "name": "python3" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 3 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython3", 318 | "version": "3.6.5" 319 | } 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 2 323 | } 324 | -------------------------------------------------------------------------------- /notebooks/experiments/dataset_phoneme_prepare.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Dataset: PhonemeSpectra\n", 9 | "\n", 10 | "http://www.timeseriesclassification.com/description.php?Dataset=PhonemeSpectra\n", 11 | "\n", 12 | "### Info from data source:\n", 13 | "Phoneme Description:\n", 14 | "This data set is a multivaritate representation of a subset of the data used in the paper Dual-domain Hierarchical Classification of Phonetic Time Series. \n", 15 | "In the case of the raw data.\n", 16 | "Each series was extracted from the segmented audio collected from Google Translate\n", 17 | "Audio files collected from Google translate are recorded at 22050\n", 18 | "The speakers are male and female.\n", 19 | "After data collection, they segment waveforms of the words to generate phonemes using the Forced Aligner tool from the Penn Phonetics Laboratory.\n", 20 | "A Spectrogram of each instance was then created with a window size of 0.001 seconds and an overlap of 90%.\n", 21 | "Each instance in this multivariate dataset is arranged such that each dimension is a frequency band from the spectrogram.\n", 22 | "The data consists of 39 classes each with 170 instances. \n", 23 | "\n", 24 | "Phoneme Refference:\n", 25 | "Publication: Hamooni H, Mueen A. Dual-domain hierarchical classification of phonetic time series. InData Mining (ICDM), 2014 IEEE International Conference on 2014 Dec 14 (pp. 160-169). 
IEEE.\n", 26 | "\n", 27 | "\n", 28 | "### Size:\n", 29 | "+ Training samples: 3315\n", 30 | "+ Test sampels: 3353\n", 31 | "+ Dimension: 217 timepoints x 11 channels\n", 32 | "+ Classes: 39\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import numpy as np\n", 42 | "import os\n", 43 | "import sys\n", 44 | "import pandas as pd\n", 45 | "\n", 46 | "CODE = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\mcfly\\\\mcfly'\n", 47 | "DATA = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\PhonemeSpectra'\n", 48 | "sys.path.append(CODE)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "file_train = os.path.join(DATA, 'PhonemeSpectra_TRAIN.arff')\n", 58 | "file_test = os.path.join(DATA, 'PhonemeSpectra_TEST.arff')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "def load_arff(filename):\n", 68 | " start = 0\n", 69 | "\n", 70 | " data = []\n", 71 | " labels = []\n", 72 | " start_line = 0\n", 73 | " with open(filename) as fp:\n", 74 | " line = fp.readline()\n", 75 | " count = 0\n", 76 | " while line:\n", 77 | " if start == 1:\n", 78 | " label = line.split(\"',\")[-1]\n", 79 | " labels.append(label.replace('\\n', ''))\n", 80 | " line = line.split(\"',\")[0] \n", 81 | " lines = line.split('\\\\n')\n", 82 | " data_line = []\n", 83 | " for l in lines:\n", 84 | " data_line_sub = []\n", 85 | " #for entry in l.split(','):\n", 86 | " #data_line_sub.append(entry.replace(\"'\", \"\"))\n", 87 | " #data_line.append(data_line_sub)\n", 88 | " data_line.append([x.replace(\"'\", \"\") for x in l.split(',')])\n", 89 | " data.append(data_line)\n", 90 | "\n", 91 | " if line.startswith('@data'):\n", 92 | " start_line = count\n", 93 | " #print(\"Actual data start in line\", start_line)\n", 94 | " start = 1\n", 95 | "\n", 96 | " line = fp.readline()\n", 97 | " count += 1\n", 98 | " \n", 99 | " return np.swapaxes(np.array(data).astype(float), 1,2), labels\n", 100 | "\n", 101 | "X_train, y_train = load_arff(file_train)\n", 102 | "X_test0, y_test0 = load_arff(file_test)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 4, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "X_train.shape (3315, 217, 11)\n", 115 | "3315\n", 116 | "X_test.shape (3353, 217, 11)\n", 117 | "3353\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "print(\"X_train.shape\", X_train.shape)\n", 123 | "print(len(y_train))\n", 124 | "\n", 125 | "print(\"X_test.shape\", X_test0.shape)\n", 126 | "print(len(y_test0))" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "numpy.float64" 138 | ] 139 | }, 140 | "execution_count": 5, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "type(X_train[0,0,0])" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 6, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "array([ 0.60185 , 0.10432 , 0.67014 , 0.15635 , 0.95577 , 2.4809 ,\n", 158 | " 3.5833 , 4.7018 , 1.1286 , 7.2648 , 7.1282 , 5.3625 ,\n", 159 | " 4.6666 , 3.4076 , 3.4368 , 3.1312 , 0.6371 , 7.2779 ,\n", 160 | 
" 10.702 , 9.528 , 8.9655 , 5.3169 , 2.2338 , 0.31894 ,\n", 161 | " 1.8213 , 6.7641 , 10.24 , 9.8695 , 7.5672 , 2.3384 ,\n", 162 | " 3.8596 , 7.3618 , 3.3751 , 2.6142 , 2.605 , 5.3137 ,\n", 163 | " 5.4795 , 1.0734 , 1.0891 , 3.0922 , 2.4679 , 0.091312,\n", 164 | " 2.8001 , 6.1137 , 4.8455 , 0.27992 , 3.3654 , 7.6773 ,\n", 165 | " 9.0268 , 12.636 , 12.903 , 8.7211 , 8.656 , 9.1178 ,\n", 166 | " 5.2904 , 3.632 , 6.6237 , 6.1359 , 5.684 , 5.1734 ,\n", 167 | " 5.4562 , 5.3652 , 5.2969 , 4.7929 , 8.4382 , 9.1113 ,\n", 168 | " 2.4906 , 1.5931 , 1.2522 , 5.8437 , 8.9623 , 5.8633 ,\n", 169 | " 4.0618 , 2.3871 , 0.9758 , 0.74115 , 0.95252 , 2.296 ,\n", 170 | " 2.6277 , 3.1806 , 5.8372 , 7.1867 , 6.8454 , 7.0274 ,\n", 171 | " 7.4567 , 7.5445 , 8.2956 , 8.7867 , 9.5703 , 9.6614 ,\n", 172 | " 6.9689 , 6.7347 , 6.712 , 6.6502 , 5.4763 , 7.4989 ,\n", 173 | " 10.647 , 10.585 , 13.574 , 12.874 , 9.4045 , 4.3024 ,\n", 174 | " 2.3578 , 2.7995 , 9.5959 , 11.085 , 14.688 , 23.92 ,\n", 175 | " 24.843 , 15.833 , 15.885 , 10.116 , 0.064775, 0.21435 ,\n", 176 | " 2.4158 , 4.9229 , 11.127 , 10.916 , 9.921 , 15.599 ,\n", 177 | " 6.9587 , 2.6044 , 7.4561 , 4.5035 , 1.6945 , 2.1947 ,\n", 178 | " 0.38345 , 3.7951 , 4.4358 , 3.7496 , 3.6163 , 0.075052,\n", 179 | " 1.5281 , 5.4953 , 10.812 , 5.4921 , 4.9458 , 2.1915 ,\n", 180 | " 4.9495 , 9.0468 , 2.0749 , 6.2465 , 5.8107 , 2.6565 ,\n", 181 | " 1.4825 , 4.6081 , 12.673 , 11.932 , 2.3514 , 5.2352 ,\n", 182 | " 6.3863 , 7.3098 , 5.3262 , 3.6293 , 15.082 , 16.552 ,\n", 183 | " 15.859 , 14.896 , 12.448 , 6.0291 , 7.4729 , 9.2094 ,\n", 184 | " 9.9443 , 11.121 , 13.275 , 12.555 , 10.507 , 9.9053 ,\n", 185 | " 10.276 , 14.081 , 14.197 , 13.947 , 14.377 , 12.884 ,\n", 186 | " 3.0764 , 1.0143 , 0.20187 , 1.1606 , 4.1881 , 3.5605 ,\n", 187 | " 3.5995 , 6.318 , 11.144 , 24.688 , 17.897 , 13.641 ,\n", 188 | " 22.681 , 14.932 , 3.3139 , 7.0139 , 7.6578 , 2.8255 ,\n", 189 | " 12.825 , 14.344 , 13.261 , 17.232 , 5.0401 , 1.6977 ,\n", 190 | " 7.4301 , 1.7822 , 2.127 , 10.519 , 7.9993 , 0.14984 ,\n", 191 | " 4.3669 , 3.8276 , 7.9119 , 2.5725 , 3.0662 , 8.3439 ,\n", 192 | " 14.206 , 8.643 , 0.57531 , 1.9058 , 13.368 , 12.279 ,\n", 193 | " 1.3264 ])" 194 | ] 195 | }, 196 | "execution_count": 6, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "X_train[0,:,10]" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "### Split test into test and validation:" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 7, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "UW 86\n", 222 | "L 86\n", 223 | "EH 86\n", 224 | "Y 86\n", 225 | "JH 86\n", 226 | "UH 86\n", 227 | "SH 86\n", 228 | "K 86\n", 229 | "AY 86\n", 230 | "P 86\n", 231 | "F 86\n", 232 | "D 86\n", 233 | "ER 86\n", 234 | "DH 86\n", 235 | "R 86\n", 236 | "Z 86\n", 237 | "M 86\n", 238 | "IH 86\n", 239 | "IY 86\n", 240 | "AE 86\n", 241 | "ZH 85\n", 242 | "OY 86\n", 243 | "EY 86\n", 244 | "N 86\n", 245 | "NG 86\n", 246 | "V 86\n", 247 | "G 86\n", 248 | "AA 86\n", 249 | "TH 86\n", 250 | "AH 86\n", 251 | "AW 86\n", 252 | "B 86\n", 253 | "HH 86\n", 254 | "CH 86\n", 255 | "T 86\n", 256 | "S 86\n", 257 | "OW 86\n", 258 | "AO 86\n", 259 | "W 86\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "y_val = []\n", 265 | "y_test = []\n", 266 | "IDs_val = []\n", 267 | "IDs_test = []\n", 268 | "\n", 269 | "np.random.seed(1)\n", 270 | "for label in 
list(set(y_test0)):\n", 271 | " idx = np.where(np.array(y_test0) == label)[0]\n", 272 | " idx1 = np.random.choice(idx, len(idx)//2, replace=False)\n", 273 | " idx2 = list(set(idx) - set(idx1))\n", 274 | " IDs_val.extend(idx1)\n", 275 | " IDs_test.extend(idx2)\n", 276 | " y_val.extend(len(idx1) * [label])\n", 277 | " y_test.extend(len(idx2) * [label])\n", 278 | "\n", 279 | " print(label, y_test0.count(label))\n", 280 | " \n", 281 | "X_test = X_test0[IDs_test,:,:]\n", 282 | "X_val = X_test0[IDs_val,:,:]" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 8, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "(1677, 217, 11) (1676, 217, 11)\n", 295 | "1677 1676\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "print(X_test.shape, X_val.shape)\n", 301 | "print(len(y_test), len(y_val))" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "## Save pre-processed data as numpy files" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 10, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "dataset_name = 'PhenomeSpectra_'\n", 318 | "\n", 319 | "output_path = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\processed'\n", 320 | "np.save(os.path.join(output_path, dataset_name + 'X_train.npy'), X_train)\n", 321 | "np.save(os.path.join(output_path, dataset_name + 'X_val.npy'), X_val)\n", 322 | "np.save(os.path.join(output_path, dataset_name + 'X_test.npy'), X_test)\n", 323 | "np.save(os.path.join(output_path, dataset_name + 'y_train.npy'), y_train)\n", 324 | "np.save(os.path.join(output_path, dataset_name + 'y_val.npy'), y_val)\n", 325 | "np.save(os.path.join(output_path, dataset_name + 'y_test.npy'), y_test)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## Or: Create new split of data ?" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [] 348 | } 349 | ], 350 | "metadata": { 351 | "kernelspec": { 352 | "display_name": "Python 3", 353 | "language": "python", 354 | "name": "python3" 355 | }, 356 | "language_info": { 357 | "codemirror_mode": { 358 | "name": "ipython", 359 | "version": 3 360 | }, 361 | "file_extension": ".py", 362 | "mimetype": "text/x-python", 363 | "name": "python", 364 | "nbconvert_exporter": "python", 365 | "pygments_lexer": "ipython3", 366 | "version": "3.6.5" 367 | } 368 | }, 369 | "nbformat": 4, 370 | "nbformat_minor": 2 371 | } 372 | -------------------------------------------------------------------------------- /notebooks/experiments/dataset_rackets_prepare.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# Dataset: RacketSports\n", 9 | "\n", 10 | "http://www.timeseriesclassification.com/description.php?Dataset=RacketSports\n", 11 | "\n", 12 | "### Info from data source:\n", 13 | "The data was created by university students plyaing badminton or squash whilst wearing a smart watch (Sony Smart watch 35). 
The watch relayed the x-y-z coordinates for\n", 14 | "both the gyroscope and accelerometer to an android phone (One Plus 56). The phone\n", 15 | "wrote these values to an Attribute-Relation File Format (arff) file using an app developed\n", 16 | "by a UEA computer science masters student. The problem is to identify which sport and which stroke the players are making. The data was collected at a rate of 10 HZ over 3 seconds whilst the player played\n", 17 | "either a forehand/backhand in squash or a clear/smash in badminton.\n", 18 | "The data was collected as part of an undergraduate project by Phillip Perks in 2017/18.\n", 19 | "\n", 20 | "### Size:\n", 21 | "+ Training samples: 151\t\n", 22 | "+ Test sampels: 152\n", 23 | "+ Dimension: 30 timepoints x 6 channels\n", 24 | "+ Classes: 4\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 66, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import numpy as np\n", 34 | "import os\n", 35 | "import sys\n", 36 | "import pandas as pd\n", 37 | "\n", 38 | "CODE = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\mcfly\\\\mcfly'\n", 39 | "DATA = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\RacketSports'\n", 40 | "sys.path.append(CODE)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 6, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "file_train = os.path.join(DATA, 'RacketSports_TRAIN.arff')\n", 50 | "file_test = os.path.join(DATA, 'RacketSports_TEST.arff')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 34, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "def load_racket_arff(filename):\n", 60 | " start = 0\n", 61 | "\n", 62 | " data = []\n", 63 | " labels = []\n", 64 | " start_line = 0\n", 65 | " with open(filename) as fp:\n", 66 | " line = fp.readline()\n", 67 | " count = 0\n", 68 | " while line:\n", 69 | " if start == 1:\n", 70 | " lines = line.split('\\\\n')\n", 71 | " data_line = []\n", 72 | " for l in lines:\n", 73 | " data_line_sub = []\n", 74 | " for entry in l.split(','):\n", 75 | " if entry.startswith('B') or entry.startswith('S'):\n", 76 | " labels.append(entry.replace(\"'\", \"\").replace('\\n', ''))\n", 77 | " else:\n", 78 | " data_line_sub.append(entry.replace(\"'\", \"\"))\n", 79 | " data_line.append(data_line_sub)\n", 80 | " data.append(data_line)\n", 81 | "\n", 82 | " if line.startswith('@data'):\n", 83 | " start_line = count\n", 84 | " #print(\"Actual data start in line\", start_line)\n", 85 | " start = 1\n", 86 | "\n", 87 | " line = fp.readline()\n", 88 | " count += 1\n", 89 | " \n", 90 | " return np.swapaxes(np.array(data), 1,2), labels\n", 91 | "\n", 92 | "X_train, y_train = load_racket_arff(file_train)\n", 93 | "X_test0, y_test0 = load_racket_arff(file_test)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 35, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "X_train.shape (151, 30, 6)\n", 106 | "151\n", 107 | "X_test.shape (152, 30, 6)\n", 108 | "152\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "print(\"X_train.shape\", X_train.shape)\n", 114 | "print(len(y_train))\n", 115 | "\n", 116 | "print(\"X_test.shape\", X_test0.shape)\n", 117 | "print(len(y_test0))" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "### Split test into test and validation:" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | 
"execution_count": 42, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "[39 12 9 36 11 30 10 13 1 22 16 31 38 24 23 33 26 34 28 18] [0, 32, 2, 3, 4, 5, 6, 7, 8, 35, 37, 14, 15, 17, 19, 20, 21, 25, 27, 29]\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "IDs1 = np.random.choice(idx, len(idx) //2, replace=False)\n", 142 | "IDs2 = list(set(idx) - set(IDs1))\n", 143 | "print(IDs1, IDs2)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 48, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "Squash_BackhandBoast 34\n", 156 | "Squash_ForehandBoast 35\n", 157 | "Badminton_Smash 40\n", 158 | "Badminton_Clear 43\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "y_val = []\n", 164 | "y_test = []\n", 165 | "IDs_val = []\n", 166 | "IDs_test = []\n", 167 | "\n", 168 | "for label in list(set(y_test0)):\n", 169 | " idx = np.where(np.array(y_test0) == label)[0]\n", 170 | " idx1 = np.random.choice(idx, len(idx)//2, replace=False)\n", 171 | " idx2 = list(set(idx) - set(idx1))\n", 172 | " IDs_val.extend(idx1)\n", 173 | " IDs_test.extend(idx2)\n", 174 | " y_val.extend(len(idx1) * [label])\n", 175 | " y_test.extend(len(idx2) * [label])\n", 176 | "\n", 177 | " print(label, y_test0.count(label))\n", 178 | " \n", 179 | "X_test = X_test0[IDs_test,:,:]\n", 180 | "X_val = X_test0[IDs_val,:,:]" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 54, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "(77, 30, 6) (75, 30, 6)\n", 193 | "77 75\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "print(X_test.shape, X_val.shape)\n", 199 | "print(len(y_test), len(y_val))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Save pre-processed data as numpy files" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 65, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "dataset_name = 'RacketSports_'\n", 216 | "\n", 217 | "output_path = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\processed'\n", 218 | "np.save(os.path.join(output_path, dataset_name + 'X_train.npy'), X_train)\n", 219 | "np.save(os.path.join(output_path, dataset_name + 'X_val.npy'), X_val)\n", 220 | "np.save(os.path.join(output_path, dataset_name + 'X_test.npy'), X_test)\n", 221 | "np.save(os.path.join(output_path, dataset_name + 'y_train.npy'), y_train)\n", 222 | "np.save(os.path.join(output_path, dataset_name + 'y_val.npy'), y_val)\n", 223 | "np.save(os.path.join(output_path, dataset_name + 'y_test.npy'), y_test)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 61, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "((30,), array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,\n", 242 | " 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,\n", 243 | " 2.7, 2.8, 2.9, 3. 
]))" 244 | ] 245 | }, 246 | "execution_count": 61, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "time_axis = np.linspace( 0.1, 3, 30)\n", 253 | "time_axis.shape, time_axis" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## Or: Create new split of data ?" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 30, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "(303, 30, 6)\n", 273 | "303\n" 274 | ] 275 | } 276 | ], 277 | "source": [ 278 | "X_data = np.concatenate((X_train, X_val), axis=0)\n", 279 | "print(X_data.shape)\n", 280 | "\n", 281 | "y_data = y_train.copy()\n", 282 | "y_data.extend(y_val)\n", 283 | "print(len(y_data))" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 31, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "name": "stdout", 293 | "output_type": "stream", 294 | "text": [ 295 | "Squash_BackhandBoast 68\n", 296 | "Squash_ForehandBoast 70\n", 297 | "Badminton_Smash 79\n", 298 | "Badminton_Clear 86\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "split = [0.6, 0.2, 0.2]\n", 304 | "\n", 305 | "for label in list(set(y_data)):\n", 306 | " idx = np.where(np.array(y_data) == label)[0]\n", 307 | " \n", 308 | " #print(label, np.where(np.array(y_val) == label)[0].shape)\n", 309 | " print(label, y_data.count(label))" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [] 318 | } 319 | ], 320 | "metadata": { 321 | "kernelspec": { 322 | "display_name": "Python 3", 323 | "language": "python", 324 | "name": "python3" 325 | }, 326 | "language_info": { 327 | "codemirror_mode": { 328 | "name": "ipython", 329 | "version": 3 330 | }, 331 | "file_extension": ".py", 332 | "mimetype": "text/x-python", 333 | "name": "python", 334 | "nbconvert_exporter": "python", 335 | "pygments_lexer": "ipython3", 336 | "version": "3.6.5" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 2 341 | } 342 | -------------------------------------------------------------------------------- /notebooks/experiments/deeplearning_guinneabissau.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EEG data classification Guinnea Bissau" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook contains experiments with an EEG dataset. The classes are Epilepsy: 0 or Control 1." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "source": [ 23 | "Load dependences and setting output configuration" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [ 33 | { 34 | "name": "stderr", 35 | "output_type": "stream", 36 | "text": [ 37 | "Using Theano backend.\n" 38 | ] 39 | }, 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "Populating the interactive namespace from numpy and matplotlib\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "import numpy as np\n", 50 | "from keras.utils.np_utils import to_categorical\n", 51 | "import keras\n", 52 | "%pylab inline\n", 53 | "%load_ext autoreload\n", 54 | "%autoreload 2" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Load data from npy files" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "Specify location of npy files:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 2, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "datapath = '/media/windows-share/EEGs_Guinea-Bissau_np/'\n", 80 | "#datapath = '/media/sf_VBox_Shared/timeseries/EEGs_Guinea-Bissau_np/'#" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "Load data stored in 10 seconds at 128 Hertz corresponding to the experiment where the participant had the eyes closed:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "condition = '_10seconds_closed.npy'\n", 99 | "X_train = np.load(datapath+'X_train'+condition)\n", 100 | "y_train = np.load(datapath+'y_train'+condition)\n", 101 | "X_val = np.load(datapath+'X_valid'+condition)\n", 102 | "y_val = np.load(datapath+'y_valid'+condition)\n", 103 | "X_test = np.load(datapath+'X_test'+condition)\n", 104 | "y_test = np.load(datapath+'y_test'+condition)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "{'Epilepsy': 0, 'Control': 1}\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "classlabels = list(set(y_train))\n", 124 | "mapclasses = {classlabels[i] : i for i in range(len(classlabels))}\n", 125 | "print(mapclasses)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 5, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "y_train = np.array([mapclasses[c] for c in y_train], dtype='int')\n", 137 | "y_val = np.array([mapclasses[c] for c in y_test], dtype='int')\n", 138 | "y_test = np.array([mapclasses[c] for c in y_test], dtype='int')\n", 139 | "y_train_binary = to_categorical(y_train)\n", 140 | "y_val_binary = to_categorical(y_val)\n", 141 | "y_test_binary = to_categorical(y_test)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "metadata": { 148 | "collapsed": false, 149 | "scrolled": true 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "array([[ 1., 0.],\n", 156 | " [ 1., 0.],\n", 157 | " [ 1., 0.],\n", 158 | " [ 0., 1.],\n", 159 | " [ 1., 0.],\n", 160 | " [ 1., 0.],\n", 161 | " [ 1., 0.],\n", 162 | " [ 0., 
1.],\n", 163 | " [ 0., 1.],\n", 164 | " [ 1., 0.],\n", 165 | " [ 1., 0.],\n", 166 | " [ 1., 0.],\n", 167 | " [ 1., 0.],\n", 168 | " [ 1., 0.],\n", 169 | " [ 0., 1.],\n", 170 | " [ 0., 1.],\n", 171 | " [ 0., 1.],\n", 172 | " [ 0., 1.],\n", 173 | " [ 0., 1.],\n", 174 | " [ 0., 1.]])" 175 | ] 176 | }, 177 | "execution_count": 6, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "y_val_binary" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 8, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "(108,)" 197 | ] 198 | }, 199 | "execution_count": 8, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "y_train.shape" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | "collapsed": true 213 | }, 214 | "outputs": [], 215 | "source": [] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": true 222 | }, 223 | "outputs": [], 224 | "source": [] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": true 231 | }, 232 | "outputs": [], 233 | "source": [] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": true 240 | }, 241 | "outputs": [], 242 | "source": [] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "collapsed": true 249 | }, 250 | "outputs": [], 251 | "source": [] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Python 3", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.5.2" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 0 275 | } 276 | -------------------------------------------------------------------------------- /notebooks/experiments/experiment_PAMAP2_9fold_small.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Experiment PAMAP2 with mcfly" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This experiment finds an optimal model for the PAMAP2 dataset." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Import required Python modules" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": { 28 | "collapsed": false, 29 | "scrolled": false 30 | }, 31 | "outputs": [ 32 | { 33 | "name": "stderr", 34 | "output_type": "stream", 35 | "text": [ 36 | "Using Theano backend.\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "import sys\n", 42 | "import os\n", 43 | "sys.path.insert(0, os.path.abspath('../..'))\n", 44 | "import numpy as np\n", 45 | "import pandas as pd\n", 46 | "# mcfly\n", 47 | "from mcfly import tutorial_pamap2, modelgen, find_architecture, storage\n", 48 | "# Keras module is use for the deep learning\n", 49 | "import keras\n", 50 | "from keras.utils.np_utils import to_categorical\n", 51 | "from keras.models import Sequential\n", 52 | "from keras.layers import Dense, Activation, Convolution1D, Flatten, MaxPooling1D\n", 53 | "from keras.optimizers import Adam\n", 54 | "# We can set some backend options to avoid NaNs\n", 55 | "from keras import backend as K" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Load the data" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": { 69 | "collapsed": false 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "datapath = '/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/'\n", 74 | "Xs = []\n", 75 | "ys = []\n", 76 | "\n", 77 | "ext = '.npy'\n", 78 | "for i in range(9):\n", 79 | " Xs.append(np.load(datapath+'X_'+str(i)+ext))\n", 80 | " ys.append(np.load(datapath+'y_'+str(i)+ext))" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": { 87 | "collapsed": false 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "# Define directory where the results, e.g. 
json file, will be stored\n", 92 | "resultpath = '/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/results_tutorial/' " 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "modelname = 'my_bestmodel'\n", 104 | "model_reloaded = storage.loadmodel(resultpath,modelname)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 10, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "def split_train_test(X_list, y_list, j):\n", 116 | " X_train = np.concatenate(X_list[0:j]+X_list[j+1:])\n", 117 | " X_test = X_list[j]\n", 118 | " y_train = np.concatenate(y_list[0:j]+y_list[j+1:])\n", 119 | " y_test = y_list[j]\n", 120 | " return X_train, y_train, X_test, y_test\n", 121 | "\n", 122 | "def split_train_small_val(X_list, y_list, j, trainsize=500, valsize=500):\n", 123 | " X = np.concatenate(X_list[0:j]+X_list[j+1:])\n", 124 | " y = np.concatenate(y_list[0:j]+y_list[j+1:])\n", 125 | " rand_ind = np.random.choice(X.shape[0], trainsize+valsize, replace=False)\n", 126 | " X_train = X[rand_ind[:trainsize]]\n", 127 | " y_train = y[rand_ind[:trainsize]]\n", 128 | " X_val = X[rand_ind[trainsize:]]\n", 129 | " y_val = y[rand_ind[trainsize:]]\n", 130 | " return X_train, y_train, X_val, y_val" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 11, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "from keras.models import model_from_json\n", 142 | "\n", 143 | "def get_fresh_copy(model, lr):\n", 144 | " model_json = model.to_json()\n", 145 | " model_copy = model_from_json(model_json)\n", 146 | " model_copy.compile(loss='categorical_crossentropy',\n", 147 | " optimizer=Adam(lr=lr),\n", 148 | " metrics=['accuracy'])\n", 149 | " #for layer in model_copy.layers:\n", 150 | " # layer.build(layer.input_shape)\n", 151 | " return model_copy" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Train the best model for real" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Now that we have identified the best model architecture out of our random pool of models we can continue by training the model on the full training sample. For the purpose of speeding up the example we only train the full model on the first 1000 values. You will need to replace this by 'datasize = X_train.shape[0]' in a real world example." 
166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 8, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "best_model = model_reloaded" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 16, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "import json\n", 188 | "with open(resultpath+'modelcomparison.json', 'r') as outfile:\n", 189 | " model_json = json.load(outfile)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 20, 195 | "metadata": { 196 | "collapsed": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "best_params = model_json[0]" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 22, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [ 210 | { 211 | "name": "stdout", 212 | "output_type": "stream", 213 | "text": [ 214 | "Train on 14663 samples, validate on 2155 samples\n", 215 | "Epoch 1/2\n", 216 | "14663/14663 [==============================] - 490s - loss: 1.0568 - acc: 0.8655 - val_loss: 0.9868 - val_acc: 0.8297\n", 217 | "Epoch 2/2\n", 218 | "14663/14663 [==============================] - 523s - loss: 0.5656 - acc: 0.9360 - val_loss: 0.8527 - val_acc: 0.8278\n", 219 | "Train on 14528 samples, validate on 2290 samples\n", 220 | "Epoch 1/2\n", 221 | "14528/14528 [==============================] - 496s - loss: 1.0377 - acc: 0.8771 - val_loss: 0.8812 - val_acc: 0.7638\n", 222 | "Epoch 2/2\n", 223 | "14528/14528 [==============================] - 2782s - loss: 0.5823 - acc: 0.9290 - val_loss: 0.8818 - val_acc: 0.7258\n", 224 | "Train on 15344 samples, validate on 1474 samples\n", 225 | "Epoch 1/2\n", 226 | "15344/15344 [==============================] - 1015s - loss: 1.0461 - acc: 0.8672 - val_loss: 0.2425 - val_acc: 0.9512\n", 227 | "Epoch 2/2\n", 228 | "15344/15344 [==============================] - 518s - loss: 0.5721 - acc: 0.9327 - val_loss: 0.1883 - val_acc: 0.9478\n", 229 | "Train on 14799 samples, validate on 2019 samples\n", 230 | "Epoch 1/2\n", 231 | "14799/14799 [==============================] - 229s - loss: 1.0516 - acc: 0.8664 - val_loss: 0.6108 - val_acc: 0.8920\n", 232 | "Epoch 2/2\n", 233 | "14799/14799 [==============================] - 241s - loss: 0.6011 - acc: 0.9263 - val_loss: 0.4059 - val_acc: 0.9188\n", 234 | "Train on 14438 samples, validate on 2380 samples\n", 235 | "Epoch 1/2\n", 236 | "14438/14438 [==============================] - 240s - loss: 1.0530 - acc: 0.8686 - val_loss: 0.6165 - val_acc: 0.8597\n", 237 | "Epoch 2/2\n", 238 | "14438/14438 [==============================] - 261s - loss: 0.5826 - acc: 0.9341 - val_loss: 0.6550 - val_acc: 0.8122\n", 239 | "Train on 14639 samples, validate on 2179 samples\n", 240 | "Epoch 1/2\n", 241 | "14639/14639 [==============================] - 237s - loss: 1.0891 - acc: 0.8577 - val_loss: 0.4885 - val_acc: 0.9197\n", 242 | "Epoch 2/2\n", 243 | "14639/14639 [==============================] - 244s - loss: 0.6007 - acc: 0.9292 - val_loss: 0.3850 - val_acc: 0.9101\n", 244 | "Train on 14811 samples, validate on 2007 samples\n", 245 | "Epoch 1/2\n", 246 | "14811/14811 [==============================] - 244s - loss: 1.0595 - acc: 0.8606 - val_loss: 0.3250 - val_acc: 0.9482\n", 247 | "Epoch 2/2\n", 248 | "14811/14811 [==============================] - 278s - loss: 0.5904 - acc: 0.9294 - val_loss: 0.2464 - val_acc: 0.9307\n", 249 | "Train on 14543 samples, validate on 2275 samples\n", 250 | 
"Epoch 1/2\n", 251 | "14543/14543 [==============================] - 251s - loss: 1.0374 - acc: 0.8742 - val_loss: 1.8568 - val_acc: 0.5200\n", 252 | "Epoch 2/2\n", 253 | "14543/14543 [==============================] - 247s - loss: 0.5543 - acc: 0.9392 - val_loss: 2.5112 - val_acc: 0.4686\n", 254 | "Train on 16779 samples, validate on 39 samples\n", 255 | "Epoch 1/2\n", 256 | "16779/16779 [==============================] - 261s - loss: 1.0542 - acc: 0.8620 - val_loss: 0.1967 - val_acc: 0.9744\n", 257 | "Epoch 2/2\n", 258 | "16779/16779 [==============================] - 292s - loss: 0.5617 - acc: 0.9311 - val_loss: 0.1018 - val_acc: 0.9744\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "nr_epochs = 2\n", 264 | "\n", 265 | "np.random.seed(123)\n", 266 | "histories, test_accuracies_list, models = [], [], []\n", 267 | "for j in range(len(Xs)):\n", 268 | " X_train, y_train, X_test, y_test = split_train_test(Xs, ys, j)\n", 269 | " model_copy = get_fresh_copy(best_model, best_params['learning_rate'])\n", 270 | " datasize = X_train.shape[0]\n", 271 | " \n", 272 | " history = model_copy.fit(X_train[:datasize,:,:], y_train[:datasize,:],\n", 273 | " nb_epoch=nr_epochs, validation_data=(X_test, y_test))\n", 274 | " \n", 275 | " histories.append(history)\n", 276 | " test_accuracies_list.append(history.history['val_acc'][-1] )\n", 277 | " models.append(model_copy)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 23, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "0.835116382719\n" 292 | ] 293 | }, 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "[0.82784222737819024,\n", 298 | " 0.72576419198356856,\n", 299 | " 0.94776119402985071,\n", 300 | " 0.91877166914314012,\n", 301 | " 0.81218487384940397,\n", 302 | " 0.91005048189977267,\n", 303 | " 0.9307424017132131,\n", 304 | " 0.46857142858452849,\n", 305 | " 0.97435897588729858]" 306 | ] 307 | }, 308 | "execution_count": 23, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "print(np.mean(test_accuracies_list))\n", 315 | "test_accuracies_list" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 24, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [ 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "fold 0\n", 330 | "fold 1\n", 331 | "fold 2\n", 332 | "fold 3\n", 333 | "fold 4\n", 334 | "fold 5\n", 335 | "fold 6\n", 336 | "fold 7\n", 337 | "fold 8\n" 338 | ] 339 | } 340 | ], 341 | "source": [ 342 | "# Calculate 1-NN for each fold:\n", 343 | "nr_epochs = 2\n", 344 | "\n", 345 | "np.random.seed(123)\n", 346 | "knn_test_accuracies_list = []\n", 347 | "for j in range(len(Xs)):\n", 348 | " print(\"fold \", j)\n", 349 | " X_train, y_train, X_test, y_test = split_train_test(Xs, ys, j)\n", 350 | " acc = find_architecture.kNN_accuracy(X_train, y_train, X_test, y_test, k=1)\n", 351 | " knn_test_accuracies_list.append(acc )" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 25, 357 | "metadata": { 358 | "collapsed": false 359 | }, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "0.53974709837\n" 366 | ] 367 | }, 368 | { 369 | "data": { 370 | "text/html": [ 371 | "
\n", 372 | "\n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | "
CNNkNN
00.8278420.611601
10.7257640.610044
20.9477610.613976
30.9187720.523031
40.8121850.615966
50.9100500.523176
60.9307420.603886
70.4685710.371429
80.9743590.384615
\n", 428 | "
" 429 | ], 430 | "text/plain": [ 431 | " CNN kNN\n", 432 | "0 0.827842 0.611601\n", 433 | "1 0.725764 0.610044\n", 434 | "2 0.947761 0.613976\n", 435 | "3 0.918772 0.523031\n", 436 | "4 0.812185 0.615966\n", 437 | "5 0.910050 0.523176\n", 438 | "6 0.930742 0.603886\n", 439 | "7 0.468571 0.371429\n", 440 | "8 0.974359 0.384615" 441 | ] 442 | }, 443 | "execution_count": 25, 444 | "metadata": {}, 445 | "output_type": "execute_result" 446 | } 447 | ], 448 | "source": [ 449 | "print(np.mean(knn_test_accuracies_list))\n", 450 | "accs_compared = pd.DataFrame({'CNN': test_accuracies_list, 'kNN':knn_test_accuracies_list})\n", 451 | "accs_compared" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "### Saving, loading and comparing reloaded model with orignal model" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "The modoel can be saved for future use. The savemodel function will save two separate files: a json file for the architecture and a npy (numpy array) file for the weights." 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 26, 471 | "metadata": { 472 | "collapsed": true 473 | }, 474 | "outputs": [], 475 | "source": [ 476 | "modelname = 'my_bestmodel'" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 27, 482 | "metadata": { 483 | "collapsed": false 484 | }, 485 | "outputs": [], 486 | "source": [ 487 | "for i, model in enumerate(models):\n", 488 | " storage.savemodel(model,resultpath,modelname+str(i))" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "collapsed": true 496 | }, 497 | "outputs": [], 498 | "source": [] 499 | } 500 | ], 501 | "metadata": { 502 | "anaconda-cloud": {}, 503 | "kernelspec": { 504 | "display_name": "Python [conda env:mcfly]", 505 | "language": "python", 506 | "name": "conda-env-mcfly-py" 507 | }, 508 | "language_info": { 509 | "codemirror_mode": { 510 | "name": "ipython", 511 | "version": 3 512 | }, 513 | "file_extension": ".py", 514 | "mimetype": "text/x-python", 515 | "name": "python", 516 | "nbconvert_exporter": "python", 517 | "pygments_lexer": "ipython3", 518 | "version": "3.5.2" 519 | } 520 | }, 521 | "nbformat": 4, 522 | "nbformat_minor": 1 523 | } 524 | -------------------------------------------------------------------------------- /notebooks/experiments/experiment_skipconnections.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exploration of skip connection implementation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "raw", 12 | "metadata": {}, 13 | "source": [ 14 | "In this notebook we explore how skip connections can be implemented. We use the PAMAP2 dataset/tutorial as test case.\n", 15 | "\n", 16 | "As an example dataset we use the publicly available [PAMAP2 dataset](https://archive.ics.uci.edu/ml/datasets/PAMAP2+Physical+Activity+Monitoring). It contains time series data from movement sensors worn by nine individuals. The data is labelled with the activity types that these individuals did and the aim is to train and evaluate a *classifier*.\n", 17 | "\n", 18 | "Before you can start, please make sure you install mcfly (see the [mcfly installation page](https://github.com/NLeSC/mcfly))." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Import required Python modules" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "scrolled": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stderr", 37 | "output_type": "stream", 38 | "text": [ 39 | "Using TensorFlow backend.\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "import sys\n", 45 | "import os\n", 46 | "import numpy as np\n", 47 | "import pandas as pd\n", 48 | "# mcfly\n", 49 | "from mcfly import modelgen, find_architecture, storage\n", 50 | "from keras.models import load_model\n", 51 | "np.random.seed(2)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "sys.path.insert(0, os.path.abspath('../..'))\n", 61 | "from utils import tutorial_pamap2" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Download data pre-procesed data" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "We have created a function for you to fetch the preprocessed data from https://zenodo.org/record/834467. Please specify the `directory_to_extract_to` in the code below and then execute the cell. This will download the preprocessed data into the directory in the `data` subdirectory. The output of the function is the path where the preprocessed data was stored." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# Specify in which directory you want to store the data:\n", 85 | "directory_to_extract_to = '.'" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "Downloading data...\n", 98 | "Extracting data...\n", 99 | "Done\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "data_path = tutorial_pamap2.download_preprocessed_data(directory_to_extract_to)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 5, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "data": { 114 | "text/plain": [ 115 | "'./data/PAMAP2/preprocessed'" 116 | ] 117 | }, 118 | "execution_count": 5, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "data_path" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## Load the pre-processed data" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "Load the preprocessed data as stored in Numpy-files. Please note that the data has already been split up in a training (training), validation (val), and test subsets. It is common practice to call the input data X and the labels y." 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 6, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "X_train, y_train_binary, X_val, y_val_binary, X_test, y_test_binary, labels = tutorial_pamap2.load_data(data_path)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Data X and labels y are of type Numpy array. In the cell below we inspect the shape of the data. 
As you can see the shape of X is expressed as a Python tuple containing: the number of samples, length of the time series, and the number of channels for each sample. Similarly, the shape of y is represents the number of samples and the number of classes (unique labels). Note that y has the format of a binary array where only the correct class for each sample is assigned a 1. This is called one-hot-encoding." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 7, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "x shape: (11397, 512, 9)\n", 167 | "y shape: (11397, 7)\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "print('x shape:', X_train.shape)\n", 173 | "print('y shape:', y_train_binary.shape)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "The data is split between train test and validation." 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 8, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "train set size: 11397\n", 193 | "validation set size: 100\n", 194 | "test set size: 1000\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "print('train set size:', X_train.shape[0])\n", 200 | "print('validation set size:', X_val.shape[0])\n", 201 | "print('test set size:', X_test.shape[0])" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "Let's have a look at the distribution of the labels:" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 9, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/html": [ 219 | "
\n", 220 | "\n", 233 | "\n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | "
frequency
lying0.136615
sitting0.130736
standing0.136703
walking0.176625
cycling0.118540
vaccuum_cleaning0.125208
ironing0.175573
\n", 271 | "
" 272 | ], 273 | "text/plain": [ 274 | " frequency\n", 275 | "lying 0.136615\n", 276 | "sitting 0.130736\n", 277 | "standing 0.136703\n", 278 | "walking 0.176625\n", 279 | "cycling 0.118540\n", 280 | "vaccuum_cleaning 0.125208\n", 281 | "ironing 0.175573" 282 | ] 283 | }, 284 | "execution_count": 9, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "frequencies = y_train_binary.mean(axis=0)\n", 291 | "frequencies_df = pd.DataFrame(frequencies, index=labels, columns=['frequency'])\n", 292 | "frequencies_df" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "### *Question 1: How many channels does this dataset have?*\n", 300 | "### *Question 2: What is the least common activity label in this dataset?*\n", 301 | "\n", 302 | " " 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "## Generate models" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "First step in the development of any deep learning model is to create a model architecture. As we do not know what architecture is best for our data we will create a set of random models to investigate which architecture is most suitable for our data and classification task. This process, creating random models, checking how good they are and then selecting the best one is called a 'random search'. A random search is considered to be the most robust approach to finding a good model. You will need to specificy how many models you want to create with argument 'number_of_models'. See for a full overview of the optional arguments the function documentation of modelgen.generate_models by running `modelgen.generate_models?`.\n", 317 | "\n", 318 | "##### What number of models to select?\n", 319 | "This number differs per dataset. More models will give better results but it will take longer to evaluate them. For the purpose of this tutorial we recommend trying only 2 models to begin with. If you have enough time you can try a larger number of models, e.g. 10 or 20 models. Because mcfly uses random search, you will get better results when using more models." 
320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 10, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "num_classes = y_train_binary.shape[1]\n", 329 | "\n", 330 | "models = modelgen.generate_models(X_train.shape,\n", 331 | " number_of_classes=num_classes,\n", 332 | " number_of_models = 2)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 11, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/plain": [ 343 | "(11397, 512, 9)" 344 | ] 345 | }, 346 | "execution_count": 11, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | } 350 | ], 351 | "source": [ 352 | "X_train.shape" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 12, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "from keras.layers import Input\n", 362 | "from keras.layers.convolutional import Conv2D\n", 363 | "from keras.layers import BatchNormalization, Activation, Convolution1D, Lambda, \\\n", 364 | " Convolution2D, Flatten, \\\n", 365 | " Reshape, LSTM, Dropout, TimeDistributed, BatchNormalization\n", 366 | "from keras.regularizers import l2\n", 367 | "import keras" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 13, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "x = Input(shape=(512, 3))\n", 377 | "\n", 378 | "# 1x3 conv with 3 output channels (same as input channels)\n", 379 | "y = Convolution1D(3, (3), padding='same')(x)\n", 380 | "# this returns x + y.\n", 381 | "z = keras.layers.add([x, y],name='skipconnection')" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 14, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | " nn = keras.models.Model(inputs=x, outputs=z)\n", 391 | " " 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 15, 397 | "metadata": {}, 398 | "outputs": [ 399 | { 400 | "name": "stdout", 401 | "output_type": "stream", 402 | "text": [ 403 | "__________________________________________________________________________________________________\n", 404 | "Layer (type) Output Shape Param # Connected to \n", 405 | "==================================================================================================\n", 406 | "input_1 (InputLayer) (None, 512, 3) 0 \n", 407 | "__________________________________________________________________________________________________\n", 408 | "conv1d_4 (Conv1D) (None, 512, 3) 30 input_1[0][0] \n", 409 | "__________________________________________________________________________________________________\n", 410 | "skipconnection (Add) (None, 512, 3) 0 input_1[0][0] \n", 411 | " conv1d_4[0][0] \n", 412 | "==================================================================================================\n", 413 | "Total params: 30\n", 414 | "Trainable params: 30\n", 415 | "Non-trainable params: 0\n", 416 | "__________________________________________________________________________________________________\n" 417 | ] 418 | } 419 | ], 420 | "source": [ 421 | "nn.summary()" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [] 444 | } 445 | ], 446 | 
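The single addition demonstrated above can be wrapped in a helper and stacked. A sketch, assuming the Keras imports from the cells above; the filter count, the ReLU activation and the three repetitions are illustrative choices:

def residual_block(tensor, filters=3, kernel_size=3):
    """One convolution whose output is added back onto its input."""
    # `filters` must match the channel count of `tensor`, otherwise the addition is invalid.
    conv = Convolution1D(filters, kernel_size, padding='same')(tensor)
    conv = BatchNormalization()(conv)
    conv = Activation('relu')(conv)
    return keras.layers.add([tensor, conv])

inp = Input(shape=(512, 3))
h = inp
for _ in range(3):                      # stack three skip-connection blocks
    h = residual_block(h)
nn_stacked = keras.models.Model(inputs=inp, outputs=h)
nn_stacked.summary()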
"metadata": { 447 | "anaconda-cloud": {}, 448 | "kernelspec": { 449 | "display_name": "Python 3", 450 | "language": "python", 451 | "name": "python3" 452 | }, 453 | "language_info": { 454 | "codemirror_mode": { 455 | "name": "ipython", 456 | "version": 3 457 | }, 458 | "file_extension": ".py", 459 | "mimetype": "text/x-python", 460 | "name": "python", 461 | "nbconvert_exporter": "python", 462 | "pygments_lexer": "ipython3", 463 | "version": "3.6.6" 464 | } 465 | }, 466 | "nbformat": 4, 467 | "nbformat_minor": 1 468 | } 469 | -------------------------------------------------------------------------------- /notebooks/experiments/preproces_Guinea-Bisseau_Nigeria.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Load Guinea-Bissau data and save as numpy file" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "from pandas import merge \n", 21 | "from os import listdir\n", 22 | "from numpy import genfromtxt, random" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": { 29 | "collapsed": false 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "datadir = \"/media/windows-share/EEG/EEGs_Guinea-Bissau_cleaned\"\n", 34 | "outputdir = \"/media/windows-share/EEG/EEGs_Guinea-Bissau_np\"\n", 35 | "namecontrol = 'Control'\n", 36 | "nameepilepsy = 'Epilepsy'\n", 37 | "#datadir = \"/media/windows-share/EEG/EEGs_Nigeria_cleaned\"\n", 38 | "#outputdir = \"/media/windows-share/EEG/EEGs_Nigeria_np\"\n", 39 | "#namecontrol = 'control'\n", 40 | "#nameepilepsy = 'epilepsy'\n", 41 | "filenames = listdir(datadir)\n", 42 | "D = []\n", 43 | "sf = 128\n", 44 | "nc = 14\n", 45 | "#Nfiles = len(filenames)\n", 46 | "#X = np.zeros((Nfiles,maxtslength,nc)) " 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "id = list(map(int,list(map(lambda file: file[file.find('id')+2:file.find('dur')-1],filenames))))\n", 58 | "dur = list(map(int,list(map(lambda file: file[file.find('dur')+3:file.find('epoch')-1],filenames))))\n", 59 | "#epoch = list(map(int,list(map(lambda file: file[file.find('epoch')+5:file.find('gro')-1],filenames))))\n", 60 | "group = list(map(str,list(map(lambda file: file[file.find('gro')+3:file.find('.csv')],filenames))))\n", 61 | "protocol = list(map(str,list(map(lambda file: file[file.find('yes')+3:file.find('id')-1],filenames))))" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "mydata = id, dur, group\n", 73 | "df = pd.DataFrame.from_items([('id',id),('dur',dur),('group',group),('filenames',filenames),\n", 74 | " ('protocol',protocol)])" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 6, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "def getids(x,y,prop,N):\n", 86 | " ix = np.sort(np.random.choice(x,round(N*prop),replace=False))\n", 87 | " iy = np.sort(np.random.choice(y,round(N*(1-prop)),replace=False))\n", 88 | " if (len(ix)+len(iy)) < 20:\n", 89 | " print(prop,N,len(x),len(y))\n", 90 | " x = [x for i,x in enumerate(x) if x not in ix] \n", 91 | " y = [x for 
i,x in enumerate(y) if x not in iy] \n", 92 | " icon = np.concatenate((ix,iy))\n", 93 | " return icon, x, y" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 7, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "train_10seconds_open 91\n", 108 | "(413, 1280, 14)\n", 109 | "(413, 1)\n", 110 | "valid_10seconds_open 20\n", 111 | "(141, 1280, 14)\n", 112 | "(141, 1)\n", 113 | "test_10seconds_open 20\n", 114 | "(119, 1280, 14)\n", 115 | "(119, 1)\n", 116 | "train_10seconds_closed 84\n", 117 | "(440, 1280, 14)\n", 118 | "(440, 1)\n", 119 | "valid_10seconds_closed 20\n", 120 | "(145, 1280, 14)\n", 121 | "(145, 1)\n", 122 | "test_10seconds_closed 20\n", 123 | "(162, 1280, 14)\n", 124 | "(162, 1)\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "logstructure = []\n", 130 | "for mindur in [10]: # minimum duration of an epoch in seconds\n", 131 | " for protocol in ['open','closed']:\n", 132 | " df2 = df[(df['protocol']==protocol) & (df['dur'] >= mindur)]\n", 133 | " maxtslength = mindur * sf\n", 134 | " #Identify training, test and validation group\n", 135 | " con = np.unique(df2[df2['group'] == namecontrol]['id'])\n", 136 | " epi = np.unique(df2[df2['group'] == nameepilepsy]['id'])\n", 137 | " Nid = len(con) + len(epi) #number of ids\n", 138 | " prop = 0.5 # len(con) / Nid #proportion of controls\n", 139 | " random.seed(300)\n", 140 | " ival, con, epi = getids(con,epi,prop,N=20) # validation set\n", 141 | " ites, con, epi = getids(con,epi,prop,N=20) # test set\n", 142 | " itra = np.concatenate((con, epi)) # training set\n", 143 | " #print(len(ival),len(ites),len(itra))\n", 144 | " # Now use identifies per group to load the data\n", 145 | " for subset in ['train','valid','test']:\n", 146 | " conditionname = subset+'_'+str(mindur)+'seconds_'+protocol\n", 147 | " if subset == 'train':\n", 148 | " tmp = df2[df2.id.isin(itra)]\n", 149 | " filenames = tmp['filenames']\n", 150 | " if subset == 'valid':\n", 151 | " tmp = df2[(df2.id.isin(ival))]\n", 152 | " tmp = tmp.sort_values(by=['id']).groupby('id').first() # select first available epoch\n", 153 | " filenames = tmp['filenames']\n", 154 | " if subset == 'test':\n", 155 | " tmp = df2[(df2.id.isin(ites))]\n", 156 | " tmp = tmp.sort_values(by=['id']).groupby('id').first() # select first available epoch\n", 157 | " filenames = tmp['filenames']\n", 158 | " X = np.zeros((0,maxtslength,nc)) #len(filenames)\n", 159 | " y = np.zeros((0,1)) #len(filenames)\n", 160 | " \n", 161 | " print(conditionname + ' ' + str(len(filenames)))\n", 162 | " for file in filenames:\n", 163 | " path = datadir + '/' + file\n", 164 | " D = pd.read_csv(path, sep=',',header=0,usecols=list(range(0,14)))\n", 165 | " if D.shape[0] > maxtslength:\n", 166 | " slicesize = sf * 10\n", 167 | " for slicei in range(int((len(D)/slicesize)-1)):\n", 168 | " sta = (((slicei)*slicesize))+1\n", 169 | " end = ((slicei+1)*slicesize)+1\n", 170 | " D2 = np.array(D[sta:end]) # take first part or should these be a random selection?\n", 171 | " #D = np.array(D[0:maxtslength]) # take first part or should these be a random selection?\n", 172 | " D2 = np.reshape(D2,(1,D2.shape[0],D2.shape[1]))\n", 173 | " m = D2.mean(axis=1,keepdims=True)\n", 174 | " D2 = D2 - m # subtract mean\n", 175 | " if X.shape[2] != D2.shape[2]:\n", 176 | " print(X.shape)\n", 177 | " print(D2.shape)\n", 178 | " X = np.vstack((X,D2))\n", 179 | " logstructure.append([subset,mindur,protocol,file])\n", 180 | " 
diagnosis = tmp.group[(tmp.filenames == file)]\n", 181 | " y = np.vstack((y,diagnosis))\n", 182 | " \n", 183 | " fnameX = outputdir + '/X_' + conditionname\n", 184 | " fnamey = outputdir + '/y_' + conditionname\n", 185 | " np.save(file=fnameX,arr=X)\n", 186 | " #y = np.array(tmp['group'])\n", 187 | " np.save(file=fnamey,arr=y) \n", 188 | " print(X.shape)\n", 189 | " print(y.shape)\n", 190 | "np.savetxt(outputdir + '/log.csv', logstructure,\n", 191 | " delimiter=\",\", fmt='%s')" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 8, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "/media/windows-share/EEG/EEGs_Guinea-Bissau_np/X_test_10seconds_closed\n", 206 | "/media/windows-share/EEG/EEGs_Guinea-Bissau_np/y_test_10seconds_closed\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "print(fnameX)\n", 212 | "print(fnamey)\n", 213 | "testreadX = np.load(file=fnameX+'.npy')\n", 214 | "testready = np.load(file=fnamey+'.npy')" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 9, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "(162, 1280, 14)\n", 229 | "(162, 1)\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "print(testreadX.shape)\n", 235 | "print(testready.shape)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": true 252 | }, 253 | "outputs": [], 254 | "source": [] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.5.2" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 0 278 | } 279 | -------------------------------------------------------------------------------- /notebooks/tutorial/model/model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NLeSC/mcfly-tutorial/4b1548058c158d0efef41bfb6c7b2caa575a8858/notebooks/tutorial/model/model.h5 -------------------------------------------------------------------------------- /notebooks/tutorial/workshop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tutorial PAMAP2 with mcfly (workshop version)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In preparation of the workshop, please run code below to check if mcfly is installed correctly and to download the dataset (PAMAP2).\n", 15 | "\n", 16 | "Before you can start, please make sure you install mcfly (see the [mcfly installation page](https://github.com/NLeSC/mcfly)) and make sure your jupyter notebook has a python3 kernel." 
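A minimal sanity check for the kernel, assuming only that mcfly can be imported (the version attribute is read defensively in case a release does not define it):

import sys
import mcfly

print(sys.version)                                             # should report a Python 3 interpreter
print(mcfly.__file__)                                          # which installation the kernel picked up
print(getattr(mcfly, '__version__', 'unknown mcfly version'))  # version, if exposed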
17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Import required Python modules" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "collapsed": false, 31 | "scrolled": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "import sys\n", 36 | "import os\n", 37 | "import numpy as np\n", 38 | "import pandas as pd\n", 39 | "# mcfly\n", 40 | "from mcfly import modelgen, find_architecture, storage\n", 41 | "np.random.seed(2)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "sys.path.insert(0, os.path.abspath('../..'))\n", 53 | "from utils import tutorial_pamap2" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Download data pre-procesed data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "We have created a function for you to fetch the preprocessed data. Please specify the `directory_to_extract_to` in the code below and then execute the cell. This will download the preprocessed data into the directory in the `data` subdirectory. The output of the function is the path where the preprocessed data was stored." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "# Specify in which directory you want to store the data:\n", 79 | "directory_to_extract_to = '.'" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": false 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "data_path = tutorial_pamap2.download_preprocessed_data(directory_to_extract_to)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | "outputs": [], 100 | "source": [] 101 | } 102 | ], 103 | "metadata": { 104 | "anaconda-cloud": {}, 105 | "kernelspec": { 106 | "display_name": "Python [conda root]", 107 | "language": "python", 108 | "name": "conda-root-py" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.5.2" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 1 125 | } 126 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mcfly 2 | matplotlib 3 | pandas 4 | jupyter 5 | numpy 6 | scipy 7 | -------------------------------------------------------------------------------- /scripts/Actitracker_train.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import sys 7 | import os 8 | import numpy as np 9 | import pandas as pd 10 | import json 11 | # mcfly 12 | from mcfly import modelgen, find_architecture, storage 13 | 14 | 15 | # In[2]: 16 | 17 | data_path = '/media/sf_VBox_Shared/timeseries/actitiracker/WISDM_at_v2.0/' 18 | preprocessed_path = os.path.join(data_path, 'preprocessed') 19 | result_path = os.path.join(data_path, 'models_test') 20 | 21 | 22 | 23 | # In[3]: 24 | 25 | X_train = 
np.load(os.path.join(preprocessed_path, 'X_train.npy')) 26 | X_val = np.load(os.path.join(preprocessed_path, 'X_val.npy')) 27 | X_test = np.load(os.path.join(preprocessed_path, 'X_test.npy')) 28 | y_train = np.load(os.path.join(preprocessed_path, 'y_train.npy')) 29 | y_val = np.load(os.path.join(preprocessed_path, 'y_val.npy')) 30 | y_test = np.load(os.path.join(preprocessed_path, 'y_test.npy')) 31 | 32 | 33 | 34 | with open(os.path.join(preprocessed_path, 'labels.json')) as f: 35 | labels = json.load(f) 36 | 37 | 38 | # ## Generate models 39 | 40 | 41 | num_classes = y_train.shape[1] 42 | 43 | models = modelgen.generate_models(X_train.shape, 44 | number_of_classes=num_classes, 45 | number_of_models = 15) 46 | 47 | 48 | 49 | 50 | #what is the fraction of classes in the validation set? 51 | pd.Series(y_val.mean(axis=0), index=labels) 52 | 53 | 54 | if not os.path.exists(result_path): 55 | os.makedirs(result_path) 56 | 57 | 58 | 59 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train, 60 | X_val, y_val, 61 | models,nr_epochs=5, 62 | subset_size=512, 63 | verbose=True, 64 | batch_size=32, 65 | outputpath=result_path, 66 | early_stopping=True) 67 | 68 | 69 | 70 | print('Details of the training process were stored in ',os.path.join(result_path, 'models.json')) 71 | 72 | 73 | 74 | best_model_index = np.argmax(val_accuracies) 75 | best_model, best_params, best_model_types = models[best_model_index] 76 | print('Model type and parameters of the best model:') 77 | print(best_model_types) 78 | print(best_params) 79 | 80 | 81 | nr_epochs = 3 82 | datasize = X_train.shape[0] 83 | history = best_model.fit(X_train[:datasize,:,:], y_train[:datasize,:], 84 | epochs=nr_epochs, validation_data=(X_val, y_val)) 85 | 86 | 87 | best_model.save(os.path.join(result_path, 'best_model.h5')) 88 | 89 | 90 | 91 | ## Test on Testset 92 | score_test = best_model.evaluate(X_test, y_test, verbose=True) 93 | print('Score of best model: ' + str(score_test)) 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /scripts/EEG_alcoholic_train.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[2]: 5 | 6 | import sys 7 | import os 8 | import numpy as np 9 | import pandas as pd 10 | # mcfly 11 | from mcfly import modelgen, find_architecture, storage 12 | 13 | # Parameters 14 | data_path = '/media/sf_VBox_Shared/timeseries/UCI_EEG_alcoholic/' 15 | number_of_models = 10 16 | nr_epochs = 5 17 | subset_size = 512 18 | batch_size = 32 19 | early_stopping = True 20 | 21 | # In[3]: 22 | 23 | 24 | preprocessed_path = os.path.join(data_path, 'preprocessed') 25 | result_path = os.path.join(data_path, 'models') 26 | 27 | 28 | # In[4]: 29 | 30 | 31 | X_train = np.load(os.path.join(preprocessed_path, 'X_train.npy')) 32 | X_val = np.load(os.path.join(preprocessed_path, 'X_val.npy')) 33 | X_test = np.load(os.path.join(preprocessed_path, 'X_test.npy')) 34 | y_train = np.load(os.path.join(preprocessed_path, 'y_train.npy')) 35 | y_val = np.load(os.path.join(preprocessed_path, 'y_val.npy')) 36 | y_test = np.load(os.path.join(preprocessed_path, 'y_test.npy')) 37 | 38 | 39 | # ## Generate models 40 | 41 | # In[5]: 42 | 43 | num_classes = y_train.shape[1] 44 | 45 | models = modelgen.generate_models(X_train.shape, 46 | number_of_classes=num_classes, 47 | number_of_models = number_of_models) 48 | 49 | 50 | # In[6]: 51 | 52 | #what is the fraction of a vs c in the validation set? 
53 | y_val.mean(axis=0) 54 | 55 | 56 | # In[7]: 57 | 58 | if not os.path.exists(result_path): 59 | os.makedirs(result_path) 60 | 61 | 62 | # In[ ]: 63 | 64 | outputfile = os.path.join(result_path, 'modelcomparison.json') 65 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train, 66 | X_val, y_val, 67 | models,nr_epochs=nr_epochs, 68 | subset_size=subset_size, 69 | verbose=True, 70 | batch_size=batch_size, 71 | outputfile=outputfile, 72 | early_stopping=early_stopping) 73 | print('Details of the training process were stored in ',outputfile) 74 | 75 | 76 | # In[ ]: 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /scripts/experiment_PAMAP.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Experiment PAMAP with mcfly 5 | 6 | # ## Import required Python modules 7 | 8 | # In[1]: 9 | 10 | import sys 11 | import os 12 | import numpy as np 13 | import pandas as pd 14 | # mcfly 15 | from mcfly import modelgen, find_architecture, storage 16 | from keras.models import load_model 17 | np.random.seed(2) 18 | 19 | 20 | # In[2]: 21 | 22 | sys.path.insert(0, os.path.abspath('../..')) 23 | from utils import tutorial_pamap2 24 | 25 | 26 | # Load the preprocessed data as stored in Numpy-files. Please note that the data has already been split up in a training (training), validation (val), and test subsets. It is common practice to call the input data X and the labels y. 27 | 28 | # In[3]: 29 | 30 | data_path = '/media/sf_VBox_Shared/timeseries/PAMAP_Dataset/cleaned_7act/' 31 | 32 | 33 | # In[4]: 34 | 35 | X_train, y_train_binary, X_val, y_val_binary, X_test, y_test_binary, labels = tutorial_pamap2.load_data(data_path) 36 | 37 | 38 | # In[5]: 39 | 40 | print('x shape:', X_train.shape) 41 | print('y shape:', y_train_binary.shape) 42 | 43 | 44 | # The data is split between train test and validation. 45 | 46 | # In[6]: 47 | 48 | print('train set size:', X_train.shape[0]) 49 | print('validation set size:', X_val.shape[0]) 50 | print('test set size:', X_test.shape[0]) 51 | 52 | 53 | # Let's have a look at the distribution of the labels: 54 | 55 | # In[7]: 56 | 57 | frequencies = y_train_binary.mean(axis=0) 58 | frequencies_df = pd.DataFrame(frequencies, index=labels, columns=['frequency']) 59 | frequencies_df 60 | 61 | 62 | # ## Generate models 63 | 64 | # In[8]: 65 | 66 | num_classes = y_train_binary.shape[1] 67 | 68 | models = modelgen.generate_models(X_train.shape, 69 | number_of_classes=num_classes, 70 | number_of_models = 5) 71 | 72 | 73 | # In[10]: 74 | 75 | models_to_print = range(len(models)) 76 | for i, item in enumerate(models): 77 | if i in models_to_print: 78 | model, params, model_types = item 79 | print("-------------------------------------------------------------------------------------------------------") 80 | print("Model " + str(i)) 81 | print(" ") 82 | print("Hyperparameters:") 83 | print(params) 84 | print(" ") 85 | print("Model description:") 86 | model.summary() 87 | print(" ") 88 | print("Model type:") 89 | print(model_types) 90 | print(" ") 91 | 92 | 93 | # ## Compare models 94 | 95 | # In[13]: 96 | 97 | # Define directory where the results, e.g. 
json file, will be stored 98 | resultpath = os.path.join(data_path, '..', 'data/models') 99 | if not os.path.exists(resultpath): 100 | os.makedirs(resultpath) 101 | 102 | 103 | # In[14]: 104 | 105 | outputfile = os.path.join(resultpath, 'modelcomparison_pamap.json') 106 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train_binary, 107 | X_val, y_val_binary, 108 | models,nr_epochs=5, 109 | subset_size=1000, 110 | verbose=True, 111 | outputfile=outputfile) 112 | print('Details of the training process were stored in ',outputfile) 113 | 114 | 115 | # In[15]: 116 | 117 | best_model_index = np.argmax(val_accuracies) 118 | best_model, best_params, best_model_types = models[best_model_index] 119 | print('Model type and parameters of the best model:') 120 | print(best_model_types) 121 | print(best_params) 122 | 123 | 124 | # ## Train the best model on the full dataset 125 | 126 | # In[16]: 127 | 128 | #We make a copy of the model, to start training from fresh 129 | nr_epochs = 1 130 | datasize = X_train.shape[0] 131 | history = best_model.fit(X_train[:datasize,:,:], y_train_binary[:datasize,:], 132 | epochs=nr_epochs, validation_data=(X_val, y_val_binary)) 133 | 134 | 135 | # In[17]: 136 | 137 | modelname = 'my_bestmodel.h5' 138 | model_path = os.path.join(resultpath,modelname) 139 | 140 | 141 | # In[18]: 142 | 143 | best_model.save(model_path) 144 | 145 | 146 | # In[ ]: 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /scripts/experiment_PAMAP2_9fold.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Experiment PAMAP2 with mcfly 5 | 6 | # This experiment finds an optimal model for the PAMAP2 dataset. 7 | 8 | # ## Import required Python modules 9 | 10 | # In[1]: 11 | 12 | import sys 13 | import os 14 | import numpy as np 15 | import pandas as pd 16 | # mcfly 17 | from mcfly import modelgen, find_architecture, storage 18 | 19 | 20 | # In[11]: 21 | 22 | trainsize = 500 23 | valsize = 100 24 | nr_models = 10 25 | nr_epochs = 10 26 | subset_size = trainsize 27 | 28 | 29 | # ## Load the data 30 | 31 | # In[2]: 32 | 33 | # Define directory where the results, e.g. json file, will be stored 34 | datapath = '/data/mcfly/input' 35 | resultpath = '/data/mcfly/output' 36 | if not os.path.exists(resultpath): 37 | os.makedirs(resultpath) 38 | 39 | 40 | # In[3]: 41 | 42 | Xs = [] 43 | ys = [] 44 | 45 | ext = '.npy' 46 | for i in range(9): 47 | Xs.append(np.load(os.path.join(datapath,'X_'+str(i)+ext))) 48 | ys.append(np.load(os.path.join(datapath, 'y_'+str(i)+ext))) 49 | 50 | 51 | # In[4]: 52 | 53 | print(Xs[0].shape, ys[0].shape) 54 | 55 | 56 | # ## Generate models 57 | 58 | # First step is to create a model architecture. As we do not know what architecture is best for our data we will create a set of models to investigate which architecture is most suitable for our data and classification task. You will need to specificy how many models you want to create with argument 'number_of_models', the type of model which can been 'CNN' or 'DeepConvLSTM', and maximum number of layers per modeltype. 
See for a full overview of the optional arguments the function documentation of modelgen.generate_models 59 | 60 | # In[16]: 61 | 62 | num_classes = ys[0].shape[1] 63 | np.random.seed(123) 64 | models = modelgen.generate_models(Xs[0].shape, 65 | number_of_classes=num_classes, 66 | number_of_models = nr_models) 67 | 68 | 69 | # In[ ]: 70 | 71 | 72 | 73 | 74 | # In[19]: 75 | 76 | for i, (model, params, model_type) in enumerate(models): 77 | storage.savemodel(model,resultpath,"model_"+str(i)) 78 | 79 | 80 | # ## Compare models 81 | # Now that the model architectures have been generated it is time to compare the models by training them in a subset of the training data and evaluating the models in the validation subset. This will help us to choose the best candidate model. Performance results are stored in a json file. 82 | 83 | # In[25]: 84 | 85 | def split_train_test(X_list, y_list, j): 86 | X_train = np.concatenate(X_list[0:j]+X_list[j+1:]) 87 | X_test = X_list[j] 88 | y_train = np.concatenate(y_list[0:j]+y_list[j+1:]) 89 | y_test = y_list[j] 90 | return X_train, y_train, X_test, y_test 91 | 92 | def split_train_small_val(X_list, y_list, j, trainsize=500, valsize=500): 93 | X = np.concatenate(X_list[0:j]+X_list[j+1:]) 94 | y = np.concatenate(y_list[0:j]+y_list[j+1:]) 95 | rand_ind = np.random.choice(X.shape[0], trainsize+valsize, replace=False) 96 | X_train = X[rand_ind[:trainsize]] 97 | y_train = y[rand_ind[:trainsize]] 98 | X_val = X[rand_ind[trainsize:]] 99 | y_val = y[rand_ind[trainsize:]] 100 | return X_train, y_train, X_val, y_val 101 | 102 | 103 | # In[26]: 104 | 105 | from keras.optimizers import Adam 106 | from keras.models import model_from_json 107 | 108 | def get_fresh_copy(model, lr): 109 | model_json = model.to_json() 110 | model_copy = model_from_json(model_json) 111 | model_copy.compile(loss='categorical_crossentropy', 112 | optimizer=Adam(lr=lr), 113 | metrics=['accuracy']) 114 | #for layer in model_copy.layers: 115 | # layer.build(layer.input_shape) 116 | return model_copy 117 | 118 | 119 | # In[10]: 120 | 121 | models = [(get_fresh_copy(model, params['learning_rate']), params, model_type) for model, params, model_type in models] 122 | 123 | 124 | # In[12]: 125 | 126 | import time 127 | t = time.time() 128 | np.random.seed(123) 129 | histories_list, val_accuracies_list, val_losses_list = [], [], [] 130 | for j in range(len(Xs)): 131 | print('fold '+str(j)) 132 | models = [(get_fresh_copy(model, params['learning_rate']), params, model_type) for model, params, model_type in models] 133 | X_train, y_train, X_val, y_val = split_train_small_val(Xs, ys, j, trainsize=trainsize, valsize=valsize) 134 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train, 135 | X_val, y_val, 136 | models, 137 | nr_epochs=nr_epochs, 138 | subset_size=subset_size, 139 | verbose=True, 140 | outputfile=os.path.join(resultpath, 141 | 'experiment'+str(j)+'.json'), 142 | early_stopping=True) 143 | histories_list.append(histories) 144 | val_accuracies_list.append(val_accuracies) 145 | val_losses.append(val_losses) 146 | print(time.time()-t) 147 | 148 | 149 | # In[6]: 150 | 151 | # Read them all back in 152 | import json 153 | model_jsons = [] 154 | for j in len(Xs): 155 | with open(os.path.join(resultpath, 'experiment'+str(j)+'.json'), 'r') as outfile: 156 | model_jsons.append(json.load(outfile)) 157 | 158 | 159 | # In[12]: 160 | 161 | val_accuracies = np.array([[mod['val_acc'][-1] for mod in fold] for fold in model_jsons]) 162 | 163 | 164 | # In[13]: 165 | 166 
| val_acc = np.array([np.array([mod['val_acc'][-1] for mod in fold], dtype='float') for fold in model_jsons]) 167 | train_acc = np.array([np.array([mod['train_acc'][-1] for mod in fold], dtype='float') for fold in model_jsons]) 168 | train_loss = np.array([np.array([mod['train_loss'][-1] for mod in fold], dtype='float') for fold in model_jsons]) 169 | val_loss = np.array([np.array([mod['val_loss'][-1] for mod in fold], dtype='float') for fold in model_jsons]) 170 | 171 | 172 | # In[14]: 173 | 174 | val_accuracies_avg = val_acc.mean(axis=0) 175 | print('val_accuracies_avg:', val_accuracies_avg) 176 | 177 | 178 | # In[23]: 179 | 180 | best_model_index = np.argmax(val_accuracies_avg) 181 | best_model = storage.loadmodel(resultpath, 'model_'+str(best_model_index)) 182 | 183 | 184 | # In[28]: 185 | 186 | best_params = model_jsons[0][best_model_index] 187 | 188 | 189 | # ## Train the best model for real 190 | 191 | # Now that we have identified the best model architecture out of our random pool of models we can continue by training the model on the full training sample. For the purpose of speeding up the example we only train the full model on the first 1000 values. You will need to replace this by 'datasize = X_train.shape[0]' in a real world example. 192 | 193 | # In[ ]: 194 | 195 | nr_epochs = 2 196 | 197 | np.random.seed(123) 198 | histories, test_accuracies_list, models = [], [], [] 199 | for j in range(len(Xs)): 200 | X_train, y_train, X_test, y_test = split_train_test(Xs, ys, j) 201 | model_copy = get_fresh_copy(best_model, best_params['learning_rate']) 202 | datasize = X_train.shape[0] 203 | 204 | history = model_copy.fit(X_train[:datasize,:,:], y_train[:datasize,:], 205 | nb_epoch=nr_epochs, validation_data=(X_test, y_test)) 206 | 207 | histories.append(history) 208 | test_accuracies_list.append(history.history['val_acc'][-1] ) 209 | models.append(model_copy) 210 | 211 | 212 | # In[ ]: 213 | 214 | modelname = 'my_bestmodel' 215 | 216 | 217 | # In[ ]: 218 | 219 | for i, model in enumerate(models): 220 | storage.savemodel(model,resultpath,modelname+str(i)) 221 | 222 | 223 | # In[ ]: 224 | 225 | print('accuracies: ', test_accuracies_list) 226 | 227 | 228 | # In[ ]: 229 | 230 | print(np.mean(test_accuracies_list)) 231 | 232 | 233 | # In[ ]: 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /scripts/pamap2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to do a model comparison 3 | Run this script from the root of repository: 4 | 5 | `python scripts/pamap2.py` 6 | """ 7 | import sys 8 | import os 9 | import numpy as np 10 | import pandas as pd 11 | from mcfly import modelgen, find_architecture, storage 12 | 13 | np.random.seed(2) 14 | sys.path.insert(0, os.path.abspath('.')) 15 | print(sys.path) 16 | from utils import tutorial_pamap2 17 | 18 | # ## Settings 19 | # Specify in which directory you want to store the data: 20 | directory_to_extract_to = 'notebooks/tutorial/' 21 | number_of_models = 2 22 | subset_size = 10 23 | nr_epochs = 1 24 | 25 | # ## Download data and pre-proces data 26 | data_path = tutorial_pamap2.download_preprocessed_data(directory_to_extract_to) 27 | X_train, y_train_binary, X_val, y_val_binary, X_test, y_test_binary, labels = tutorial_pamap2.load_data(data_path) 28 | 29 | # The data is split between train test and validation. 
30 | 31 | print('train set size:', X_train.shape[0]) 32 | print('validation set size:', X_val.shape[0]) 33 | print('test set size:', X_test.shape[0]) 34 | 35 | # ## Generate models 36 | 37 | num_classes = y_train_binary.shape[1] 38 | models = modelgen.generate_models(X_train.shape, 39 | number_of_classes=num_classes, 40 | number_of_models=number_of_models) 41 | 42 | # Define output path 43 | resultpath = os.path.join(directory_to_extract_to, 'data/models') 44 | if not os.path.exists(resultpath): 45 | os.makedirs(resultpath) 46 | outputfile = os.path.join(resultpath, 'modelcomparison.json') 47 | 48 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train_binary, 49 | X_val, y_val_binary, 50 | models, nr_epochs=nr_epochs, 51 | subset_size=subset_size, 52 | verbose=True, 53 | outputfile=outputfile) 54 | print('Details of the training process were stored in ', outputfile) 55 | 56 | # # Inspect model performance (table) 57 | modelcomparisons = pd.DataFrame({'model': [str(params) for model, params, model_types in models], 58 | 'train_acc': [history.history['acc'][-1] for history in histories], 59 | 'train_loss': [history.history['loss'][-1] for history in histories], 60 | 'val_acc': [history.history['val_acc'][-1] for history in histories], 61 | 'val_loss': [history.history['val_loss'][-1] for history in histories] 62 | }) 63 | modelcomparisons.to_csv(os.path.join(resultpath, 'modelcomparisons.csv')) 64 | 65 | modelcomparisons 66 | 67 | # # Choose the best model and save it 68 | 69 | 70 | best_model_index = np.argmax(val_accuracies) 71 | best_model, best_params, best_model_types = models[best_model_index] 72 | print('Model type and parameters of the best model:') 73 | print(best_model_types) 74 | print(best_params) 75 | modelname = 'my_bestmodel' 76 | storage.savemodel(best_model, resultpath, modelname) 77 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NLeSC/mcfly-tutorial/4b1548058c158d0efef41bfb6c7b2caa575a8858/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_tutorial_pamap2.py: -------------------------------------------------------------------------------- 1 | from utils import tutorial_pamap2 2 | import numpy as np 3 | import pandas as pd 4 | import os.path 5 | import unittest 6 | 7 | 8 | class TutorialPAMAP2Suite(unittest.TestCase): 9 | """Basic test cases.""" 10 | 11 | def test_split_activities(self): 12 | """ 13 | Test whether split_activities produces a Numpy array 14 | """ 15 | labels = np.ones(3000) 16 | labels[range(150)] = 2 17 | X = np.ones((3000,9)) 18 | splittedX, splitted_y = tutorial_pamap2.split_activities(labels,X,[0], borders=50) 19 | assert splittedX[0].shape == (50, 9) 20 | assert splittedX[1].shape == (2750, 9) 21 | 22 | 23 | def test_sliding_window(self): 24 | """ Test whether sliding_window correctly updates x_train to the 25 | right size""" 26 | frame_length = 512 27 | step = 100 28 | x_trainlist = [np.zeros((25187,9)) for b in range(78)] 29 | y_trainlist = [np.zeros((12,9)) for b in range(78)] 30 | x_train, y_train = tutorial_pamap2.sliding_window(frame_length, step, x_trainlist, y_trainlist) 31 | test = len(x_train) == 19266 32 | assert test 33 | 34 | def test_transform_y(self): 35 | """ Test whether function produces Numpy array of expected size """ 36 | mapclasses = {1: 0, 2: 1, 3: 2, 4: 3, 5: 
4, 6: 5, 7: 6, \ 37 | 12: 7, 13: 8, 16: 9, 17: 10, 24: 11} 38 | nr_classes = 12 39 | y = list([1,2,5,7,13,16,24,1,2,5,7,13,16,24]) #14 values 40 | transformedy = tutorial_pamap2.transform_y(y, mapclasses, nr_classes) 41 | test = transformedy.shape == (14,12) 42 | assert test 43 | 44 | def test_addheader(self): 45 | """ Test whether addheader produces dataframe of same shape as input 46 | """ 47 | datasets = [pd.DataFrame(index=range(100),columns=range(54)) for b in range(10)] 48 | datasetsnew = tutorial_pamap2.addheader(datasets) 49 | test = datasetsnew[0].shape == datasets[0].shape 50 | assert test 51 | 52 | def test_numpify_and_store(self): 53 | """ Test whether numpify_and_store produces npy-file """ 54 | Nsamples = 9 55 | Ntimesteps = 10 56 | Ncolumns = 3 57 | X = [[[0 for a in range(Ncolumns)] for b in range(Ntimesteps)] \ 58 | for c in range(Nsamples)] 59 | y = [[0 for a in range(Ntimesteps)] for b in range(Nsamples)] 60 | xname = 'xname' 61 | yname = 'yname' 62 | outdatapath = os.getcwd() 63 | tutorial_pamap2.numpify_and_store(X, y, xname, yname, outdatapath, \ 64 | shuffle=True) 65 | filename = os.path.join(outdatapath, xname+ '.npy') 66 | test = os.path.isfile(filename) 67 | if test == True: 68 | os.remove(filename) 69 | os.remove(os.path.join(outdatapath, yname + '.npy')) 70 | assert test 71 | 72 | def test_split_data(self): 73 | """ Test whether function produces numpy arrays 74 | of the correct dimensions """ 75 | Xlists = tuple([[np.zeros((200,9)) for b in range(14)] for c in range(9)]) 76 | ybinarylists = [np.zeros((14,12)) for c in range(9)] 77 | indices = slice(7, 9) 78 | x_test, y_test = tutorial_pamap2.split_data(Xlists, ybinarylists, \ 79 | indices) 80 | test = y_test[0].shape == (12,) and x_test[0].shape == (200, 9) 81 | assert test 82 | 83 | def test_load_model(self): 84 | from tensorflow.keras.models import load_model 85 | model = load_model('./notebooks/tutorial/model/model.h5') 86 | assert len(model.layers) > 1 87 | 88 | if __name__ == '__main__': 89 | unittest.main() 90 | -------------------------------------------------------------------------------- /tests/test_tutorial_weather.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import unittest 3 | from pathlib import Path 4 | 5 | from utils.tutorial_weather import load_data 6 | 7 | 8 | class TutorialWeatherSuite(unittest.TestCase): 9 | """ Weather data set test cases.""" 10 | temp_test_dir = 'temp_weather_test' 11 | 12 | def test_data_downloading_has_correct_shape(self): 13 | n_features = 89 14 | n_train_instances = 767 15 | n_test_instances = 329 16 | 17 | X_train, X_test, y_train, y_test = load_data(self.temp_test_dir) 18 | 19 | assert X_train.shape == (n_train_instances, n_features) 20 | assert X_test.shape == (n_test_instances, n_features) 21 | assert y_train.shape == (n_train_instances,) 22 | assert y_test.shape == (n_test_instances,) 23 | 24 | def setUp(self) -> None: 25 | Path(self.temp_test_dir).mkdir() 26 | 27 | def tearDown(self) -> None: 28 | shutil.rmtree(Path(self.temp_test_dir)) 29 | 30 | 31 | if __name__ == '__main__': 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .tutorial_pamap2 import * -------------------------------------------------------------------------------- /utils/tutorial_pamap2.py: -------------------------------------------------------------------------------- 
1 | """ 2 | Summary: 3 | Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and 4 | preproces the data. 5 | Example function calls in 'Tutorial mcfly on PAMAP2.ipynb' 6 | """ 7 | import numpy as np 8 | import pandas as pd 9 | from os import listdir 10 | import os.path 11 | import zipfile 12 | from tensorflow.keras.utils import to_categorical 13 | import six.moves.urllib as urllib 14 | import json 15 | 16 | 17 | def split_activities(labels, X, exclude_activities, borders=10 * 100): 18 | """ 19 | Splits up the data per activity and exclude activity=0. 20 | Also remove borders for each activity. 21 | Returns lists with subdatasets 22 | 23 | Parameters 24 | ---------- 25 | labels : numpy array 26 | Activity labels 27 | X : numpy array 28 | Data points 29 | borders : int 30 | Nr of timesteps to remove from the borders of an activity 31 | exclude_activities : list or tuple 32 | activities to exclude from the 33 | 34 | Returns 35 | ------- 36 | X_list 37 | y_list 38 | """ 39 | tot_len = len(labels) 40 | startpoints = np.where([1] + [labels[i] != labels[i - 1] 41 | for i in range(1, tot_len)])[0] 42 | endpoints = np.append(startpoints[1:] - 1, tot_len - 1) 43 | acts = [labels[s] for s, e in zip(startpoints, endpoints)] 44 | # Also split up the data, and only keep the non-zero activities 45 | xysplit = [(X[s + borders:e - borders + 1, :], a) 46 | for s, e, a in zip(startpoints, endpoints, acts) 47 | if a not in exclude_activities and e-borders+1 >= 0 and s+borders < tot_len] 48 | xysplit = [(Xs, y) for Xs, y in xysplit if len(Xs) > 0] 49 | Xlist = [Xs for Xs, y in xysplit] 50 | ylist = [y for X, y in xysplit] 51 | return Xlist, ylist 52 | 53 | 54 | def sliding_window(frame_length, step, Xsampleslist, ysampleslist): 55 | """ 56 | Splits time series in ysampleslist and Xsampleslist 57 | into segments by applying a sliding overlapping window 58 | of size equal to frame_length with steps equal to step 59 | it does this for all the samples and appends all the output together. 
60 | So, the participant distinction is not kept 61 | 62 | Parameters 63 | ---------- 64 | frame_length : int 65 | Length of sliding window 66 | step : int 67 | Stepsize between windows 68 | Xsamples : list 69 | Existing list of window fragments 70 | ysamples : list 71 | Existing list of window fragments 72 | Xsampleslist : list 73 | Samples to take sliding windows from 74 | ysampleslist 75 | Samples to take sliding windows from 76 | """ 77 | Xsamples = [] 78 | ysamples = [] 79 | for j in range(len(Xsampleslist)): 80 | X = Xsampleslist[j] 81 | ybinary = ysampleslist[j] 82 | for i in range(0, X.shape[0] - frame_length, step): 83 | xsub = X[i:i + frame_length, :] 84 | ysub = ybinary 85 | Xsamples.append(xsub) 86 | ysamples.append(ysub) 87 | return Xsamples, ysamples 88 | 89 | 90 | def transform_y(y, mapclasses, nr_classes): 91 | """ 92 | Transforms y, a list with one sequence of A timesteps 93 | and B unique classes into a binary Numpy matrix of 94 | shape (A, B) 95 | 96 | Parameters 97 | ---------- 98 | y : list or array 99 | List of classes 100 | mapclasses : dict 101 | dictionary that maps the classes to numbers 102 | nr_classes : int 103 | total number of classes 104 | """ 105 | ymapped = np.array([mapclasses[c] for c in y], dtype='int') 106 | ybinary = to_categorical(ymapped, nr_classes) 107 | return ybinary 108 | 109 | def get_header(): 110 | axes = ['x', 'y', 'z'] 111 | IMUsensor_columns = ['temperature'] + \ 112 | ['acc_16g_' + i for i in axes] + \ 113 | ['acc_6g_' + i for i in axes] + \ 114 | ['gyroscope_' + i for i in axes] + \ 115 | ['magnometer_' + i for i in axes] + \ 116 | ['orientation_' + str(i) for i in range(4)] 117 | header = ["timestamp", "activityID", "heartrate"] + ["hand_" + s 118 | for s in IMUsensor_columns] \ 119 | + ["chest_" + s for s in IMUsensor_columns] + ["ankle_" + s 120 | for s in IMUsensor_columns] 121 | return header 122 | 123 | def addheader(datasets): 124 | """ 125 | The columns of the pandas data frame are numbers 126 | this function adds the column labels 127 | 128 | Parameters 129 | ---------- 130 | datasets : list 131 | List of pandas dataframes 132 | """ 133 | header = get_header() 134 | for i in range(0, len(datasets)): 135 | datasets[i].columns = header 136 | return datasets 137 | 138 | 139 | def numpify_and_store(X, y, X_name, y_name, outdatapath, shuffle=False): 140 | """ 141 | Converts python lists x 3D and y 1D into numpy arrays 142 | and stores the numpy array in directory outdatapath 143 | shuffle is optional and shuffles the samples 144 | 145 | Parameters 146 | ---------- 147 | X : list 148 | list with data 149 | y : list 150 | list with data 151 | X_name : str 152 | name to store the x arrays 153 | y_name : str 154 | name to store the y arrays 155 | outdatapath : str 156 | path to the directory to store the data 157 | shuffle : bool 158 | whether to shuffle the data before storing 159 | """ 160 | X = np.array(X) 161 | y = np.array(y) 162 | # Shuffle the train set 163 | if shuffle is True: 164 | np.random.seed(123) 165 | neworder = np.random.permutation(X.shape[0]) 166 | X = X[neworder, :, :] 167 | y = y[neworder, :] 168 | # Save binary file 169 | xpath = os.path.join(outdatapath, X_name) 170 | ypath = os.path.join(outdatapath, y_name) 171 | np.save(xpath, X) 172 | np.save(ypath, y) 173 | print('Stored ' + xpath, y_name) 174 | 175 | 176 | def fetch_data(directory_to_extract_to): 177 | """ 178 | Fetch the data and extract the contents of the zip file 179 | to the directory_to_extract_to. 
180 | First check whether this was done before, if yes, then skip 181 | 182 | Parameters 183 | ---------- 184 | directory_to_extract_to : str 185 | directory to create subfolder 'PAMAP2' 186 | 187 | Returns 188 | ------- 189 | targetdir: str 190 | directory where the data is extracted 191 | """ 192 | targetdir = os.path.join(directory_to_extract_to, "PAMAP2") 193 | if os.path.exists(targetdir): 194 | print('Data previously downloaded and stored in ' + targetdir) 195 | else: 196 | os.makedirs(targetdir) # create target directory 197 | # Download the PAMAP2 data, this is 688 Mb 198 | path_to_zip_file = os.path.join(directory_to_extract_to, 'PAMAP2_Dataset.zip') 199 | test_file_exist = os.path.isfile(path_to_zip_file) 200 | if test_file_exist is False: 201 | url = str('https://archive.ics.uci.edu/ml/' + 202 | 'machine-learning-databases/00231/PAMAP2_Dataset.zip') 203 | # retrieve data from url 204 | local_fn, headers = urllib.request.urlretrieve(url, 205 | filename=path_to_zip_file) 206 | print('Download complete and stored in: ' + path_to_zip_file) 207 | else: 208 | print('The data was previously downloaded and stored in ' + 209 | path_to_zip_file) 210 | # unzip 211 | 212 | with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref: 213 | zip_ref.extractall(targetdir) 214 | os.remove(path_to_zip_file) 215 | return targetdir 216 | 217 | 218 | def map_class(datasets_filled, exclude_activities): 219 | ysetall = [set(np.array(data.activityID)) - set(exclude_activities) 220 | for data in datasets_filled] 221 | class_ids = list(set.union(*[set(y) for y in ysetall])) 222 | class_labels = [ACTIVITIES_MAP[i] for i in class_ids] 223 | nr_classes = len(class_ids) 224 | mapclasses = {class_ids[i]: i for i in range(len(class_ids))} 225 | return class_labels, nr_classes, mapclasses 226 | 227 | 228 | def split_data(Xlists, ybinarylists, indices): 229 | """ Function takes subset from list given indices 230 | 231 | Parameters 232 | ---------- 233 | Xlists: tuple 234 | tuple (samples) of lists (windows) of numpy-arrays (time, variable) 235 | ybinarylist : 236 | list (samples) of numpy-arrays (window, class) 237 | indices : 238 | indices of the slice of data (samples) to be taken 239 | 240 | Returns 241 | ------- 242 | x_setlist : list 243 | list (windows across samples) of numpy-arrays (time, variable) 244 | y_setlist: list 245 | list (windows across samples) of numpy-arrays (class, ) 246 | """ 247 | tty = str(type(indices)) 248 | # or statement in next line is to account for python2 and python3 249 | # difference 250 | if tty == "" or tty == "": 251 | x_setlist = [X for Xlist in Xlists[indices] for X in Xlist] 252 | y_setlist = [y for ylist in ybinarylists[indices] for y in ylist] 253 | else: 254 | x_setlist = [X for X in Xlists[indices]] 255 | y_setlist = [y for y in ybinarylists[indices]] 256 | return x_setlist, y_setlist 257 | 258 | def split_data_random(X, y, val_size, test_size): 259 | X = np.array(X) 260 | y = np.array(y) 261 | size = len(X) 262 | train_size = size - val_size - test_size 263 | indices = np.random.permutation(size) 264 | X_train = X[indices[:train_size]] 265 | y_train = y[indices[:train_size]] 266 | X_val = X[indices[train_size:train_size+val_size]] 267 | y_val = y[indices[train_size:train_size+val_size]] 268 | X_test = X[indices[train_size+val_size:]] 269 | y_test = y[indices[train_size+val_size:]] 270 | return X_train, y_train, X_val, y_val, X_test, y_test 271 | 272 | def preprocess(targetdir, outdatapath, columns_to_use, exclude_activities, fold, 273 | val_test_size=None): 274 | """ 
Function to preprocess the PAMAP2 data after it is fetched 275 | 276 | Parameters 277 | ---------- 278 | targetdir : str 279 | subdirectory of directory_to_extract_to, targetdir 280 | is defined by function fetch_data 281 | outdatapath : str 282 | a subdirectory of directory_to_extract_to, outdatapath 283 | is the direcotry where the Numpy output will be stored. 284 | columns_to_use : list 285 | list of column names to use 286 | exclude_activities : list or tuple 287 | activities to exclude from the 288 | fold : boolean 289 | Whether to store each fold seperately ('False' creates 290 | Train, Test and Validation sets) 291 | 292 | Returns 293 | ------- 294 | None 295 | """ 296 | datadir = os.path.join(targetdir, 'PAMAP2_Dataset', 'Protocol') 297 | filenames = listdir(datadir) 298 | filenames.sort() 299 | print('Start pre-processing all ' + str(len(filenames)) + ' files...') 300 | # load the files and put them in a list of pandas dataframes: 301 | datasets = [pd.read_csv(os.path.join(datadir, fn), header=None, sep=' ') 302 | for fn in filenames] 303 | datasets = addheader(datasets) # add headers to the datasets 304 | # Interpolate dataset to get same sample rate between channels 305 | datasets_filled = [d.interpolate() for d in datasets] 306 | # Create mapping for class labels 307 | class_labels, nr_classes, mapclasses = map_class(datasets_filled, exclude_activities) 308 | # Save class labels 309 | with open(os.path.join(outdatapath, 'labels.json'), 'w') as fp: 310 | json.dump(class_labels, fp) 311 | # Create input (x) and output (y) sets 312 | xall = [np.array(data[columns_to_use]) for data in datasets_filled] 313 | yall = [np.array(data.activityID) for data in datasets_filled] 314 | xylists = [split_activities(y, x, exclude_activities) for x, y in zip(xall, yall)] 315 | Xlists, ylists = zip(*xylists) 316 | ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists] 317 | frame_length = int(5.12 * 100) 318 | step = 1 * 100 319 | if not fold: 320 | if val_test_size is None: 321 | # Split in train, test and val 322 | x_vallist, y_vallist = split_data(Xlists, ybinarylists, indices=6) 323 | test_range = slice(7, len(datasets_filled)) 324 | x_testlist, y_testlist = split_data(Xlists, ybinarylists, test_range) 325 | x_trainlist, y_trainlist = split_data(Xlists, ybinarylists, 326 | indices=slice(0, 6)) 327 | # Take sliding-window frames, target is label of last time step, 328 | # and store as numpy file 329 | x_train, y_train = sliding_window(frame_length, step, x_trainlist, 330 | y_trainlist) 331 | x_val, y_val = sliding_window(frame_length, step, x_vallist, 332 | y_vallist) 333 | x_test, y_test = sliding_window(frame_length, step, x_testlist, 334 | y_testlist) 335 | 336 | else: 337 | val_size, test_size = val_test_size 338 | X_list, y_list = split_data(Xlists, ybinarylists, 339 | slice(0, len(datasets_filled))) 340 | X, y = sliding_window(frame_length, step, X_list, 341 | y_list) 342 | x_train, y_train, x_val, y_val, x_test, y_test = split_data_random(X, y, val_size, test_size) 343 | 344 | numpify_and_store(x_train, y_train, X_name='X_train', y_name='y_train', 345 | outdatapath=outdatapath, shuffle=True) 346 | numpify_and_store(x_val, y_val, X_name='X_val', y_name='y_val', 347 | outdatapath=outdatapath, shuffle=False) 348 | numpify_and_store(x_test, y_test, X_name='X_test', y_name='y_test', 349 | outdatapath=outdatapath, shuffle=False) 350 | else: 351 | for i in range(len(Xlists)): 352 | X_i, y_i = split_data(Xlists, ybinarylists, i) 353 | X, y = sliding_window(frame_length, step, 
361 | 
362 | def fetch_and_preprocess(directory_to_extract_to,
363 |                          columns_to_use=None,
364 |                          output_dir='preprocessed',
365 |                          exclude_activities=[0],
366 |                          fold=False,
367 |                          val_test_size=None):
368 |     """High level function to fetch_and_preprocess the PAMAP2 dataset.
369 | 
370 |     Parameters
371 |     ----------
372 |     directory_to_extract_to : str
373 |         the directory where the data will be stored
374 |     columns_to_use : list
375 |         the columns to use
376 |     output_dir : str
377 |         name of the directory to write the output data to
378 |     exclude_activities : list or tuple
379 |         activities to exclude from the dataset
380 |     fold : boolean
381 |         Whether to store each fold separately ('False' creates
382 |         Train, Test and Validation sets)
383 | 
384 |     Returns
385 |     -------
386 |     outdatapath: str
387 |         The directory in which the numpy files are stored
388 |     """
389 |     if columns_to_use is None:
390 |         columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
391 |                           'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',
392 |                           'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z']
393 |     targetdir = fetch_data(directory_to_extract_to)
394 |     outdatapath = os.path.join(targetdir, output_dir)
395 |     if not os.path.exists(outdatapath):
396 |         os.makedirs(outdatapath)
397 |     if os.path.isfile(os.path.join(outdatapath, 'X_train.npy')):
398 |         print('Data previously pre-processed and np-files saved to ' +
399 |               outdatapath)
400 |     else:
401 |         preprocess(targetdir, outdatapath, columns_to_use, exclude_activities, fold, val_test_size)
402 |     return outdatapath
403 | 
404 | 
405 | def load_data(outputpath):
406 |     """Load the numpy data as stored in directory outputpath.
407 | 
408 |     Parameters
409 |     ----------
410 |     outputpath : str
411 |         directory where the numpy files are stored
412 | 
413 |     Returns
414 |     -------
415 |     x_train
416 |     y_train_binary
417 |     x_val
418 |     y_val_binary
419 |     x_test
420 |     y_test_binary
421 |     """
422 |     ext = '.npy'
423 |     x_train = np.load(os.path.join(outputpath, 'X_train' + ext))
424 |     y_train_binary = np.load(os.path.join(outputpath, 'y_train' + ext))
425 |     x_val = np.load(os.path.join(outputpath, 'X_val' + ext))
426 |     y_val_binary = np.load(os.path.join(outputpath, 'y_val' + ext))
427 |     x_test = np.load(os.path.join(outputpath, 'X_test' + ext))
428 |     y_test_binary = np.load(os.path.join(outputpath, 'y_test' + ext))
429 |     with open(os.path.join(outputpath, 'labels.json'), 'r') as fn:
430 |         labels = json.load(fn)
431 |     return x_train, y_train_binary, x_val, y_val_binary, \
432 |         x_test, y_test_binary, labels
433 | 
434 | 
435 | def download_preprocessed_data(directory_to_extract_to):
436 |     """Load already preprocessed data from zenodo.
437 | 
438 |     Args:
439 |     ----
440 |     directory_to_extract_to: str
441 |         Define directory to extract dataset to (if not yet present).
442 | """ 443 | data_path = os.path.join(directory_to_extract_to, 444 | 'data', 'PAMAP2', 'preprocessed') 445 | 446 | if not os.path.isdir(data_path): 447 | path_to_zip_file = os.path.join(directory_to_extract_to, 'data.zip') 448 | 449 | # Download zip file with data 450 | if not os.path.isfile(path_to_zip_file): 451 | print("Downloading data...") 452 | local_fn, headers = urllib.request.urlretrieve( 453 | 'https://zenodo.org/record/834467/files/data03.zip', 454 | filename=path_to_zip_file) 455 | else: 456 | print("Data already downloaded") 457 | # Extract the zip file 458 | with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref: 459 | print("Extracting data...") 460 | zip_ref.extractall(directory_to_extract_to) 461 | os.rename(os.path.join(directory_to_extract_to, 'data03'), 462 | os.path.join(directory_to_extract_to, 'data')) 463 | print("Done") 464 | else: 465 | print("Data already downloaded and extracted.") 466 | 467 | return data_path 468 | 469 | 470 | ACTIVITIES_MAP = { 471 | 0: 'no_activity', 472 | 1: 'lying', 473 | 2: 'sitting', 474 | 3: 'standing', 475 | 4: 'walking', 476 | 5: 'running', 477 | 6: 'cycling', 478 | 7: 'nordic_walking', 479 | 9: 'watching_tv', 480 | 10: 'computer_work', 481 | 11: 'car_driving', 482 | 12: 'ascending_stairs', 483 | 13: 'descending_stairs', 484 | 16: 'vaccuum_cleaning', 485 | 17: 'ironing', 486 | 18: 'folding_laundry', 487 | 19: 'house_cleaning', 488 | 20: 'playing_soccer', 489 | 24: 'rope_jumping' 490 | } 491 | -------------------------------------------------------------------------------- /utils/tutorial_racketsports.py: -------------------------------------------------------------------------------- 1 | """Data precprocessing or loading for RacketSports dataset. 2 | 3 | Summary: 4 | Contains script to preprocess RacketSports dataset and function to load the 5 | already preprocessed dataset. 6 | 7 | This dataset is rather simple which makes it well suited for quick training 8 | of mcfly models. 9 | """ 10 | import numpy as np 11 | import os.path 12 | import zipfile 13 | import six.moves.urllib as urllib 14 | 15 | 16 | def download_preprocessed_data(directory_to_extract_to): 17 | """Load already preprocessed data from zenodo. 18 | 19 | Args: 20 | ---- 21 | directory_to_extract_to: str 22 | Define directory to extract dataset to (if not yet present). 23 | """ 24 | data_path = os.path.join(directory_to_extract_to, 25 | 'RacketSports', 'preprocessed') 26 | 27 | if not os.path.isdir(data_path): 28 | path_to_zip_file = os.path.join(directory_to_extract_to, 'RacketSports.zip') 29 | 30 | # Download zip file with data 31 | if not os.path.isfile(path_to_zip_file): 32 | print("Downloading data...") 33 | local_fn, headers = urllib.request.urlretrieve( 34 | 'https://zenodo.org/record/3743603/files/RacketSports.zip', 35 | filename=path_to_zip_file) 36 | else: 37 | print("Data already downloaded") 38 | # Extract the zip file 39 | with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref: 40 | print("Extracting data...") 41 | zip_ref.extractall(directory_to_extract_to) 42 | print("Done") 43 | else: 44 | print("Data already downloaded and extracted.") 45 | 46 | return data_path 47 | 48 | 49 | def fetch_and_preprocess(directory_to_extract_to, 50 | output_dir='preprocessed'): 51 | """High level function to fetch_and_preprocess the RacketSports dataset. 
52 | 
53 |     Parameters
54 |     ----------
55 |     directory_to_extract_to : str
56 |         the directory where the data will be stored
57 |     output_dir : str
58 |         name of the directory to write the output data to
59 | 
60 |     Returns
61 |     -------
62 |     outdatapath: str
63 |         The directory in which the numpy files are stored
64 |     """
65 |     targetdir = fetch_data(directory_to_extract_to)
66 |     outdatapath = os.path.join(targetdir, output_dir)
67 |     if not os.path.exists(outdatapath):
68 |         os.makedirs(outdatapath)
69 |     if os.path.isfile(os.path.join(outdatapath, 'X_train.npy')):
70 |         print('Data previously pre-processed and np-files saved to ' +
71 |               outdatapath)
72 |     else:
73 |         preprocess(targetdir, outdatapath)
74 |     return outdatapath
75 | 
76 | 
77 | def preprocess(targetdir, outdatapath):
78 |     """ Function to preprocess the RacketSports data after it is fetched
79 | 
80 |     Parameters
81 |     ----------
82 |     targetdir : str
83 |         subdirectory of directory_to_extract_to, targetdir
84 |         is defined by function fetch_data
85 |     outdatapath : str
86 |         a subdirectory of directory_to_extract_to, outdatapath
87 |         is the directory where the Numpy output will be stored.
88 | 
89 |     Returns
90 |     -------
91 |     None
92 |     """
93 |     datadir = os.path.join(targetdir)  # , 'RacketSports')
94 |     filenames = os.listdir(datadir)
95 |     filenames.sort()
96 |     print('Start pre-processing all ' + str(len(filenames)) + ' files...')
97 | 
98 |     # Load and split data
99 |     file_train = os.path.join(datadir, 'RacketSports_TRAIN.arff')
100 |     file_test = os.path.join(datadir, 'RacketSports_TEST.arff')
101 |     X_train, y_train = load_racket_arff(file_train)
102 |     X_test, X_val, y_test, y_val = load_and_split(file_test, random_seed=1)
103 | 
104 |     store_data(X_train, y_train, X_name='X_train', y_name='y_train',
105 |                outdatapath=outdatapath, shuffle=True)
106 |     store_data(X_val, y_val, X_name='X_val', y_name='y_val',
107 |                outdatapath=outdatapath, shuffle=False)
108 |     store_data(X_test, y_test, X_name='X_test', y_name='y_test',
109 |                outdatapath=outdatapath, shuffle=False)
110 | 
111 |     print('Processed data successfully stored in ' + outdatapath)
112 |     return None
113 | 
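# Typical end-to-end use of this module (the './data' path is an assumption;
# any writable directory works):
#
#     out_dir = fetch_and_preprocess('./data')
#     X_train, y_train, X_val, y_val, X_test, y_test = load_data(out_dir)
#
# Alternatively, download_preprocessed_data('./data') fetches an already
# preprocessed copy of the dataset from Zenodo.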
114 | 
115 | def fetch_data(directory_to_extract_to):
116 |     """
117 |     Fetch the data and extract the contents of the zip file
118 |     to the directory_to_extract_to.
119 |     First checks whether this was done before; if so, the download is skipped
120 | 
121 |     Parameters
122 |     ----------
123 |     directory_to_extract_to : str
124 |         directory to create subfolder 'RacketSports'
125 | 
126 |     Returns
127 |     -------
128 |     targetdir: str
129 |         directory where the data is extracted
130 |     """
131 |     targetdir = os.path.join(directory_to_extract_to, "RacketSports")
132 |     if os.path.exists(targetdir):
133 |         print('Data previously downloaded and stored in ' + targetdir)
134 |     else:
135 |         os.makedirs(targetdir)  # create target directory
136 |         # Download the RacketSports data
137 |         path_to_zip_file = os.path.join(directory_to_extract_to, 'RacketSports.zip')
138 |         test_file_exist = os.path.isfile(path_to_zip_file)
139 |         if test_file_exist is False:
140 |             url = str('http://www.timeseriesclassification.com/' +
141 |                       'Downloads/RacketSports.zip')
142 |             # retrieve data from url
143 |             local_fn, headers = urllib.request.urlretrieve(url,
144 |                                                            filename=path_to_zip_file)
145 |             print('Download complete and stored in: ' + path_to_zip_file)
146 |         else:
147 |             print('The data was previously downloaded and stored in ' +
148 |                   path_to_zip_file)
149 |         # unzip
150 | 
151 |         with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
152 |             zip_ref.extractall(targetdir)
153 |         os.remove(path_to_zip_file)
154 |     return targetdir
155 | 
156 | 
157 | def load_racket_arff(filename):
158 |     """Load data from arff file."""
159 |     start = 0
160 |     data = []
161 |     labels = []
162 |     with open(filename) as fp:
163 |         line = fp.readline()
164 |         count = 0
165 |         while line:
166 |             if start == 1:
167 |                 lines = line.split('\\n')
168 |                 data_line = []
169 |                 for l in lines:
170 |                     data_line_sub = []
171 |                     for entry in l.split(','):
172 |                         if entry.startswith('B') or entry.startswith('S'):
173 |                             labels.append(entry.replace("'", "").replace('\n', ''))
174 |                         else:
175 |                             data_line_sub.append(float(entry.replace("'", "")))
176 |                     data_line.append(data_line_sub)
177 |                 data.append(data_line)
178 | 
179 |             if line.startswith('@data'):
180 |                 start = 1
181 | 
182 |             line = fp.readline()
183 |             count += 1
184 | 
185 |     return np.swapaxes(np.array(data), 1, 2), labels
186 | 
187 | 
188 | def load_and_split(file_test, random_seed=1):
189 |     """Load data and split into train, test, validation."""
190 |     # Load data from arff files
191 |     X_test0, y_test0 = load_racket_arff(file_test)
192 | 
193 |     # Split dataset
194 |     np.random.seed(random_seed)
195 |     y_val = []
196 |     y_test = []
197 |     IDs_val = []
198 |     IDs_test = []
199 | 
200 |     for label in list(set(y_test0)):
201 |         idx = np.where(np.array(y_test0) == label)[0]
202 |         idx1 = np.random.choice(idx, len(idx)//2, replace=False)
203 |         idx2 = list(set(idx) - set(idx1))
204 |         IDs_val.extend(idx1)
205 |         IDs_test.extend(idx2)
206 |         y_val.extend(len(idx1) * [label])
207 |         y_test.extend(len(idx2) * [label])
208 | 
209 |         print(label, y_test0.count(label))
210 | 
211 |     X_test = X_test0[IDs_test, :, :]
212 |     X_val = X_test0[IDs_val, :, :]
213 |     return X_test, X_val, y_test, y_val
214 | 
215 | 
216 | def store_data(X, y, X_name, y_name, outdatapath, shuffle=False):
217 |     """
218 |     Converts the python lists X (3D) and y (1D) into numpy arrays
219 |     and stores the numpy arrays in directory outdatapath;
220 |     shuffle is optional and shuffles the samples
221 | 
222 |     Parameters
223 |     ----------
224 |     X : list
225 |         list with data
226 |     y : list
227 |         list with labels
228 |     X_name : str
229 |         name to store the x arrays
230 |     y_name : str
231 |         name to store the y arrays
232 |     outdatapath : str
233 |         path to the directory to store the data
234 |     
shuffle : bool 235 | whether to shuffle the data before storing 236 | """ 237 | X = np.array(X) 238 | y = np.array(y) 239 | # Shuffle the train set 240 | if shuffle: 241 | np.random.seed(123) 242 | neworder = np.random.permutation(X.shape[0]) 243 | X = X[neworder, :, :] 244 | y = y[neworder] 245 | # Save binary file 246 | xpath = os.path.join(outdatapath, X_name) 247 | ypath = os.path.join(outdatapath, y_name) 248 | np.save(xpath, X) 249 | np.save(ypath, y) 250 | print('Stored ' + xpath, y_name) 251 | 252 | 253 | def load_data(outputpath): 254 | """Load the numpy data as stored in directory outputpath. 255 | 256 | Parameters 257 | ---------- 258 | outputpath : str 259 | directory where the numpy files are stored 260 | 261 | Returns 262 | ------- 263 | x_train 264 | y_train_binary 265 | x_val 266 | y_val_binary 267 | x_test 268 | y_test_binary 269 | """ 270 | ext = '.npy' 271 | X_train = np.load(os.path.join(outputpath, 'X_train' + ext)) 272 | y_train = np.load(os.path.join(outputpath, 'y_train' + ext)) 273 | X_val = np.load(os.path.join(outputpath, 'X_val' + ext)) 274 | y_val = np.load(os.path.join(outputpath, 'y_val' + ext)) 275 | X_test = np.load(os.path.join(outputpath, 'X_test' + ext)) 276 | y_test = np.load(os.path.join(outputpath, 'y_test' + ext)) 277 | return X_train, y_train, X_val, y_val, X_test, y_test 278 | -------------------------------------------------------------------------------- /utils/tutorial_vu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import os 4 | import os.path 5 | import numpy as np 6 | import scipy.io 7 | import xlrd 8 | 9 | import logging 10 | 11 | logging.basicConfig(level=logging.INFO, format='%(message)s') 12 | logger = logging.getLogger(__name__) 13 | 14 | ONE_TIME_FALL_DATASET = 'one' # Value to pass to load the one time fallers + controls data set 15 | MULTI_TIME_FALL_DATASET = 'multi' # Value to pass to load the multi time fallers + controls data set 16 | 17 | 18 | class DataLoader: 19 | base_path = './Data espen/' 20 | train_fraction = 0.80 # fraction of subjects used for train set. number of segments per subject is variable. 21 | validation_fraction = 0.10 # fraction of subjects used for validation. number of segments per subject is variable. 22 | 23 | CONTROL_LABEL = 0 24 | ONE_TIME_FALLER_LABEL = 1 25 | MULTI_TIME_FALLER_LABEL = 2 26 | 27 | def load(self, dataset_selection=ONE_TIME_FALL_DATASET): 28 | """ 29 | Gets subject ids from excel file and loads acc and vel data from mat file for each subject. Return a train, 30 | validation and test set. Each set consists of the data X and the label y. 31 | :param dataset_selection: 32 | Determines whether datasets from the single time fallers and their controls or the multi time fallers with 33 | their controls should be loaded. Can be either 'one' or 'multi'. 
34 | :return: train_X, train_y, validation_X, 35 | validation_y, test_X, test_y 36 | """ 37 | multi_time_fallers, multi_time_fallers_controls, one_time_fallers, one_time_fallers_controls = self.read_ids_from_excel() 38 | logger.debug('') 39 | 40 | if dataset_selection == ONE_TIME_FALL_DATASET: 41 | train_X, train_y, validation_X, validation_y, test_X, test_y = self.get_split_shuffled_data_set( 42 | self.ONE_TIME_FALLER_LABEL, self.CONTROL_LABEL, one_time_fallers, one_time_fallers_controls) 43 | elif dataset_selection == MULTI_TIME_FALL_DATASET: 44 | train_X, train_y, validation_X, validation_y, test_X, test_y = self.get_split_shuffled_data_set( 45 | self.MULTI_TIME_FALLER_LABEL, self.CONTROL_LABEL, multi_time_fallers, multi_time_fallers_controls) 46 | 47 | logger.info('Loaded train samples with shape {} and train labels with shape {}.' 48 | .format(train_X.shape, train_y.shape)) 49 | logger.info('Loaded validation samples with shape {} and test labels with shape {}.' 50 | .format(validation_X.shape, validation_y.shape)) 51 | logger.info('Loaded test samples with shape {} and test labels with shape {}.' 52 | .format(test_X.shape, test_y.shape)) 53 | logger.info('Of {} instances loaded, {}% is used for training, {}% for validation, {}% for testing.' 54 | .format(len(train_y) + len(test_y) + len(validation_y), 55 | np.round(100.0 * len(train_y) / (len(train_y) + len(test_y) + len(validation_y)), 1), 56 | np.round(100.0 * len(validation_y) / (len(train_y) + len(test_y) + len(validation_y)), 1), 57 | np.round(100.0 * len(test_y) / (len(train_y) + len(test_y) + len(validation_y)), 1))) 58 | 59 | return train_X, train_y, validation_X, validation_y, test_X, test_y 60 | 61 | def read_ids_from_excel(self): 62 | sheet = xlrd.open_workbook(os.path.join(self.base_path, 'File_number_Fall_class.xlsx')).sheet_by_index(0) 63 | one_time_fallers = self.get_ids_from_column(1, sheet) 64 | one_time_fallers_controls = self.get_ids_from_column(3, sheet) 65 | multi_time_fallers = self.get_ids_from_column(6, sheet) 66 | multi_time_fallers_controls = self.get_ids_from_column(8, sheet) 67 | return multi_time_fallers, multi_time_fallers_controls, one_time_fallers, one_time_fallers_controls 68 | 69 | def get_ids_from_column(self, column, sheet): 70 | return list( 71 | [int(sheet.cell_value(i, column)) for i in range(2, sheet.nrows) if sheet.cell_value(i, column) != '']) 72 | 73 | def get_split_shuffled_data_set(self, label, control_label, fallers, controls): 74 | indices = list(range(len(fallers))) 75 | np.random.shuffle(indices) 76 | 77 | n_train_instances = int(self.train_fraction * len(indices)) 78 | n_validation_instances = int(self.validation_fraction * len(indices)) 79 | logger.info('Loading training data.') 80 | train_X, train_y = self.get_data_set(fallers, 81 | controls, 82 | indices[:n_train_instances], 83 | label, 84 | control_label) 85 | logger.info('Loading validation data.') 86 | validation_X, validation_y = self.get_data_set(fallers, 87 | controls, 88 | indices[ 89 | n_train_instances:n_train_instances + n_validation_instances], 90 | label, 91 | control_label) 92 | logger.info('Loading test data.') 93 | test_X, test_y = self.get_data_set(fallers, 94 | controls, 95 | indices[n_train_instances + n_validation_instances:], 96 | label, 97 | control_label) 98 | return train_X, train_y, validation_X, validation_y, test_X, test_y 99 | 100 | def get_data_set(self, fallers, controls, indices, label, control_label): 101 | train_instance_sets = [] 102 | train_label_sets = [] 103 | for index in indices: 104 | fall_id 
= fallers[index]
105 |             fall_X, fall_y = self.get_user_data_and_labels_for_id(fall_id, label)
106 |             train_instance_sets.append(fall_X)
107 |             train_label_sets.append(fall_y)
108 | 
109 |             control_id = controls[index]
110 |             control_X, control_y = self.get_user_data_and_labels_for_id(control_id, control_label)
111 |             train_instance_sets.append(control_X)
112 |             train_label_sets.append(control_y)
113 |         train_set = np.concatenate(train_instance_sets, axis=0)
114 |         train_labels = np.concatenate(train_label_sets)
115 |         return train_set, train_labels
116 | 
117 |     def get_user_data_and_labels_for_id(self, id, label):
118 |         filename = 'Acc_Vel_gait_30sec_{}.mat'.format(id)
119 |         logger.info('Processing file {}'.format(filename))
120 |         user_data = self.load_user_data(filename)
121 |         user_labels = [label for _ in user_data]
122 |         return user_data, user_labels
123 | 
124 |     def load_user_data(self, filename):
125 |         path = os.path.join(self.base_path, filename)
126 |         data = scipy.io.loadmat(path)
127 |         acc = np.array([data['Acc_gait_30sec'][0][i] for i in range(len(data['Acc_gait_30sec'][0]))])
128 |         vel = np.array([data['Vel_gait_30sec'][0][i] for i in range(len(data['Vel_gait_30sec'][0]))])
129 |         userdata = np.concatenate((acc, vel), axis=2)
130 |         return userdata
131 | 
132 | 
133 | def load_one_time_fall_dataset():
134 |     """
135 |     Loads a dataset containing the one time fallers and their matched controls. Fallers are distributed over train,
136 |     validation and test set. Controls are kept in the same set as their matched subjects. All segments of a specific
137 |     subject, control or faller, end up in the same set. Gets subject ids from excel file and loads acc and vel data
138 |     from mat file for each subject. Returns a train, validation and test set. Each set consists of the data X and the
139 |     label y.
140 |     :return: train_X, train_y, validation_X, validation_y, test_X, test_y
141 |     """
142 |     return DataLoader().load(dataset_selection=ONE_TIME_FALL_DATASET)
143 | 
144 | 
145 | def load_multi_time_fall_dataset():
146 |     """
147 |     Loads a dataset containing the multiple time fallers and their matched controls. Fallers are distributed over
148 |     train, validation and test set. Controls are kept in the same set as their matched subjects. All segments of a
149 |     specific subject, control or faller, end up in the same set. Gets subject ids from excel file and loads acc and
150 |     vel data from mat file for each subject. Returns a train, validation and test set. Each set consists of the data X
151 |     and the label y.
152 |     :return: train_X, train_y, validation_X, validation_y, test_X, test_y
153 |     """
154 |     return DataLoader().load(dataset_selection=MULTI_TIME_FALL_DATASET)
155 | 
--------------------------------------------------------------------------------
/utils/tutorial_weather.py:
--------------------------------------------------------------------------------
1 | import typing
2 | import urllib
3 | from pathlib import Path
4 | 
5 | from urllib.request import urlretrieve
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | 
9 | 
10 | def load_data(path: str = '.'):
11 |     """
12 |     Load weather dataset (10.5281/zenodo.4770936). If it's not on the path specified, it will be downloaded.
13 |     Parameters
14 |     ----------
15 |     path : str
16 |         The local path to the data set folder.
17 | 
18 |     Returns
19 |     -------
20 |     X_train
21 |     X_test
22 |     y_train
23 |     y_test
24 |     """
25 |     data_path = download_preprocessed_data(path)
26 |     data = pd.read_csv(data_path)
27 |     nr_rows = 365 * 3
28 |     X_data = data.loc[:nr_rows].drop(columns=['DATE', 'MONTH'])
29 | 
30 |     days_ahead = 1
31 |     y_data = data.loc[days_ahead:(nr_rows + days_ahead)]["MAASTRICHT_sunshine"]
32 |     X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=0)
33 | 
34 |     return X_train, X_test, y_train, y_test
35 | 
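# Minimal usage sketch (the './data' path is an assumption; any writable
# directory works):
#
#     X_train, X_test, y_train, y_test = load_data('./data')
#
# Each row of X_* holds one day of weather features and y_* holds the next
# day's 'MAASTRICHT_sunshine' value (days_ahead = 1); the 70/30 train/test
# split comes from train_test_split above.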
36 | 
37 | def download_preprocessed_data(directory_to_extract_to: typing.Union[str, Path]):
38 |     data_path = Path(directory_to_extract_to) / 'weather'
39 |     data_path.mkdir(exist_ok=True)
40 |     data_set_light_path = data_path / 'weather_prediction_dataset_light.csv'
41 |     if not data_set_light_path.exists():
42 |         _, _ = urllib.request.urlretrieve(
43 |             'https://zenodo.org/record/7053722/files/weather_prediction_dataset_light.csv',
44 |             filename=data_set_light_path)
45 |     return data_set_light_path
46 | 
--------------------------------------------------------------------------------