├── .github
│   └── workflows
│       └── python-package.yml
├── .gitignore
├── LICENSE
├── README.md
├── cheatsheet.md
├── examples
│   └── modelcomparison.json
├── linter_profile.yaml
├── mcflylogo.png
├── notebooks
│   ├── experiments
│   │   ├── Actitracker_preprocess.ipynb
│   │   ├── Actitracker_train.ipynb
│   │   ├── EEG_alcoholic_preprocessing.ipynb
│   │   ├── EEG_alcoholic_train.ipynb
│   │   ├── Preprocess_PAMAP.ipynb
│   │   ├── Preprocess_PAMAP2.ipynb
│   │   ├── dataset_PEMS_prepare.ipynb
│   │   ├── dataset_phoneme_prepare.ipynb
│   │   ├── dataset_rackets_prepare.ipynb
│   │   ├── deeplearning_eecology.ipynb
│   │   ├── deeplearning_guinneabissau.ipynb
│   │   ├── experiment_PAMAP.ipynb
│   │   ├── experiment_PAMAP2.ipynb
│   │   ├── experiment_PAMAP2_9fold.ipynb
│   │   ├── experiment_PAMAP2_9fold_small.ipynb
│   │   ├── experiment_extra_datasets.ipynb
│   │   ├── experiment_extra_datasets_4model_types.ipynb
│   │   ├── experiment_skipconnections.ipynb
│   │   └── preproces_Guinea-Bisseau_Nigeria.ipynb
│   └── tutorial
│       ├── model
│       │   └── model.h5
│       ├── tutorial.ipynb
│       ├── tutorial_quick.ipynb
│       └── workshop.ipynb
├── requirements.txt
├── scripts
│   ├── Actitracker_train.py
│   ├── EEG_alcoholic_train.py
│   ├── experiment_PAMAP.py
│   ├── experiment_PAMAP2_9fold.py
│   └── pamap2.py
├── tests
│   ├── __init__.py
│   ├── test_tutorial_pamap2.py
│   └── test_tutorial_weather.py
└── utils
    ├── __init__.py
    ├── tutorial_pamap2.py
    ├── tutorial_racketsports.py
    ├── tutorial_vu.py
    └── tutorial_weather.py
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: CI Build
5 |
6 | on:
7 | workflow_dispatch:
8 | push:
9 | branches:
10 | - main
11 | pull_request:
12 | branches:
13 | - main
14 | schedule:
15 | - cron: '0 0 1 * *'
16 |
17 | jobs:
18 | first_check:
19 | name: first code check / python-3.10 / ubuntu-latest
20 | runs-on: ubuntu-latest
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: '3.10'
27 | - name: Python info
28 | run: |
29 | which python3
30 | python3 --version
31 |       - name: Install dependencies
32 | run: |
33 | python3 -m pip install --upgrade pip setuptools wheel
34 | python3 -m pip install mcfly prospector pytest pandas
35 | - name: Check style against standards using prospector (only warn for now, but never fail)
36 | shell: bash -l {0}
37 | run: prospector --profile linter_profile -o grouped -o pylint:pylint-report.txt --zero-exit
38 | - name: Run unit tests
39 | run: pytest -v
40 |
41 | basic_checks:
42 | name: Run tests across OS and versions / python-${{ matrix.python-version }} / ${{ matrix.os }}
43 | runs-on: ${{ matrix.os }}
44 | needs: first_check
45 | strategy:
46 | fail-fast: false
47 | matrix:
48 | os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
49 | python-version: ['3.7', '3.8', '3.9', '3.10']
50 | exclude:
51 | # already tested in first_check job
52 |           - python-version: '3.10'
53 | os: ubuntu-latest
54 | steps:
55 | - uses: actions/checkout@v3
56 | - name: Set up Python ${{ matrix.python-version }}
57 | uses: actions/setup-python@v3
58 | with:
59 | python-version: ${{ matrix.python-version }}
60 | - name: Python info
61 | run: |
62 | which python
63 | python --version
64 | - name: Install dependencies
65 | run: |
66 | python3 -m pip install --upgrade pip setuptools wheel
67 | python3 -m pip install mcfly prospector pytest pandas
68 | - name: Run unit tests
69 | run: pytest -v
70 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | notebooks/tutorial/data/
2 | notebooks/tutorial/data.zip
3 | */.ipynb_checkpoints/
4 | */__pycache__/
5 | *.pyc
6 | notebooks/*/.ipynb_checkpoints/
7 | env
8 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | [](https://github.com/NLeSC/mcfly-tutorial/actions)
6 |
7 | This repository contains notebooks that show how to use the [mcfly](https://github.com/NLeSC/mcfly) software. Mcfly is a deep learning tool for time series classification.
8 |
9 | ## Tutorials
10 | Currently we offer two tutorials here.
11 | Our main tutorial can be found in the notebook [notebooks/tutorial/tutorial.ipynb](https://github.com/NLeSC/mcfly-tutorial/blob/master/notebooks/tutorial/tutorial.ipynb). This tutorial will let you train deep learning models with mcfly on the [PAMAP2 dataset for activity recognition](https://archive.ics.uci.edu/ml/datasets/PAMAP2+Physical+Activity+Monitoring).
12 |
13 | A comparable, slightly quicker tutorial can be found in the notebook [notebooks/tutorial/tutorial_quick.ipynb](https://github.com/NLeSC/mcfly-tutorial/blob/master/notebooks/tutorial/tutorial_quick.ipynb). This tutorial will let you train deep learning models with mcfly on the [RacketSports dataset for activity recognition](http://www.timeseriesclassification.com/description.php?Dataset=RacketSports).
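Both tutorials follow the same basic mcfly workflow. The sketch below illustrates that workflow on random toy data (used here purely for illustration; the notebooks prepare the real datasets for you), assuming the packages from the Installation section are available:

```python
import numpy as np
from mcfly import modelgen, find_architecture

# Toy data, just to illustrate the expected shapes:
# 60 samples, 100 timesteps, 3 channels, 2 classes (one-hot encoded labels).
X_train = np.random.rand(60, 100, 3)
y_train_binary = np.eye(2)[np.random.randint(0, 2, 60)]
X_val, y_val_binary = X_train[:20], y_train_binary[:20]

# Generate a few candidate architectures with random hyperparameters.
models = modelgen.generate_models(X_train.shape, number_of_classes=2, number_of_models=2)

# Compare them on (a subset of) the data and keep the best performing one.
histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(
    X_train, y_train_binary, X_val, y_val_binary,
    models, nr_epochs=1, subset_size=60, verbose=True)
best_model, best_params, best_model_types = models[int(np.argmax(val_accuracies))]
```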
14 |
15 | Prerequisites:
16 | - Python 3.7 or higher
17 | - The following Python packages have to be installed (they are also specified in the requirements.txt file):
18 | - mcfly
19 | - jupyter
20 | - pandas
21 | - matplotlib
22 | - scipy
23 | - numpy
24 |
25 | ## Installation
26 |
27 | ```shell
28 | python3 -m venv env
29 | . env/bin/activate
30 | pip install --upgrade pip setuptools
31 | pip install -r requirements.txt
32 | ```
33 |
34 | ## Running the notebooks
35 | The tutorials can be run using Jupyter notebook. From the tutorial root folder run:
36 |
37 | `jupyter notebook`
38 |
39 | There are two versions of the tutorial. The [standard tutorial](https://github.com/NLeSC/mcfly-tutorial/blob/master/notebooks/tutorial/tutorial.ipynb) is designed for self-study. There is also a [version for workshops](https://github.com/NLeSC/mcfly-tutorial/blob/master/notebooks/tutorial/workshop.ipynb), which is intended to be used with the aid of an instructor.
40 |
--------------------------------------------------------------------------------
/cheatsheet.md:
--------------------------------------------------------------------------------
1 | # mcfly cheatsheet
2 |
3 | This document can be found at https://github.com/NLeSC/mcfly-tutorial/blob/master/cheatsheet.md
4 |
5 | Detailed documentation can be found in the mcfly [wiki](https://github.com/NLeSC/mcfly/wiki/Home---mcfly).
6 |
7 | Notebook tutorials can be found in the mcfly-tutorial [repository](https://github.com/NLeSC/mcfly-tutorial)
8 |
9 | ### Jargon terms
10 | * [**accuracy**](https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers): proportion of correctly classified samples out of all samples in a dataset.
11 | * **convolutional filter**: a set of weights that are applied to neighbouring data points.
12 | * [**convolutional layer**](http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/): type of network layer where a convolutional filter is slid over the input.
13 | * **CNN**: Convolutional Neural Network, a deep learning network that includes convolutional layers, often combined with dense or fully connected layers.
14 | * [**LSTM layer**](http://colah.github.io/posts/2015-08-Understanding-LSTMs/): Long Short-Term Memory layer. This is a special type of recurrent layer that takes a sequence as input and outputs a sequence.
15 | * **DeepConvLSTM**: A deep learning network that includes both convolutional layers and LSTM layers.
16 | * **epoch**: One full pass through a dataset (all datapoints are seen once) in the process of training the weights of a network.
17 | * **loss**: An indicator of overall classification error. More errors means greater loss. In mcfly we use [categorical cross entropy](http://cs231n.github.io/linear-classify/#softmax).
18 | * [**gradient descent**](http://cs231n.github.io/optimization-1/): Algorithm used to find the locally optimal weights for the nodes in the network. The algorithm iteratively improves the weights in order to minimize classification loss. The search space can be interpreted as a landscape where the lowest point is the optimum, hence the term 'descent'. In each step of the gradient descent algorithm, the weights are adjusted with a step in the direction of the gradient ('slope').
19 | * **hyperparameters**: In mcfly, the hyperparameters are the architectural choices of the model (number of layers, LSTM or convolutional layers, etc.) and the learning rate and regularization rate.
20 | * **layer**: A deep learning network consists of multiple layers. The more layers, the deeper your network.
21 | * **learning rate**: The step size to take in the gradient descent algorithm.
22 | * **regularization rate**: How strongly the [L2 regularization](http://cs231n.github.io/neural-networks-2/#reg) is applied to avoid overfitting on train data.
23 | * **[validation set](https://en.wikipedia.org/wiki/Test_set#Validation_set)**: Part of the data that is kept apart to evaluate the performance of your model and to choose hyperparameters.
24 |
25 |
26 |
27 |
28 | ### Input data:
29 | *X_train* => Nr samples **x** Nr timesteps **x** Nr channels
30 |
31 | *y_train_binary* => Nr samples **x** Nr classes
32 |
33 | ### Generate models:
34 | Generate one or multiple untrained Keras models with random hyperparameters.
35 |
36 | ```
37 | num_classes = y_train_binary.shape[1]
38 | models = modelgen.generate_models(X_train.shape, number_of_classes=num_classes, number_of_models = 2)
39 | ```
40 |
41 | ### Train multiple models:
42 | Tries out a number of models on a subsample of the data; the returned validation accuracies can then be used to select the best architecture and hyperparameters.
43 | ```
44 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(
45 | X_train, y_train_binary, X_val, y_val_binary,
46 | models, nr_epochs=5, subset_size=300,
47 | verbose=True, outputfile=outputfile)
48 | ```
49 | ### Select best model
50 | ```
51 | best_model_index = np.argmax(val_accuracies)
52 | best_model, best_params, best_model_types = models[best_model_index]
53 | ```
54 |
55 | ### Train one specific model (this is done with the Keras `fit` function):
56 | ```
57 | best_model.fit(X_train, y_train_binary,
58 |                epochs=25, validation_data=(X_val, y_val_binary))
59 | ```
60 |
--------------------------------------------------------------------------------
/linter_profile.yaml:
--------------------------------------------------------------------------------
1 | output-format: json
2 |
3 | strictness: medium
4 | test-warnings: true
5 | doc-warnings: false
6 |
7 | pylint:
8 | disable:
9 | - wrong-import-position
10 | - redefined-builtin
11 | - bare-except
12 | - unused-argument
13 | - dangerous-default-value
14 | - too-many-branches
15 | - too-many-arguments
16 | - too-many-locals
17 | - protected-access
18 |
19 | pep8:
20 | disable:
21 | - E722
22 |
--------------------------------------------------------------------------------
/mcflylogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NLeSC/mcfly-tutorial/4b1548058c158d0efef41bfb6c7b2caa575a8858/mcflylogo.png
--------------------------------------------------------------------------------
/notebooks/experiments/Actitracker_train.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [
10 | {
11 | "name": "stderr",
12 | "output_type": "stream",
13 | "text": [
14 | "Using TensorFlow backend.\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "import sys\n",
20 | "import os\n",
21 | "import numpy as np\n",
22 | "import pandas as pd\n",
23 | "import json\n",
24 | "# mcfly\n",
25 | "from mcfly import modelgen, find_architecture, storage"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "data_path = '/media/sf_VBox_Shared/timeseries/actitiracker/WISDM_at_v2.0/'\n",
37 | "preprocessed_path = os.path.join(data_path, 'preprocessed')\n",
38 | "result_path = os.path.join(data_path, 'models')"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 3,
44 | "metadata": {
45 | "collapsed": false
46 | },
47 | "outputs": [],
48 | "source": [
49 | "X_train = np.load(os.path.join(preprocessed_path, 'X_train.npy'))\n",
50 | "X_val = np.load(os.path.join(preprocessed_path, 'X_val.npy'))\n",
51 | "X_test = np.load(os.path.join(preprocessed_path, 'X_test.npy'))\n",
52 | "y_train = np.load(os.path.join(preprocessed_path, 'y_train.npy'))\n",
53 | "y_val = np.load(os.path.join(preprocessed_path, 'y_val.npy'))\n",
54 | "y_test = np.load(os.path.join(preprocessed_path, 'y_test.npy'))"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 4,
60 | "metadata": {
61 | "collapsed": true
62 | },
63 | "outputs": [],
64 | "source": [
65 | "with open(os.path.join(preprocessed_path, 'labels.json')) as f:\n",
66 | " labels = json.load(f)"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "## Generate models"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 5,
79 | "metadata": {
80 | "collapsed": false
81 | },
82 | "outputs": [],
83 | "source": [
84 | "num_classes = y_train.shape[1]\n",
85 | "\n",
86 | "models = modelgen.generate_models(X_train.shape,\n",
87 | " number_of_classes=num_classes,\n",
88 | " number_of_models = 2) #10)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 6,
94 | "metadata": {
95 | "collapsed": false
96 | },
97 | "outputs": [
98 | {
99 | "data": {
100 | "text/plain": [
101 | "Walking 0.445402\n",
102 | "LyingDown 0.055904\n",
103 | "Standing 0.082027\n",
104 | "Sitting 0.281609\n",
105 | "Jogging 0.103971\n",
106 | "Stairs 0.031087\n",
107 | "dtype: float64"
108 | ]
109 | },
110 | "execution_count": 6,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "#what is the fraction of classes in the validation set?\n",
117 | "pd.Series(y_val.mean(axis=0), index=labels)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 7,
123 | "metadata": {
124 | "collapsed": true
125 | },
126 | "outputs": [],
127 | "source": [
128 | "if not os.path.exists(result_path):\n",
129 | " os.makedirs(result_path)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {
136 | "collapsed": true
137 | },
138 | "outputs": [],
139 | "source": [
140 |     "outputfile = os.path.join(result_path, 'modelcomparison.json')\n",
141 | "histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train,\n",
142 | " X_val[:10], y_val[:10],\n",
143 | " models,nr_epochs=1, #5,\n",
144 | " subset_size=128, # 512,\n",
145 | " verbose=True,\n",
146 | " batch_size=32,\n",
147 |     "                                                                           outputfile=outputfile,\n",
148 | " early_stopping=True)"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 14,
154 | "metadata": {
155 | "collapsed": false,
156 | "scrolled": true
157 | },
158 | "outputs": [
159 | {
160 | "name": "stdout",
161 | "output_type": "stream",
162 | "text": [
163 | "Details of the training process were stored in /media/sf_VBox_Shared/timeseries/actitiracker/WISDM_at_v2.0/models_test/models.json\n"
164 | ]
165 | }
166 | ],
167 | "source": [
168 | "print('Details of the training process were stored in ',outputfile)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 15,
174 | "metadata": {
175 | "collapsed": false
176 | },
177 | "outputs": [
178 | {
179 | "name": "stdout",
180 | "output_type": "stream",
181 | "text": [
182 | "Model type and parameters of the best model:\n",
183 | "DeepConvLSTM\n",
184 | "{'regularization_rate': 0.00574537358824132, 'lstm_dims': [48, 52, 42, 38, 56], 'filters': [100, 87, 31, 82, 70], 'learning_rate': 0.0011995620624020058}\n"
185 | ]
186 | }
187 | ],
188 | "source": [
189 | "best_model_index = np.argmax(val_accuracies)\n",
190 | "best_model, best_params, best_model_types = models[best_model_index]\n",
191 | "print('Model type and parameters of the best model:')\n",
192 | "print(best_model_types)\n",
193 | "print(best_params)\n"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {
200 | "collapsed": false,
201 | "scrolled": true
202 | },
203 | "outputs": [],
204 | "source": [
205 | "nr_epochs = 3\n",
206 | "datasize = 128 # X_train.shape[0]\n",
207 | "history = best_model.fit(X_train[:datasize,:,:], y_train[:datasize,:],\n",
208 | " epochs=nr_epochs, validation_data=(X_val, y_val))"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 20,
214 | "metadata": {
215 | "collapsed": false
216 | },
217 | "outputs": [],
218 | "source": [
219 | "best_model.save(os.path.join(result_path, 'best_model.h5'))"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {
226 | "collapsed": false
227 | },
228 | "outputs": [],
229 | "source": [
230 | "from keras.models import load_model\n",
231 | "best_model = load_model(os.path.join(result_path, 'best_model.h5'))"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 13,
237 | "metadata": {
238 | "collapsed": false
239 | },
240 | "outputs": [
241 | {
242 | "name": "stdout",
243 | "output_type": "stream",
244 | "text": [
245 | "1641/1641 [==============================] - 110s \n",
246 | "Score of best model: [3.3572144339827048, 0.57586837305229366]\n"
247 | ]
248 | }
249 | ],
250 | "source": [
251 | "## Test on Testset\n",
252 | "score_test = best_model.evaluate(X_test, y_test, verbose=True)\n",
253 | "print('Score of best model: ' + str(score_test))"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {
260 | "collapsed": true
261 | },
262 | "outputs": [],
263 | "source": [
264 |     "probs = best_model.predict_proba(X_test)\n",
265 |     "predicted = probs.argmax(axis=1)\n",
266 |     "y_index = y_test.argmax(axis=1)\n",
267 | "confusion_matrix = pd.crosstab(pd.Series(y_index), pd.Series(predicted))\n",
268 | "confusion_matrix.index = [labels[i] for i in confusion_matrix.index]\n",
269 | "confusion_matrix.columns = [labels[i] for i in confusion_matrix.columns]\n",
270 | "confusion_matrix.reindex(columns=[l for l in labels], fill_value=0)\n",
271 | "confusion_matrix"
272 | ]
273 | }
274 | ],
275 | "metadata": {
276 | "anaconda-cloud": {},
277 | "kernelspec": {
278 | "display_name": "Python [conda env:mcfly]",
279 | "language": "python",
280 | "name": "conda-env-mcfly-py"
281 | },
282 | "language_info": {
283 | "codemirror_mode": {
284 | "name": "ipython",
285 | "version": 3
286 | },
287 | "file_extension": ".py",
288 | "mimetype": "text/x-python",
289 | "name": "python",
290 | "nbconvert_exporter": "python",
291 | "pygments_lexer": "ipython3",
292 | "version": "3.5.2"
293 | }
294 | },
295 | "nbformat": 4,
296 | "nbformat_minor": 0
297 | }
298 |
--------------------------------------------------------------------------------
/notebooks/experiments/EEG_alcoholic_preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "from io import StringIO\n",
14 | "import os"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "def read_datafile(fn):\n",
26 | " with open(fn, 'r') as f:\n",
27 | " header_1 = f.readline()\n",
28 | " subject = header_1.split('.')[0][2:]\n",
29 | " header_2 = f.readline()\n",
30 | " header_3 = f.readline()\n",
31 | " header_4 = f.readline()\n",
32 | " stimulus = ' '.join(header_4.split(' ')[1:3])\n",
33 | " header_5 = f.readline()\n",
34 | " rest = f.read()\n",
35 | " if(len(rest)>0):\n",
36 | " data_trial = pd.read_csv(StringIO(rest), sep=' ', header=None)\n",
37 | " data_trial.columns = ['trial', 'sensor', 'sample', 'value']\n",
38 | " data_trial['subject'] = subject\n",
39 | " data_trial['stimulus'] = stimulus\n",
40 | " else:\n",
41 | " data_trial = None\n",
42 | " return data_trial"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {
49 | "collapsed": true
50 | },
51 | "outputs": [],
52 | "source": [
53 | "data_dir = '/media/sf_VBox_Shared/timeseries/UCI_EEG_alcoholic/data/'"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 6,
59 | "metadata": {
60 | "collapsed": false
61 | },
62 | "outputs": [
63 | {
64 | "name": "stdout",
65 | "output_type": "stream",
66 | "text": [
67 | " "
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "%%prun\n",
73 | "dat = pd.DataFrame(columns=['subject', 'stimulus', 'trial', 'sensor', 'sample', 'value'])\n",
74 | "for fn in os.listdir(data_dir)[:100]:\n",
75 | " full_fn = os.path.join(data_dir, fn)\n",
76 | " if os.path.isfile(full_fn):\n",
77 | " try:\n",
78 | " data_trial = read_datafile(full_fn)\n",
79 | " if data_trial is not None:\n",
80 | " dat = dat.append(data_trial)\n",
81 | " except Exception as err:\n",
82 | " print(fn, err)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 19,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [
92 | {
93 | "data": {
94 | "text/plain": [
95 | "(2119, 2)"
96 | ]
97 | },
98 | "execution_count": 19,
99 | "metadata": {},
100 | "output_type": "execute_result"
101 | }
102 | ],
103 | "source": [
104 | "dat[['subject', 'trial']].drop_duplicates().shape"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 4,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "co2c1000367.rd.065 No columns to parse from file\n",
119 | "co2c1000367.rd.089 No columns to parse from file\n",
120 | "co2c1000367.rd.090 No columns to parse from file\n",
121 | "co2c1000367.rd.105 No columns to parse from file\n",
122 | "co2c1000367.rd.113 No columns to parse from file\n",
123 | "co2c1000367.rd.114 No columns to parse from file\n",
124 | "co2c1000367.rd.116 No columns to parse from file\n",
125 | "co2c1000367.rd.117 No columns to parse from file\n",
126 | "co2c1000367.rd.004 No columns to parse from file\n",
127 | "co2c1000367.rd.005 No columns to parse from file\n",
128 | "co2c1000367.rd.006 No columns to parse from file\n",
129 | "co2c1000367.rd.023 No columns to parse from file\n",
130 | "co2c1000367.rd.029 No columns to parse from file\n",
131 | "co2c1000367.rd.037 No columns to parse from file\n",
132 | "co2c1000367.rd.042 No columns to parse from file\n",
133 | "co2c1000367.rd.053 No columns to parse from file\n",
134 | "co2c1000367.rd.054 No columns to parse from file\n",
135 | "318.12917041778564 s\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "#%%prun\n",
141 | "import time\n",
142 | "\n",
143 | "Xes = []\n",
144 | "sensors = []\n",
145 | "labels = []\n",
146 | "headers = []\n",
147 | "\n",
148 | "t = time.time()\n",
149 | "for fn in os.listdir(data_dir):\n",
150 | "#for fn in np.random.choice(os.listdir(data_dir), 100):\n",
151 | " full_fn = os.path.join(data_dir, fn)\n",
152 | " if os.path.isfile(full_fn):\n",
153 | " try:\n",
154 | " data_trial = pd.read_csv(full_fn, sep=' ', header=None, comment='#')\n",
155 | " if data_trial is not None:\n",
156 | " data_trial.columns = ['trial', 'sensor', 'sample', 'value']\n",
157 | " pivoted = data_trial.pivot_table(index='sample', columns='sensor', values='value')\n",
158 | " Xes.append(pivoted.as_matrix())\n",
159 | " labels.append(fn[3])\n",
160 | " sensors.append(pivoted.columns)\n",
161 | " with open(full_fn, 'r') as f:\n",
162 | " header = [f.readline() for i in range(5)]\n",
163 | " headers.append(header)\n",
164 | " except Exception as err:\n",
165 | " print(fn, err)\n",
166 | "print((time.time()-t), 's')"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 12,
172 | "metadata": {
173 | "collapsed": false
174 | },
175 | "outputs": [
176 | {
177 | "data": {
178 | "text/plain": [
179 | "# S1 obj 5477\n",
180 | "# S2 match 2757\n",
181 | "# S2 nomatch 2728\n",
182 | "# S2 match err 60\n",
183 | "# S2 nomatch err 35\n",
184 | "dtype: int64"
185 | ]
186 | },
187 | "execution_count": 12,
188 | "metadata": {},
189 | "output_type": "execute_result"
190 | }
191 | ],
192 | "source": [
193 | "stimuli = [h[3].split(',')[0].strip() for h in headers]\n",
194 | "pd.Series(stimuli).value_counts()"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 40,
200 | "metadata": {
201 | "collapsed": false
202 | },
203 | "outputs": [],
204 | "source": [
205 | "subjects = [h[0].split(' ')[1].strip() for h in headers]\n",
206 | "trials = [h[3].split('trial ')[-1].strip() for h in headers]"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 41,
212 | "metadata": {
213 | "collapsed": false
214 | },
215 | "outputs": [
216 | {
217 | "name": "stdout",
218 | "output_type": "stream",
219 | "text": [
220 | "(11057, 3)\n"
221 | ]
222 | },
223 | {
224 | "data": {
225 | "text/html": [
226 | "\n",
227 | "
\n",
228 | " \n",
229 | " \n",
230 | " | \n",
231 | " stimuli | \n",
232 | " subject | \n",
233 | " trial | \n",
234 | "
\n",
235 | " \n",
236 | " \n",
237 | " \n",
238 | " 0 | \n",
239 | " # S2 nomatch | \n",
240 | " co3c0000402.rd | \n",
241 | " 13 | \n",
242 | "
\n",
243 | " \n",
244 | " 1 | \n",
245 | " # S1 obj | \n",
246 | " co3c0000402.rd | \n",
247 | " 14 | \n",
248 | "
\n",
249 | " \n",
250 | " 2 | \n",
251 | " # S2 nomatch | \n",
252 | " co3c0000402.rd | \n",
253 | " 15 | \n",
254 | "
\n",
255 | " \n",
256 | " 3 | \n",
257 | " # S1 obj | \n",
258 | " co3c0000402.rd | \n",
259 | " 16 | \n",
260 | "
\n",
261 | " \n",
262 | " 4 | \n",
263 | " # S2 match | \n",
264 | " co3c0000402.rd | \n",
265 | " 17 | \n",
266 | "
\n",
267 | " \n",
268 | "
\n",
269 | "
"
270 | ],
271 | "text/plain": [
272 | " stimuli subject trial\n",
273 | "0 # S2 nomatch co3c0000402.rd 13\n",
274 | "1 # S1 obj co3c0000402.rd 14\n",
275 | "2 # S2 nomatch co3c0000402.rd 15\n",
276 | "3 # S1 obj co3c0000402.rd 16\n",
277 | "4 # S2 match co3c0000402.rd 17"
278 | ]
279 | },
280 | "execution_count": 41,
281 | "metadata": {},
282 | "output_type": "execute_result"
283 | }
284 | ],
285 | "source": [
286 | "metadata = pd.DataFrame({'subject': subjects, 'trial': trials, 'stimuli': stimuli})\n",
287 | "print(metadata.shape)\n",
288 | "metadata.head()"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 42,
294 | "metadata": {
295 | "collapsed": false
296 | },
297 | "outputs": [
298 | {
299 | "data": {
300 | "text/plain": [
301 | "(11057, 256, 64)"
302 | ]
303 | },
304 | "execution_count": 42,
305 | "metadata": {},
306 | "output_type": "execute_result"
307 | }
308 | ],
309 | "source": [
310 | "# The shape should be: (num_samples, num_timesteps, num_channels)\n",
311 | "Xa = np.array(Xes)\n",
312 | "Xa.shape"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 43,
318 | "metadata": {
319 | "collapsed": false
320 | },
321 | "outputs": [
322 | {
323 | "data": {
324 | "text/html": [
325 | "\n",
326 | "
\n",
327 | " \n",
328 | " \n",
329 | " | \n",
330 | " 0 | \n",
331 | " 1 | \n",
332 | " 2 | \n",
333 | " 3 | \n",
334 | " 4 | \n",
335 | " 5 | \n",
336 | " 6 | \n",
337 | " 7 | \n",
338 | " 8 | \n",
339 | " 9 | \n",
340 | " ... | \n",
341 | " 54 | \n",
342 | " 55 | \n",
343 | " 56 | \n",
344 | " 57 | \n",
345 | " 58 | \n",
346 | " 59 | \n",
347 | " 60 | \n",
348 | " 61 | \n",
349 | " 62 | \n",
350 | " 63 | \n",
351 | "
\n",
352 | " \n",
353 | " \n",
354 | " \n",
355 | " 0 | \n",
356 | " AF1 | \n",
357 | " AF2 | \n",
358 | " AF7 | \n",
359 | " AF8 | \n",
360 | " AFZ | \n",
361 | " C1 | \n",
362 | " C2 | \n",
363 | " C3 | \n",
364 | " C4 | \n",
365 | " C5 | \n",
366 | " ... | \n",
367 | " PO8 | \n",
368 | " POZ | \n",
369 | " PZ | \n",
370 | " T7 | \n",
371 | " T8 | \n",
372 | " TP7 | \n",
373 | " TP8 | \n",
374 | " X | \n",
375 | " Y | \n",
376 | " nd | \n",
377 | "
\n",
378 | " \n",
379 | "
\n",
380 | "
1 rows × 64 columns
\n",
381 | "
"
382 | ],
383 | "text/plain": [
384 | " 0 1 2 3 4 5 6 7 8 9 ... 54 55 56 57 58 59 \\\n",
385 | "0 AF1 AF2 AF7 AF8 AFZ C1 C2 C3 C4 C5 ... PO8 POZ PZ T7 T8 TP7 \n",
386 | "\n",
387 | " 60 61 62 63 \n",
388 | "0 TP8 X Y nd \n",
389 | "\n",
390 | "[1 rows x 64 columns]"
391 | ]
392 | },
393 | "execution_count": 43,
394 | "metadata": {},
395 | "output_type": "execute_result"
396 | }
397 | ],
398 | "source": [
399 | "pd.DataFrame(sensors).drop_duplicates()"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 44,
405 | "metadata": {
406 | "collapsed": false
407 | },
408 | "outputs": [
409 | {
410 | "data": {
411 | "text/plain": [
412 | "a 7033\n",
413 | "c 4024\n",
414 | "dtype: int64"
415 | ]
416 | },
417 | "execution_count": 44,
418 | "metadata": {},
419 | "output_type": "execute_result"
420 | }
421 | ],
422 | "source": [
423 | "# How many subjects do we have for each label?\n",
424 | "pd.Series(labels).value_counts()"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": 45,
430 | "metadata": {
431 | "collapsed": false
432 | },
433 | "outputs": [
434 | {
435 | "data": {
436 | "text/plain": [
437 | "10962"
438 | ]
439 | },
440 | "execution_count": 45,
441 | "metadata": {},
442 | "output_type": "execute_result"
443 | }
444 | ],
445 | "source": [
446 | "# filter for errors\n",
447 | "no_error = ~metadata.stimuli.str.contains('err')\n",
448 | "sum(no_error)"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": 46,
454 | "metadata": {
455 | "collapsed": false
456 | },
457 | "outputs": [
458 | {
459 | "data": {
460 | "text/plain": [
461 | "(10962, 256, 64)"
462 | ]
463 | },
464 | "execution_count": 46,
465 | "metadata": {},
466 | "output_type": "execute_result"
467 | }
468 | ],
469 | "source": [
470 | "Xa_filtered = Xa[no_error]\n",
471 | "Xa_filtered.shape"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": 53,
477 | "metadata": {
478 | "collapsed": false
479 | },
480 | "outputs": [
481 | {
482 | "data": {
483 | "text/plain": [
484 | "0"
485 | ]
486 | },
487 | "execution_count": 53,
488 | "metadata": {},
489 | "output_type": "execute_result"
490 | }
491 | ],
492 | "source": [
493 | "# Do we have NaN values?\n",
494 | "np.isnan(Xa_filtered).sum()"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": 47,
500 | "metadata": {
501 | "collapsed": true
502 | },
503 | "outputs": [],
504 | "source": [
505 | "metadata_filtered = metadata[no_error]"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": 64,
511 | "metadata": {
512 | "collapsed": false
513 | },
514 | "outputs": [],
515 | "source": [
516 | "preprocessed_path = '/media/sf_VBox_Shared/timeseries/UCI_EEG_alcoholic/preprocessed/'\n",
517 | "np.save(os.path.join(preprocessed_path, 'X.npy'), arr=Xa_filtered)\n",
518 | "metadata_filtered.to_csv(os.path.join(preprocessed_path, 'metadata.csv'), index=False)"
519 | ]
520 | },
521 | {
522 | "cell_type": "code",
523 | "execution_count": 56,
524 | "metadata": {
525 | "collapsed": false
526 | },
527 | "outputs": [
528 | {
529 | "name": "stdout",
530 | "output_type": "stream",
531 | "text": [
532 | "8769 1096 1097\n"
533 | ]
534 | }
535 | ],
536 | "source": [
537 | "# Create train and test set\n",
538 | "n = Xa_filtered.shape[0]\n",
539 | "n_train = int(0.8*n)\n",
540 | "n_val = int(0.1*n)\n",
541 | "n_test = n - n_train - n_val\n",
542 | "print(n_train, n_val, n_test)\n",
543 | "\n",
544 | "ind_perm = np.random.permutation(n)\n",
545 | "ind_train = ind_perm[:n_train]\n",
546 | "ind_val = ind_perm[n_train:n_train+n_val]\n",
547 | "ind_test = ind_perm[-n_test:]"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": 65,
553 | "metadata": {
554 | "collapsed": false
555 | },
556 | "outputs": [],
557 | "source": [
558 | "np.save(os.path.join(preprocessed_path, 'X_train.npy'), arr=Xa_filtered[ind_train])\n",
559 | "np.save(os.path.join(preprocessed_path, 'X_val.npy'), arr=Xa_filtered[ind_val])\n",
560 | "np.save(os.path.join(preprocessed_path, 'X_test.npy'), arr=Xa_filtered[ind_test])"
561 | ]
562 | },
563 | {
564 | "cell_type": "code",
565 | "execution_count": 71,
566 | "metadata": {
567 | "collapsed": false
568 | },
569 | "outputs": [
570 | {
571 | "data": {
572 | "text/plain": [
573 | "(10962, 2)"
574 | ]
575 | },
576 | "execution_count": 71,
577 | "metadata": {},
578 | "output_type": "execute_result"
579 | }
580 | ],
581 | "source": [
582 | "# make binary labels\n",
583 | "y = np.zeros((len(labels), 2))\n",
584 | "y[:, 0] = [1*(l=='a') for l in labels]\n",
585 | "y[:, 1] = [1*(l=='c') for l in labels]\n",
586 | "y_filtered = y[no_error]\n",
587 | "y_filtered.shape"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": 72,
593 | "metadata": {
594 | "collapsed": false
595 | },
596 | "outputs": [],
597 | "source": [
598 | "np.save(os.path.join(preprocessed_path, 'y.npy'), arr=y_filtered)\n",
599 | "np.save(os.path.join(preprocessed_path, 'y_train.npy'), arr=y_filtered[ind_train])\n",
600 | "np.save(os.path.join(preprocessed_path, 'y_val.npy'), arr=y_filtered[ind_val])\n",
601 | "np.save(os.path.join(preprocessed_path, 'y_test.npy'), arr=y_filtered[ind_test])"
602 | ]
603 | }
604 | ],
605 | "metadata": {
606 | "anaconda-cloud": {},
607 | "kernelspec": {
608 | "display_name": "Python [conda env:mcfly]",
609 | "language": "python",
610 | "name": "conda-env-mcfly-py"
611 | },
612 | "language_info": {
613 | "codemirror_mode": {
614 | "name": "ipython",
615 | "version": 3
616 | },
617 | "file_extension": ".py",
618 | "mimetype": "text/x-python",
619 | "name": "python",
620 | "nbconvert_exporter": "python",
621 | "pygments_lexer": "ipython3",
622 | "version": "3.5.2"
623 | }
624 | },
625 | "nbformat": 4,
626 | "nbformat_minor": 0
627 | }
628 |
--------------------------------------------------------------------------------
/notebooks/experiments/EEG_alcoholic_train.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import sys\n",
12 | "import os\n",
13 | "import numpy as np\n",
14 | "import pandas as pd\n",
15 | "# mcfly\n",
16 | "from mcfly import modelgen, find_architecture, storage"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 3,
22 | "metadata": {
23 | "collapsed": true
24 | },
25 | "outputs": [],
26 | "source": [
27 | "data_path = '/media/sf_VBox_Shared/timeseries/UCI_EEG_alcoholic/'\n",
28 | "preprocessed_path = os.path.join(data_path, 'preprocessed')\n",
29 | "result_path = os.path.join(data_path, 'models')"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 4,
35 | "metadata": {
36 | "collapsed": true
37 | },
38 | "outputs": [],
39 | "source": [
40 | "\n",
41 | "X_train = np.load(os.path.join(preprocessed_path, 'X_train.npy'))\n",
42 | "X_val = np.load(os.path.join(preprocessed_path, 'X_val.npy'))\n",
43 | "X_test = np.load(os.path.join(preprocessed_path, 'X_test.npy'))\n",
44 | "y_train = np.load(os.path.join(preprocessed_path, 'y_train.npy'))\n",
45 | "y_val = np.load(os.path.join(preprocessed_path, 'y_val.npy'))\n",
46 | "y_test = np.load(os.path.join(preprocessed_path, 'y_test.npy'))"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "## Generate models"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 5,
59 | "metadata": {
60 | "collapsed": false
61 | },
62 | "outputs": [],
63 | "source": [
64 | "num_classes = y_train.shape[1]\n",
65 | "\n",
66 | "models = modelgen.generate_models(X_train.shape,\n",
67 | " number_of_classes=num_classes,\n",
68 | " number_of_models = 10)"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 6,
74 | "metadata": {
75 | "collapsed": false
76 | },
77 | "outputs": [
78 | {
79 | "data": {
80 | "text/plain": [
81 | "array([ 0.63868613, 0.36131387])"
82 | ]
83 | },
84 | "execution_count": 6,
85 | "metadata": {},
86 | "output_type": "execute_result"
87 | }
88 | ],
89 | "source": [
90 | "#what is the fraction of a vs c in the validation set?\n",
91 | "y_val.mean(axis=0)"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 7,
97 | "metadata": {
98 | "collapsed": true
99 | },
100 | "outputs": [],
101 | "source": [
102 | "if not os.path.exists(result_path):\n",
103 | " os.makedirs(result_path)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {
110 | "collapsed": false
111 | },
112 | "outputs": [
113 | {
114 | "name": "stdout",
115 | "output_type": "stream",
116 | "text": [
117 | "Training model 0 DeepConvLSTM\n",
118 | "Train on 512 samples, validate on 1096 samples\n",
119 | "Epoch 1/5\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "outputfile = os.path.join(result_path, 'modelcomparison.json')\n",
125 | "histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train,\n",
126 | " X_val, y_val,\n",
127 | " models,nr_epochs=5,\n",
128 | " subset_size=512,\n",
129 | " verbose=True,\n",
130 | " batch_size=32,\n",
131 | " outputfile=outputfile,\n",
132 | " early_stopping=True)\n",
133 | "print('Details of the training process were stored in ',outputfile)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {
140 | "collapsed": true
141 | },
142 | "outputs": [],
143 | "source": []
144 | }
145 | ],
146 | "metadata": {
147 | "anaconda-cloud": {},
148 | "kernelspec": {
149 | "display_name": "Python [conda env:mcfly]",
150 | "language": "python",
151 | "name": "conda-env-mcfly-py"
152 | },
153 | "language_info": {
154 | "codemirror_mode": {
155 | "name": "ipython",
156 | "version": 3
157 | },
158 | "file_extension": ".py",
159 | "mimetype": "text/x-python",
160 | "name": "python",
161 | "nbconvert_exporter": "python",
162 | "pygments_lexer": "ipython3",
163 | "version": "3.5.2"
164 | }
165 | },
166 | "nbformat": 4,
167 | "nbformat_minor": 0
168 | }
169 |
--------------------------------------------------------------------------------
/notebooks/experiments/Preprocess_PAMAP2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [
10 | {
11 | "name": "stderr",
12 | "output_type": "stream",
13 | "text": [
14 | "Using Theano backend.\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "%load_ext autoreload\n",
20 | "%autoreload 2\n",
21 | "import sys\n",
22 | "import os\n",
23 | "sys.path.insert(0, os.path.abspath('../..'))\n",
24 | "import numpy as np\n",
25 | "import pandas as pd\n",
26 | "# mcfly\n",
27 | "from mcfly import tutorial_pamap2, modelgen, find_architecture, storage"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {
34 | "collapsed": true
35 | },
36 | "outputs": [],
37 | "source": [
38 | "# Specify in which directory you want to store the data:\n",
39 | "directory_to_extract_to = \"/media/sf_VBox_Shared/timeseries/\""
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "['timestamp', 'activityID', 'heartrate', 'hand_temperature', 'hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z', 'hand_acc_6g_x', 'hand_acc_6g_y', 'hand_acc_6g_z', 'hand_gyroscope_x', 'hand_gyroscope_y', 'hand_gyroscope_z', 'hand_magnometer_x', 'hand_magnometer_y', 'hand_magnometer_z', 'hand_orientation_0', 'hand_orientation_1', 'hand_orientation_2', 'hand_orientation_3', 'chest_temperature', 'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z', 'chest_acc_6g_x', 'chest_acc_6g_y', 'chest_acc_6g_z', 'chest_gyroscope_x', 'chest_gyroscope_y', 'chest_gyroscope_z', 'chest_magnometer_x', 'chest_magnometer_y', 'chest_magnometer_z', 'chest_orientation_0', 'chest_orientation_1', 'chest_orientation_2', 'chest_orientation_3', 'ankle_temperature', 'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z', 'ankle_acc_6g_x', 'ankle_acc_6g_y', 'ankle_acc_6g_z', 'ankle_gyroscope_x', 'ankle_gyroscope_y', 'ankle_gyroscope_z', 'ankle_magnometer_x', 'ankle_magnometer_y', 'ankle_magnometer_z', 'ankle_orientation_0', 'ankle_orientation_1', 'ankle_orientation_2', 'ankle_orientation_3']\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "header = tutorial_pamap2.get_header()\n",
59 | "print(header)"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "19 possible activities:\n",
67 | "\n",
68 |     "– 1 lying – 2 sitting – 3 standing – 4 walking – 5 running – 6 cycling – 7 Nordic walking – 9 watching TV – 10 computer work – 11 car driving – 12 ascending stairs – 13 descending stairs – 16 vacuum cleaning – 17 ironing – 18 folding laundry – 19 house cleaning – 20 playing soccer – 24 rope jumping – 0 other (transient activities)"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "Included activities:\n",
76 | "\n",
77 | "(1-lie, 2-sit, 3-stand, 4-walk, 5-run, 6-cycle, 7-Nordic walk, 17-iron, 16-vacuum clean, 24-rope jump, 12-ascend and 13-descend stairs)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 11,
83 | "metadata": {
84 | "collapsed": false
85 | },
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "12\n",
92 | "[9, 10, 11, 18, 19, 20, 0]\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "all_activities = [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 24, 0]\n",
98 | "include_activities = [1, 2, 3, 4, 5, 6, 7, 12, 13, 16, 17, 24]\n",
99 | "print(len(include_activities))\n",
100 | "exclude_activities = [n for n in all_activities if n not in include_activities]\n",
101 | "print(exclude_activities)"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 5,
107 | "metadata": {
108 | "collapsed": true
109 | },
110 | "outputs": [],
111 | "source": [
112 | "columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',\n",
113 | " 'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',\n",
114 | " 'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z']\n",
115 | "outputdir = \"cleaned_12activities_9vars\""
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 8,
121 | "metadata": {
122 | "collapsed": false
123 | },
124 | "outputs": [
125 | {
126 | "name": "stdout",
127 | "output_type": "stream",
128 | "text": [
129 | "Automatic pdb calling has been turned ON\n",
130 | "Data previously downloaded and stored in /media/sf_VBox_Shared/timeseries/PAMAP2/\n",
131 | "Start pre-processing all 9 files...\n",
132 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_0 y_0\n",
133 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_1 y_1\n",
134 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_2 y_2\n",
135 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_3 y_3\n",
136 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_4 y_4\n",
137 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_5 y_5\n",
138 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_6 y_6\n",
139 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_7 y_7\n",
140 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/X_8 y_8\n",
141 | "Processed data succesfully stored in /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars\n"
142 | ]
143 | },
144 | {
145 | "data": {
146 | "text/plain": [
147 | "'/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars'"
148 | ]
149 | },
150 | "execution_count": 8,
151 | "metadata": {},
152 | "output_type": "execute_result"
153 | }
154 | ],
155 | "source": [
156 | "%pdb 1\n",
157 | "tutorial_pamap2.fetch_and_preprocess(directory_to_extract_to, \n",
158 | " columns_to_use=columns_to_use, \n",
159 | " output_dir=outputdir, \n",
160 | " exclude_activities=exclude_activities,\n",
161 | " fold=True)"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 10,
167 | "metadata": {
168 | "collapsed": false
169 | },
170 | "outputs": [
171 | {
172 | "name": "stdout",
173 | "output_type": "stream",
174 | "text": [
175 | "Data previously downloaded and stored in /media/sf_VBox_Shared/timeseries/PAMAP2/\n",
176 | "Start pre-processing all 9 files...\n",
177 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_0 y_0\n",
178 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_1 y_1\n",
179 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_2 y_2\n",
180 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_3 y_3\n",
181 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_4 y_4\n",
182 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_5 y_5\n",
183 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_6 y_6\n",
184 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_7 y_7\n",
185 | "Stored /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/X_8 y_8\n",
186 | "Processed data succesfully stored in /media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars\n"
187 | ]
188 | },
189 | {
190 | "data": {
191 | "text/plain": [
192 | "'/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars'"
193 | ]
194 | },
195 | "execution_count": 10,
196 | "metadata": {},
197 | "output_type": "execute_result"
198 | }
199 | ],
200 | "source": [
201 | "columns_to_use = header[2:]\n",
202 | "outputdir = \"cleaned_12activities_allvars\"\n",
203 | "tutorial_pamap2.fetch_and_preprocess(directory_to_extract_to, \n",
204 | " columns_to_use=columns_to_use, \n",
205 | " output_dir=outputdir, \n",
206 | " exclude_activities=exclude_activities,\n",
207 | " fold=True)"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {
214 | "collapsed": true
215 | },
216 | "outputs": [],
217 | "source": []
218 | }
219 | ],
220 | "metadata": {
221 | "anaconda-cloud": {},
222 | "kernelspec": {
223 | "display_name": "Python [conda env:mcfly]",
224 | "language": "python",
225 | "name": "conda-env-mcfly-py"
226 | },
227 | "language_info": {
228 | "codemirror_mode": {
229 | "name": "ipython",
230 | "version": 3
231 | },
232 | "file_extension": ".py",
233 | "mimetype": "text/x-python",
234 | "name": "python",
235 | "nbconvert_exporter": "python",
236 | "pygments_lexer": "ipython3",
237 | "version": "3.5.2"
238 | }
239 | },
240 | "nbformat": 4,
241 | "nbformat_minor": 0
242 | }
243 |
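The folds stored above (X_0 … X_8, y_0 … y_8) can be read back with numpy. A minimal sketch (not a cell from the notebook), assuming the output directory printed above and `.npy` files as used in the 9-fold experiment notebook:

```python
import os
import numpy as np

# Assumed location: the directory reported by fetch_and_preprocess above
datapath = '/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars'

Xs, ys = [], []
for i in range(9):  # one fold per subject
    Xs.append(np.load(os.path.join(datapath, 'X_{}.npy'.format(i))))
    ys.append(np.load(os.path.join(datapath, 'y_{}.npy'.format(i))))

print(len(Xs), Xs[0].shape)  # 9 folds of shape (samples, time steps, channels)
```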
--------------------------------------------------------------------------------
/notebooks/experiments/dataset_PEMS_prepare.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Dataset: PEMS\n",
9 | "\n",
10 | "http://www.timeseriesclassification.com/description.php?Dataset=PEMS-SF \n",
11 | "https://archive.ics.uci.edu/ml/datasets/PEMS-SF\n",
12 | "\n",
13 | "### Info from data source:\n",
14 | "Source: California Department of Transportation, www.pems.dot.ca.gov\n",
15 | "Creator: Marco Cuturi, Kyoto University, mcuturi '@' i.kyoto-u.ac.jp\n",
16 | "\n",
17 | "Data Set Information:\n",
18 | "\n",
19 | "15 months worth of daily data from the California Department of Transportation PEMS website. The data describes the occupancy\n",
20 | "rate, between 0 and 1, of different car lanes of San Francisco bay area freeways. The measurements cover the period from Jan. 1st 2008 to Mar. 30th 2009 and are sampled every 10 minutes. We consider each day in this database as a single time series of dimension 963 (the number of sensors which functioned consistently throughout the studied period) and length 6 x 24=144. We remove public holidays from the dataset, as well\n",
21 | "as two days with anomalies (March 8th 2009 and March 9th 2008) where all sensors were muted between 2:00 and 3:00 AM.\n",
22 | "This results in a database of 440 time series.\n",
23 | "\n",
24 | "The task is to classify each observed day as the correct day of the week, from Monday to Sunday, e.g. label it with an integer in {1,2,3,4,5,6,7}.\n",
25 | "Each attribute describes the measurement of the occupancy rate (between 0 and 1) of a captor location as recorded by a measuring station, at a given timestamp in time during the day. The ID of each station is given in the stations_list text file. For more information on the location (GPS, Highway, Direction) of each station please refer to the PEMS website. There are 963 (stations) x 144 (timestamps) = 138.672 attributes for each record.\n",
26 | "\n",
27 | "Relevant Papers:\n",
28 | "[1] M. Cuturi, Fast Global Alignment Kernels, Proceedings of the Intern. Conference on Machine Learning 2011.\n",
29 | "\n",
30 | "\n",
31 | "### Size:\n",
32 | "+ Training samples: 267\n",
33 | "+ Test sampels: 173\n",
34 | "+ Dimension: 144 timepoints x 963 channels\n",
35 | "+ Classes: 7\n"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 1,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import numpy as np\n",
45 | "import os\n",
46 | "import sys\n",
47 | "import pandas as pd\n",
48 | "\n",
49 | "CODE = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\mcfly\\\\mcfly'\n",
50 | "DATA = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\PEMS-SF'\n",
51 | "sys.path.append(CODE)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 2,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "file_train = os.path.join(DATA, 'PEMS-SF_TRAIN.arff')\n",
61 | "file_test = os.path.join(DATA, 'PEMS-SF_TEST.arff')"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "def load_arff(filename):\n",
71 | " start = 0\n",
72 | "\n",
73 | " data = []\n",
74 | " labels = []\n",
75 | " start_line = 0\n",
76 | " with open(filename) as fp:\n",
77 | " line = fp.readline()\n",
78 | " count = 0\n",
79 | " while line:\n",
80 | " if start == 1:\n",
81 | " label = line.split(\"',\")[-1]\n",
82 | " labels.append(label.replace('\\n', ''))\n",
83 | " line = line.split(\"',\")[0] \n",
84 | " lines = line.split('\\\\n')\n",
85 | " data_line = []\n",
86 | " for l in lines:\n",
87 | " data_line_sub = []\n",
88 | " #for entry in l.split(','):\n",
89 | " #data_line_sub.append(entry.replace(\"'\", \"\"))\n",
90 | " #data_line.append(data_line_sub)\n",
91 | " data_line.append([x.replace(\"'\", \"\") for x in l.split(',')])\n",
92 | " data.append(data_line)\n",
93 | "\n",
94 | " if line.startswith('@data'):\n",
95 | " start_line = count\n",
96 | " #print(\"Actual data start in line\", start_line)\n",
97 | " start = 1\n",
98 | "\n",
99 | " line = fp.readline()\n",
100 | " count += 1\n",
101 | " \n",
102 | " return np.swapaxes(np.array(data).astype(float), 1,2), labels\n",
103 | "\n",
104 | "X_train, y_train = load_arff(file_train)\n",
105 | "X_test0, y_test0 = load_arff(file_test)"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 4,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "name": "stdout",
115 | "output_type": "stream",
116 | "text": [
117 | "X_train.shape (267, 144, 963)\n",
118 | "267\n",
119 | "X_test.shape (173, 144, 963)\n",
120 | "173\n"
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "print(\"X_train.shape\", X_train.shape)\n",
126 | "print(len(y_train))\n",
127 | "\n",
128 | "print(\"X_test.shape\", X_test0.shape)\n",
129 | "print(len(y_test0))"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 5,
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "data": {
139 | "text/plain": [
140 | "numpy.float64"
141 | ]
142 | },
143 | "execution_count": 5,
144 | "metadata": {},
145 | "output_type": "execute_result"
146 | }
147 | ],
148 | "source": [
149 | "type(X_train[0,0,0])"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 6,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "data": {
159 | "text/plain": [
160 | "array([0.0134, 0.0129, 0.0122, 0.0105, 0.0103, 0.0095, 0.0086, 0.0084,\n",
161 | " 0.0079, 0.0075, 0.0075, 0.0076, 0.0073, 0.0073, 0.007 , 0.0074,\n",
162 | " 0.0074, 0.0072, 0.0071, 0.0078, 0.0078, 0.0101, 0.0109, 0.0111,\n",
163 | " 0.0113, 0.0126, 0.0161, 0.0175, 0.0238, 0.0247, 0.0275, 0.0314,\n",
164 | " 0.0397, 0.0532, 0.0568, 0.0593, 0.0589, 0.0721, 0.0765, 0.0893,\n",
165 | " 0.0947, 0.0951, 0.094 , 0.0987, 0.1094, 0.1108, 0.1159, 0.1143,\n",
166 | " 0.1076, 0.1083, 0.1078, 0.1052, 0.1051, 0.0975, 0.0931, 0.0879,\n",
167 | " 0.086 , 0.0861, 0.0857, 0.0834, 0.0754, 0.0745, 0.0736, 0.0731,\n",
168 | " 0.0742, 0.0725, 0.0691, 0.0704, 0.0711, 0.072 , 0.0713, 0.0699,\n",
169 | " 0.0683, 0.0703, 0.0707, 0.0714, 0.0719, 0.0718, 0.0683, 0.0703,\n",
170 | " 0.071 , 0.0703, 0.0723, 0.0706, 0.0698, 0.072 , 0.0736, 0.0744,\n",
171 | " 0.0774, 0.0743, 0.0731, 0.079 , 0.079 , 0.077 , 0.0814, 0.0794,\n",
172 | " 0.0759, 0.0791, 0.0769, 0.0765, 0.0823, 0.081 , 0.0813, 0.0865,\n",
173 | " 0.0892, 0.0834, 0.083 , 0.0789, 0.0755, 0.0747, 0.0723, 0.0657,\n",
174 | " 0.0659, 0.0619, 0.0554, 0.0543, 0.0509, 0.0493, 0.046 , 0.0446,\n",
175 | " 0.0413, 0.0419, 0.0417, 0.0391, 0.0383, 0.0374, 0.0376, 0.0399,\n",
176 | " 0.0406, 0.038 , 0.0374, 0.0359, 0.0336, 0.0335, 0.03 , 0.0294,\n",
177 | " 0.0274, 0.0254, 0.0219, 0.0218, 0.0203, 0.0179, 0.0179, 0.0146])"
178 | ]
179 | },
180 | "execution_count": 6,
181 | "metadata": {},
182 | "output_type": "execute_result"
183 | }
184 | ],
185 | "source": [
186 | "X_train[0,:,10]"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "### Split test into test and validation:"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 7,
199 | "metadata": {},
200 | "outputs": [
201 | {
202 | "name": "stdout",
203 | "output_type": "stream",
204 | "text": [
205 | "2.0 25\n",
206 | "3.0 26\n",
207 | "1.0 30\n",
208 | "4.0 23\n",
209 | "7.0 20\n",
210 | "5.0 22\n",
211 | "6.0 27\n"
212 | ]
213 | }
214 | ],
215 | "source": [
216 | "y_val = []\n",
217 | "y_test = []\n",
218 | "IDs_val = []\n",
219 | "IDs_test = []\n",
220 | "\n",
221 | "np.random.seed(1)\n",
222 | "for label in list(set(y_test0)):\n",
223 | " idx = np.where(np.array(y_test0) == label)[0]\n",
224 | " idx1 = np.random.choice(idx, len(idx)//2, replace=False)\n",
225 | " idx2 = list(set(idx) - set(idx1))\n",
226 | " IDs_val.extend(idx1)\n",
227 | " IDs_test.extend(idx2)\n",
228 | " y_val.extend(len(idx1) * [label])\n",
229 | " y_test.extend(len(idx2) * [label])\n",
230 | "\n",
231 | " print(label, y_test0.count(label))\n",
232 | " \n",
233 | "X_test = X_test0[IDs_test,:,:]\n",
234 | "X_val = X_test0[IDs_val,:,:]"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 9,
240 | "metadata": {},
241 | "outputs": [
242 | {
243 | "name": "stdout",
244 | "output_type": "stream",
245 | "text": [
246 | "(88, 144, 963) (85, 144, 963)\n",
247 | "88 85\n"
248 | ]
249 | }
250 | ],
251 | "source": [
252 | "print(X_test.shape, X_val.shape)\n",
253 | "print(len(y_test), len(y_val))"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "## Save pre-processed data as numpy files"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 10,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "dataset_name = 'PEMS_'\n",
270 | "\n",
271 | "output_path = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\processed'\n",
272 | "np.save(os.path.join(output_path, dataset_name + 'X_train.npy'), X_train)\n",
273 | "np.save(os.path.join(output_path, dataset_name + 'X_val.npy'), X_val)\n",
274 | "np.save(os.path.join(output_path, dataset_name + 'X_test.npy'), X_test)\n",
275 | "np.save(os.path.join(output_path, dataset_name + 'y_train.npy'), y_train)\n",
276 | "np.save(os.path.join(output_path, dataset_name + 'y_val.npy'), y_val)\n",
277 | "np.save(os.path.join(output_path, dataset_name + 'y_test.npy'), y_test)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": []
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "## Or: Create new split of data ?"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": []
300 | }
301 | ],
302 | "metadata": {
303 | "kernelspec": {
304 | "display_name": "Python 3",
305 | "language": "python",
306 | "name": "python3"
307 | },
308 | "language_info": {
309 | "codemirror_mode": {
310 | "name": "ipython",
311 | "version": 3
312 | },
313 | "file_extension": ".py",
314 | "mimetype": "text/x-python",
315 | "name": "python",
316 | "nbconvert_exporter": "python",
317 | "pygments_lexer": "ipython3",
318 | "version": "3.6.5"
319 | }
320 | },
321 | "nbformat": 4,
322 | "nbformat_minor": 2
323 | }
324 |
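The per-label half split performed above (validation vs. test) follows a simple recipe that could be factored into a helper. A minimal sketch (not a cell from the notebook), assuming `y` is a list or 1-D array of labels:

```python
import numpy as np

def stratified_half_split(X, y, seed=1):
    """Split (X, y) into two halves, keeping class proportions roughly equal.

    Mirrors the loop above: for every label, half of its indices (rounded
    down) go to the first part, the remaining indices to the second part.
    """
    rng = np.random.RandomState(seed)
    y = np.asarray(y)
    idx_a, idx_b = [], []
    for label in np.unique(y):
        idx = np.where(y == label)[0]
        chosen = rng.choice(idx, len(idx) // 2, replace=False)
        idx_a.extend(chosen)
        idx_b.extend(np.setdiff1d(idx, chosen))
    return (X[idx_a], y[idx_a]), (X[idx_b], y[idx_b])

# e.g. (X_val, y_val), (X_test, y_test) = stratified_half_split(X_test0, y_test0)
```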
--------------------------------------------------------------------------------
/notebooks/experiments/dataset_phoneme_prepare.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Dataset: PhonemeSpectra\n",
9 | "\n",
10 | "http://www.timeseriesclassification.com/description.php?Dataset=PhonemeSpectra\n",
11 | "\n",
12 | "### Info from data source:\n",
13 | "Phoneme Description:\n",
14 | "This data set is a multivaritate representation of a subset of the data used in the paper Dual-domain Hierarchical Classification of Phonetic Time Series. \n",
15 | "In the case of the raw data.\n",
16 | "Each series was extracted from the segmented audio collected from Google Translate\n",
17 | "Audio files collected from Google translate are recorded at 22050\n",
18 | "The speakers are male and female.\n",
19 | "After data collection, they segment waveforms of the words to generate phonemes using the Forced Aligner tool from the Penn Phonetics Laboratory.\n",
20 | "A Spectrogram of each instance was then created with a window size of 0.001 seconds and an overlap of 90%.\n",
21 | "Each instance in this multivariate dataset is arranged such that each dimension is a frequency band from the spectrogram.\n",
22 | "The data consists of 39 classes each with 170 instances. \n",
23 | "\n",
24 | "Phoneme Refference:\n",
25 | "Publication: Hamooni H, Mueen A. Dual-domain hierarchical classification of phonetic time series. InData Mining (ICDM), 2014 IEEE International Conference on 2014 Dec 14 (pp. 160-169). IEEE.\n",
26 | "\n",
27 | "\n",
28 | "### Size:\n",
29 | "+ Training samples: 3315\n",
30 | "+ Test sampels: 3353\n",
31 | "+ Dimension: 217 timepoints x 11 channels\n",
32 | "+ Classes: 39\n"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 1,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "import numpy as np\n",
42 | "import os\n",
43 | "import sys\n",
44 | "import pandas as pd\n",
45 | "\n",
46 | "CODE = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\mcfly\\\\mcfly'\n",
47 | "DATA = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\PhonemeSpectra'\n",
48 | "sys.path.append(CODE)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 2,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "file_train = os.path.join(DATA, 'PhonemeSpectra_TRAIN.arff')\n",
58 | "file_test = os.path.join(DATA, 'PhonemeSpectra_TEST.arff')"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 3,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "def load_arff(filename):\n",
68 | " start = 0\n",
69 | "\n",
70 | " data = []\n",
71 | " labels = []\n",
72 | " start_line = 0\n",
73 | " with open(filename) as fp:\n",
74 | " line = fp.readline()\n",
75 | " count = 0\n",
76 | " while line:\n",
77 | " if start == 1:\n",
78 | " label = line.split(\"',\")[-1]\n",
79 | " labels.append(label.replace('\\n', ''))\n",
80 | " line = line.split(\"',\")[0] \n",
81 | " lines = line.split('\\\\n')\n",
82 | " data_line = []\n",
83 | " for l in lines:\n",
84 | " data_line_sub = []\n",
85 | " #for entry in l.split(','):\n",
86 | " #data_line_sub.append(entry.replace(\"'\", \"\"))\n",
87 | " #data_line.append(data_line_sub)\n",
88 | " data_line.append([x.replace(\"'\", \"\") for x in l.split(',')])\n",
89 | " data.append(data_line)\n",
90 | "\n",
91 | " if line.startswith('@data'):\n",
92 | " start_line = count\n",
93 | " #print(\"Actual data start in line\", start_line)\n",
94 | " start = 1\n",
95 | "\n",
96 | " line = fp.readline()\n",
97 | " count += 1\n",
98 | " \n",
99 | " return np.swapaxes(np.array(data).astype(float), 1,2), labels\n",
100 | "\n",
101 | "X_train, y_train = load_arff(file_train)\n",
102 | "X_test0, y_test0 = load_arff(file_test)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 4,
108 | "metadata": {},
109 | "outputs": [
110 | {
111 | "name": "stdout",
112 | "output_type": "stream",
113 | "text": [
114 | "X_train.shape (3315, 217, 11)\n",
115 | "3315\n",
116 | "X_test.shape (3353, 217, 11)\n",
117 | "3353\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "print(\"X_train.shape\", X_train.shape)\n",
123 | "print(len(y_train))\n",
124 | "\n",
125 | "print(\"X_test.shape\", X_test0.shape)\n",
126 | "print(len(y_test0))"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 5,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "data": {
136 | "text/plain": [
137 | "numpy.float64"
138 | ]
139 | },
140 | "execution_count": 5,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "type(X_train[0,0,0])"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 6,
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "data": {
156 | "text/plain": [
157 | "array([ 0.60185 , 0.10432 , 0.67014 , 0.15635 , 0.95577 , 2.4809 ,\n",
158 | " 3.5833 , 4.7018 , 1.1286 , 7.2648 , 7.1282 , 5.3625 ,\n",
159 | " 4.6666 , 3.4076 , 3.4368 , 3.1312 , 0.6371 , 7.2779 ,\n",
160 | " 10.702 , 9.528 , 8.9655 , 5.3169 , 2.2338 , 0.31894 ,\n",
161 | " 1.8213 , 6.7641 , 10.24 , 9.8695 , 7.5672 , 2.3384 ,\n",
162 | " 3.8596 , 7.3618 , 3.3751 , 2.6142 , 2.605 , 5.3137 ,\n",
163 | " 5.4795 , 1.0734 , 1.0891 , 3.0922 , 2.4679 , 0.091312,\n",
164 | " 2.8001 , 6.1137 , 4.8455 , 0.27992 , 3.3654 , 7.6773 ,\n",
165 | " 9.0268 , 12.636 , 12.903 , 8.7211 , 8.656 , 9.1178 ,\n",
166 | " 5.2904 , 3.632 , 6.6237 , 6.1359 , 5.684 , 5.1734 ,\n",
167 | " 5.4562 , 5.3652 , 5.2969 , 4.7929 , 8.4382 , 9.1113 ,\n",
168 | " 2.4906 , 1.5931 , 1.2522 , 5.8437 , 8.9623 , 5.8633 ,\n",
169 | " 4.0618 , 2.3871 , 0.9758 , 0.74115 , 0.95252 , 2.296 ,\n",
170 | " 2.6277 , 3.1806 , 5.8372 , 7.1867 , 6.8454 , 7.0274 ,\n",
171 | " 7.4567 , 7.5445 , 8.2956 , 8.7867 , 9.5703 , 9.6614 ,\n",
172 | " 6.9689 , 6.7347 , 6.712 , 6.6502 , 5.4763 , 7.4989 ,\n",
173 | " 10.647 , 10.585 , 13.574 , 12.874 , 9.4045 , 4.3024 ,\n",
174 | " 2.3578 , 2.7995 , 9.5959 , 11.085 , 14.688 , 23.92 ,\n",
175 | " 24.843 , 15.833 , 15.885 , 10.116 , 0.064775, 0.21435 ,\n",
176 | " 2.4158 , 4.9229 , 11.127 , 10.916 , 9.921 , 15.599 ,\n",
177 | " 6.9587 , 2.6044 , 7.4561 , 4.5035 , 1.6945 , 2.1947 ,\n",
178 | " 0.38345 , 3.7951 , 4.4358 , 3.7496 , 3.6163 , 0.075052,\n",
179 | " 1.5281 , 5.4953 , 10.812 , 5.4921 , 4.9458 , 2.1915 ,\n",
180 | " 4.9495 , 9.0468 , 2.0749 , 6.2465 , 5.8107 , 2.6565 ,\n",
181 | " 1.4825 , 4.6081 , 12.673 , 11.932 , 2.3514 , 5.2352 ,\n",
182 | " 6.3863 , 7.3098 , 5.3262 , 3.6293 , 15.082 , 16.552 ,\n",
183 | " 15.859 , 14.896 , 12.448 , 6.0291 , 7.4729 , 9.2094 ,\n",
184 | " 9.9443 , 11.121 , 13.275 , 12.555 , 10.507 , 9.9053 ,\n",
185 | " 10.276 , 14.081 , 14.197 , 13.947 , 14.377 , 12.884 ,\n",
186 | " 3.0764 , 1.0143 , 0.20187 , 1.1606 , 4.1881 , 3.5605 ,\n",
187 | " 3.5995 , 6.318 , 11.144 , 24.688 , 17.897 , 13.641 ,\n",
188 | " 22.681 , 14.932 , 3.3139 , 7.0139 , 7.6578 , 2.8255 ,\n",
189 | " 12.825 , 14.344 , 13.261 , 17.232 , 5.0401 , 1.6977 ,\n",
190 | " 7.4301 , 1.7822 , 2.127 , 10.519 , 7.9993 , 0.14984 ,\n",
191 | " 4.3669 , 3.8276 , 7.9119 , 2.5725 , 3.0662 , 8.3439 ,\n",
192 | " 14.206 , 8.643 , 0.57531 , 1.9058 , 13.368 , 12.279 ,\n",
193 | " 1.3264 ])"
194 | ]
195 | },
196 | "execution_count": 6,
197 | "metadata": {},
198 | "output_type": "execute_result"
199 | }
200 | ],
201 | "source": [
202 | "X_train[0,:,10]"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "### Split test into test and validation:"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 7,
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "name": "stdout",
219 | "output_type": "stream",
220 | "text": [
221 | "UW 86\n",
222 | "L 86\n",
223 | "EH 86\n",
224 | "Y 86\n",
225 | "JH 86\n",
226 | "UH 86\n",
227 | "SH 86\n",
228 | "K 86\n",
229 | "AY 86\n",
230 | "P 86\n",
231 | "F 86\n",
232 | "D 86\n",
233 | "ER 86\n",
234 | "DH 86\n",
235 | "R 86\n",
236 | "Z 86\n",
237 | "M 86\n",
238 | "IH 86\n",
239 | "IY 86\n",
240 | "AE 86\n",
241 | "ZH 85\n",
242 | "OY 86\n",
243 | "EY 86\n",
244 | "N 86\n",
245 | "NG 86\n",
246 | "V 86\n",
247 | "G 86\n",
248 | "AA 86\n",
249 | "TH 86\n",
250 | "AH 86\n",
251 | "AW 86\n",
252 | "B 86\n",
253 | "HH 86\n",
254 | "CH 86\n",
255 | "T 86\n",
256 | "S 86\n",
257 | "OW 86\n",
258 | "AO 86\n",
259 | "W 86\n"
260 | ]
261 | }
262 | ],
263 | "source": [
264 | "y_val = []\n",
265 | "y_test = []\n",
266 | "IDs_val = []\n",
267 | "IDs_test = []\n",
268 | "\n",
269 | "np.random.seed(1)\n",
270 | "for label in list(set(y_test0)):\n",
271 | " idx = np.where(np.array(y_test0) == label)[0]\n",
272 | " idx1 = np.random.choice(idx, len(idx)//2, replace=False)\n",
273 | " idx2 = list(set(idx) - set(idx1))\n",
274 | " IDs_val.extend(idx1)\n",
275 | " IDs_test.extend(idx2)\n",
276 | " y_val.extend(len(idx1) * [label])\n",
277 | " y_test.extend(len(idx2) * [label])\n",
278 | "\n",
279 | " print(label, y_test0.count(label))\n",
280 | " \n",
281 | "X_test = X_test0[IDs_test,:,:]\n",
282 | "X_val = X_test0[IDs_val,:,:]"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 8,
288 | "metadata": {},
289 | "outputs": [
290 | {
291 | "name": "stdout",
292 | "output_type": "stream",
293 | "text": [
294 | "(1677, 217, 11) (1676, 217, 11)\n",
295 | "1677 1676\n"
296 | ]
297 | }
298 | ],
299 | "source": [
300 | "print(X_test.shape, X_val.shape)\n",
301 | "print(len(y_test), len(y_val))"
302 | ]
303 | },
304 | {
305 | "cell_type": "markdown",
306 | "metadata": {},
307 | "source": [
308 | "## Save pre-processed data as numpy files"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 10,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "dataset_name = 'PhenomeSpectra_'\n",
318 | "\n",
319 | "output_path = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\processed'\n",
320 | "np.save(os.path.join(output_path, dataset_name + 'X_train.npy'), X_train)\n",
321 | "np.save(os.path.join(output_path, dataset_name + 'X_val.npy'), X_val)\n",
322 | "np.save(os.path.join(output_path, dataset_name + 'X_test.npy'), X_test)\n",
323 | "np.save(os.path.join(output_path, dataset_name + 'y_train.npy'), y_train)\n",
324 | "np.save(os.path.join(output_path, dataset_name + 'y_val.npy'), y_val)\n",
325 | "np.save(os.path.join(output_path, dataset_name + 'y_test.npy'), y_test)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": []
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "## Or: Create new split of data ?"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": []
348 | }
349 | ],
350 | "metadata": {
351 | "kernelspec": {
352 | "display_name": "Python 3",
353 | "language": "python",
354 | "name": "python3"
355 | },
356 | "language_info": {
357 | "codemirror_mode": {
358 | "name": "ipython",
359 | "version": 3
360 | },
361 | "file_extension": ".py",
362 | "mimetype": "text/x-python",
363 | "name": "python",
364 | "nbconvert_exporter": "python",
365 | "pygments_lexer": "ipython3",
366 | "version": "3.6.5"
367 | }
368 | },
369 | "nbformat": 4,
370 | "nbformat_minor": 2
371 | }
372 |
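The arrays saved above can be read back with `np.load`. A minimal sketch (not a cell from the notebook), assuming the same `output_path` and `dataset_name` (kept spelled 'PhenomeSpectra_' as in the save cell):

```python
import os
import numpy as np

output_path = 'C:\\OneDrive - Netherlands eScience Center\\Project_mcfly\\data\\processed'
dataset_name = 'PhenomeSpectra_'  # spelled as in the save cell above

def load_split(part):
    """Load one split ('train', 'val' or 'test') written by this notebook."""
    X = np.load(os.path.join(output_path, dataset_name + 'X_' + part + '.npy'))
    y = np.load(os.path.join(output_path, dataset_name + 'y_' + part + '.npy'))
    return X, y

X_train, y_train = load_split('train')
X_val, y_val = load_split('val')
X_test, y_test = load_split('test')
```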
--------------------------------------------------------------------------------
/notebooks/experiments/dataset_rackets_prepare.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "attachments": {},
5 | "cell_type": "markdown",
6 | "metadata": {},
7 | "source": [
8 | "# Dataset: RacketSports\n",
9 | "\n",
10 | "http://www.timeseriesclassification.com/description.php?Dataset=RacketSports\n",
11 | "\n",
12 | "### Info from data source:\n",
13 | "The data was created by university students plyaing badminton or squash whilst wearing a smart watch (Sony Smart watch 35). The watch relayed the x-y-z coordinates for\n",
14 | "both the gyroscope and accelerometer to an android phone (One Plus 56). The phone\n",
15 | "wrote these values to an Attribute-Relation File Format (arff) file using an app developed\n",
16 | "by a UEA computer science masters student. The problem is to identify which sport and which stroke the players are making. The data was collected at a rate of 10 HZ over 3 seconds whilst the player played\n",
17 | "either a forehand/backhand in squash or a clear/smash in badminton.\n",
18 | "The data was collected as part of an undergraduate project by Phillip Perks in 2017/18.\n",
19 | "\n",
20 | "### Size:\n",
21 | "+ Training samples: 151\t\n",
22 | "+ Test sampels: 152\n",
23 | "+ Dimension: 30 timepoints x 6 channels\n",
24 | "+ Classes: 4\n"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 66,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import numpy as np\n",
34 | "import os\n",
35 | "import sys\n",
36 | "import pandas as pd\n",
37 | "\n",
38 | "CODE = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\mcfly\\\\mcfly'\n",
39 | "DATA = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\RacketSports'\n",
40 | "sys.path.append(CODE)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 6,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "file_train = os.path.join(DATA, 'RacketSports_TRAIN.arff')\n",
50 | "file_test = os.path.join(DATA, 'RacketSports_TEST.arff')"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 34,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "def load_racket_arff(filename):\n",
60 | " start = 0\n",
61 | "\n",
62 | " data = []\n",
63 | " labels = []\n",
64 | " start_line = 0\n",
65 | " with open(filename) as fp:\n",
66 | " line = fp.readline()\n",
67 | " count = 0\n",
68 | " while line:\n",
69 | " if start == 1:\n",
70 | " lines = line.split('\\\\n')\n",
71 | " data_line = []\n",
72 | " for l in lines:\n",
73 | " data_line_sub = []\n",
74 | " for entry in l.split(','):\n",
75 | " if entry.startswith('B') or entry.startswith('S'):\n",
76 | " labels.append(entry.replace(\"'\", \"\").replace('\\n', ''))\n",
77 | " else:\n",
78 | " data_line_sub.append(entry.replace(\"'\", \"\"))\n",
79 | " data_line.append(data_line_sub)\n",
80 | " data.append(data_line)\n",
81 | "\n",
82 | " if line.startswith('@data'):\n",
83 | " start_line = count\n",
84 | " #print(\"Actual data start in line\", start_line)\n",
85 | " start = 1\n",
86 | "\n",
87 | " line = fp.readline()\n",
88 | " count += 1\n",
89 | " \n",
90 | " return np.swapaxes(np.array(data), 1,2), labels\n",
91 | "\n",
92 | "X_train, y_train = load_racket_arff(file_train)\n",
93 | "X_test0, y_test0 = load_racket_arff(file_test)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 35,
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "name": "stdout",
103 | "output_type": "stream",
104 | "text": [
105 | "X_train.shape (151, 30, 6)\n",
106 | "151\n",
107 | "X_test.shape (152, 30, 6)\n",
108 | "152\n"
109 | ]
110 | }
111 | ],
112 | "source": [
113 | "print(\"X_train.shape\", X_train.shape)\n",
114 | "print(len(y_train))\n",
115 | "\n",
116 | "print(\"X_test.shape\", X_test0.shape)\n",
117 | "print(len(y_test0))"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "### Split test into test and validation:"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 42,
130 | "metadata": {},
131 | "outputs": [
132 | {
133 | "name": "stdout",
134 | "output_type": "stream",
135 | "text": [
136 | "[39 12 9 36 11 30 10 13 1 22 16 31 38 24 23 33 26 34 28 18] [0, 32, 2, 3, 4, 5, 6, 7, 8, 35, 37, 14, 15, 17, 19, 20, 21, 25, 27, 29]\n"
137 | ]
138 | }
139 | ],
140 | "source": [
141 | "IDs1 = np.random.choice(idx, len(idx) //2, replace=False)\n",
142 | "IDs2 = list(set(idx) - set(IDs1))\n",
143 | "print(IDs1, IDs2)"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 48,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "Squash_BackhandBoast 34\n",
156 | "Squash_ForehandBoast 35\n",
157 | "Badminton_Smash 40\n",
158 | "Badminton_Clear 43\n"
159 | ]
160 | }
161 | ],
162 | "source": [
163 | "y_val = []\n",
164 | "y_test = []\n",
165 | "IDs_val = []\n",
166 | "IDs_test = []\n",
167 | "\n",
168 | "for label in list(set(y_test0)):\n",
169 | " idx = np.where(np.array(y_test0) == label)[0]\n",
170 | " idx1 = np.random.choice(idx, len(idx)//2, replace=False)\n",
171 | " idx2 = list(set(idx) - set(idx1))\n",
172 | " IDs_val.extend(idx1)\n",
173 | " IDs_test.extend(idx2)\n",
174 | " y_val.extend(len(idx1) * [label])\n",
175 | " y_test.extend(len(idx2) * [label])\n",
176 | "\n",
177 | " print(label, y_test0.count(label))\n",
178 | " \n",
179 | "X_test = X_test0[IDs_test,:,:]\n",
180 | "X_val = X_test0[IDs_val,:,:]"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 54,
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "(77, 30, 6) (75, 30, 6)\n",
193 | "77 75\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "print(X_test.shape, X_val.shape)\n",
199 | "print(len(y_test), len(y_val))"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "## Save pre-processed data as numpy files"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 65,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "dataset_name = 'RacketSports_'\n",
216 | "\n",
217 | "output_path = 'C:\\\\OneDrive - Netherlands eScience Center\\\\Project_mcfly\\\\data\\\\processed'\n",
218 | "np.save(os.path.join(output_path, dataset_name + 'X_train.npy'), X_train)\n",
219 | "np.save(os.path.join(output_path, dataset_name + 'X_val.npy'), X_val)\n",
220 | "np.save(os.path.join(output_path, dataset_name + 'X_test.npy'), X_test)\n",
221 | "np.save(os.path.join(output_path, dataset_name + 'y_train.npy'), y_train)\n",
222 | "np.save(os.path.join(output_path, dataset_name + 'y_val.npy'), y_val)\n",
223 | "np.save(os.path.join(output_path, dataset_name + 'y_test.npy'), y_test)"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": []
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 61,
236 | "metadata": {},
237 | "outputs": [
238 | {
239 | "data": {
240 | "text/plain": [
241 | "((30,), array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,\n",
242 | " 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,\n",
243 | " 2.7, 2.8, 2.9, 3. ]))"
244 | ]
245 | },
246 | "execution_count": 61,
247 | "metadata": {},
248 | "output_type": "execute_result"
249 | }
250 | ],
251 | "source": [
252 | "time_axis = np.linspace( 0.1, 3, 30)\n",
253 | "time_axis.shape, time_axis"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "## Or: Create new split of data ?"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 30,
266 | "metadata": {},
267 | "outputs": [
268 | {
269 | "name": "stdout",
270 | "output_type": "stream",
271 | "text": [
272 | "(303, 30, 6)\n",
273 | "303\n"
274 | ]
275 | }
276 | ],
277 | "source": [
278 | "X_data = np.concatenate((X_train, X_val), axis=0)\n",
279 | "print(X_data.shape)\n",
280 | "\n",
281 | "y_data = y_train.copy()\n",
282 | "y_data.extend(y_val)\n",
283 | "print(len(y_data))"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 31,
289 | "metadata": {},
290 | "outputs": [
291 | {
292 | "name": "stdout",
293 | "output_type": "stream",
294 | "text": [
295 | "Squash_BackhandBoast 68\n",
296 | "Squash_ForehandBoast 70\n",
297 | "Badminton_Smash 79\n",
298 | "Badminton_Clear 86\n"
299 | ]
300 | }
301 | ],
302 | "source": [
303 | "split = [0.6, 0.2, 0.2]\n",
304 | "\n",
305 | "for label in list(set(y_data)):\n",
306 | " idx = np.where(np.array(y_data) == label)[0]\n",
307 | " \n",
308 | " #print(label, np.where(np.array(y_val) == label)[0].shape)\n",
309 | " print(label, y_data.count(label))"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": null,
315 | "metadata": {},
316 | "outputs": [],
317 | "source": []
318 | }
319 | ],
320 | "metadata": {
321 | "kernelspec": {
322 | "display_name": "Python 3",
323 | "language": "python",
324 | "name": "python3"
325 | },
326 | "language_info": {
327 | "codemirror_mode": {
328 | "name": "ipython",
329 | "version": 3
330 | },
331 | "file_extension": ".py",
332 | "mimetype": "text/x-python",
333 | "name": "python",
334 | "nbconvert_exporter": "python",
335 | "pygments_lexer": "ipython3",
336 | "version": "3.6.5"
337 | }
338 | },
339 | "nbformat": 4,
340 | "nbformat_minor": 2
341 | }
342 |
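The final cells define `split = [0.6, 0.2, 0.2]` but leave the new split unfinished. One way it could be completed, as a sketch (not a cell from the notebook) assuming `X_data` and `y_data` are the pooled arrays built above:

```python
import numpy as np

split = [0.6, 0.2, 0.2]          # train / val / test fractions (assumption: per class)
rng = np.random.RandomState(1)

y_arr = np.array(y_data)
train_idx, val_idx, test_idx = [], [], []
for label in np.unique(y_arr):
    idx = rng.permutation(np.where(y_arr == label)[0])
    n_train = int(split[0] * len(idx))
    n_val = int(split[1] * len(idx))
    train_idx.extend(idx[:n_train])
    val_idx.extend(idx[n_train:n_train + n_val])
    test_idx.extend(idx[n_train + n_val:])

X_train_new, y_train_new = X_data[train_idx], y_arr[train_idx]
X_val_new, y_val_new = X_data[val_idx], y_arr[val_idx]
X_test_new, y_test_new = X_data[test_idx], y_arr[test_idx]
```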
--------------------------------------------------------------------------------
/notebooks/experiments/deeplearning_guinneabissau.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# EEG data classification Guinnea Bissau"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This notebook contains experiments with an EEG dataset. The classes are Epilepsy: 0 or Control 1."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {
20 | "collapsed": false
21 | },
22 | "source": [
23 | "Load dependences and setting output configuration"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 1,
29 | "metadata": {
30 | "collapsed": false
31 | },
32 | "outputs": [
33 | {
34 | "name": "stderr",
35 | "output_type": "stream",
36 | "text": [
37 | "Using Theano backend.\n"
38 | ]
39 | },
40 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "Populating the interactive namespace from numpy and matplotlib\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "import numpy as np\n",
50 | "from keras.utils.np_utils import to_categorical\n",
51 | "import keras\n",
52 | "%pylab inline\n",
53 | "%load_ext autoreload\n",
54 | "%autoreload 2"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "## Load data from npy files"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "Specify location of npy files:"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 2,
74 | "metadata": {
75 | "collapsed": true
76 | },
77 | "outputs": [],
78 | "source": [
79 | "datapath = '/media/windows-share/EEGs_Guinea-Bissau_np/'\n",
80 | "#datapath = '/media/sf_VBox_Shared/timeseries/EEGs_Guinea-Bissau_np/'#"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "Load data stored in 10 seconds at 128 Hertz corresponding to the experiment where the participant had the eyes closed:"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 3,
93 | "metadata": {
94 | "collapsed": false
95 | },
96 | "outputs": [],
97 | "source": [
98 | "condition = '_10seconds_closed.npy'\n",
99 | "X_train = np.load(datapath+'X_train'+condition)\n",
100 | "y_train = np.load(datapath+'y_train'+condition)\n",
101 | "X_val = np.load(datapath+'X_valid'+condition)\n",
102 | "y_val = np.load(datapath+'y_valid'+condition)\n",
103 | "X_test = np.load(datapath+'X_test'+condition)\n",
104 | "y_test = np.load(datapath+'y_test'+condition)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 4,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "{'Epilepsy': 0, 'Control': 1}\n"
119 | ]
120 | }
121 | ],
122 | "source": [
123 | "classlabels = list(set(y_train))\n",
124 | "mapclasses = {classlabels[i] : i for i in range(len(classlabels))}\n",
125 | "print(mapclasses)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 5,
131 | "metadata": {
132 | "collapsed": false
133 | },
134 | "outputs": [],
135 | "source": [
136 | "y_train = np.array([mapclasses[c] for c in y_train], dtype='int')\n",
137 | "y_val = np.array([mapclasses[c] for c in y_test], dtype='int')\n",
138 | "y_test = np.array([mapclasses[c] for c in y_test], dtype='int')\n",
139 | "y_train_binary = to_categorical(y_train)\n",
140 | "y_val_binary = to_categorical(y_val)\n",
141 | "y_test_binary = to_categorical(y_test)"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 6,
147 | "metadata": {
148 | "collapsed": false,
149 | "scrolled": true
150 | },
151 | "outputs": [
152 | {
153 | "data": {
154 | "text/plain": [
155 | "array([[ 1., 0.],\n",
156 | " [ 1., 0.],\n",
157 | " [ 1., 0.],\n",
158 | " [ 0., 1.],\n",
159 | " [ 1., 0.],\n",
160 | " [ 1., 0.],\n",
161 | " [ 1., 0.],\n",
162 | " [ 0., 1.],\n",
163 | " [ 0., 1.],\n",
164 | " [ 1., 0.],\n",
165 | " [ 1., 0.],\n",
166 | " [ 1., 0.],\n",
167 | " [ 1., 0.],\n",
168 | " [ 1., 0.],\n",
169 | " [ 0., 1.],\n",
170 | " [ 0., 1.],\n",
171 | " [ 0., 1.],\n",
172 | " [ 0., 1.],\n",
173 | " [ 0., 1.],\n",
174 | " [ 0., 1.]])"
175 | ]
176 | },
177 | "execution_count": 6,
178 | "metadata": {},
179 | "output_type": "execute_result"
180 | }
181 | ],
182 | "source": [
183 | "y_val_binary"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 8,
189 | "metadata": {
190 | "collapsed": false
191 | },
192 | "outputs": [
193 | {
194 | "data": {
195 | "text/plain": [
196 | "(108,)"
197 | ]
198 | },
199 | "execution_count": 8,
200 | "metadata": {},
201 | "output_type": "execute_result"
202 | }
203 | ],
204 | "source": [
205 | "y_train.shape"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {
212 | "collapsed": true
213 | },
214 | "outputs": [],
215 | "source": []
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {
221 | "collapsed": true
222 | },
223 | "outputs": [],
224 | "source": []
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "collapsed": true
231 | },
232 | "outputs": [],
233 | "source": []
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {
239 | "collapsed": true
240 | },
241 | "outputs": [],
242 | "source": []
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {
248 | "collapsed": true
249 | },
250 | "outputs": [],
251 | "source": []
252 | }
253 | ],
254 | "metadata": {
255 | "kernelspec": {
256 | "display_name": "Python 3",
257 | "language": "python",
258 | "name": "python3"
259 | },
260 | "language_info": {
261 | "codemirror_mode": {
262 | "name": "ipython",
263 | "version": 3
264 | },
265 | "file_extension": ".py",
266 | "mimetype": "text/x-python",
267 | "name": "python",
268 | "nbconvert_exporter": "python",
269 | "pygments_lexer": "ipython3",
270 | "version": "3.5.2"
271 | }
272 | },
273 | "nbformat": 4,
274 | "nbformat_minor": 0
275 | }
276 |
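The class-mapping and `to_categorical` steps above boil down to the following. A plain-numpy sketch with toy labels (not a cell from the notebook, and not tied to the EEG files):

```python
import numpy as np

y = np.array(['Epilepsy', 'Control', 'Control', 'Epilepsy'])

classlabels = sorted(set(y))                   # fix an ordering, e.g. ['Control', 'Epilepsy']
mapclasses = {c: i for i, c in enumerate(classlabels)}
y_int = np.array([mapclasses[c] for c in y])   # e.g. [1, 0, 0, 1]

# One-hot encoding: row i has a 1 in column y_int[i] and zeros elsewhere
y_binary = np.zeros((len(y_int), len(classlabels)))
y_binary[np.arange(len(y_int)), y_int] = 1
```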
--------------------------------------------------------------------------------
/notebooks/experiments/experiment_PAMAP2_9fold_small.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Experiment PAMAP2 with mcfly"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This experiment finds an optimal model for the PAMAP2 dataset."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Import required Python modules"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {
28 | "collapsed": false,
29 | "scrolled": false
30 | },
31 | "outputs": [
32 | {
33 | "name": "stderr",
34 | "output_type": "stream",
35 | "text": [
36 | "Using Theano backend.\n"
37 | ]
38 | }
39 | ],
40 | "source": [
41 | "import sys\n",
42 | "import os\n",
43 | "sys.path.insert(0, os.path.abspath('../..'))\n",
44 | "import numpy as np\n",
45 | "import pandas as pd\n",
46 | "# mcfly\n",
47 | "from mcfly import tutorial_pamap2, modelgen, find_architecture, storage\n",
48 | "# Keras module is use for the deep learning\n",
49 | "import keras\n",
50 | "from keras.utils.np_utils import to_categorical\n",
51 | "from keras.models import Sequential\n",
52 | "from keras.layers import Dense, Activation, Convolution1D, Flatten, MaxPooling1D\n",
53 | "from keras.optimizers import Adam\n",
54 | "# We can set some backend options to avoid NaNs\n",
55 | "from keras import backend as K"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "## Load the data"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 4,
68 | "metadata": {
69 | "collapsed": false
70 | },
71 | "outputs": [],
72 | "source": [
73 | "datapath = '/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_9vars/'\n",
74 | "Xs = []\n",
75 | "ys = []\n",
76 | "\n",
77 | "ext = '.npy'\n",
78 | "for i in range(9):\n",
79 | " Xs.append(np.load(datapath+'X_'+str(i)+ext))\n",
80 | " ys.append(np.load(datapath+'y_'+str(i)+ext))"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 6,
86 | "metadata": {
87 | "collapsed": false
88 | },
89 | "outputs": [],
90 | "source": [
91 | "# Define directory where the results, e.g. json file, will be stored\n",
92 | "resultpath = '/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/results_tutorial/' "
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 7,
98 | "metadata": {
99 | "collapsed": false
100 | },
101 | "outputs": [],
102 | "source": [
103 | "modelname = 'my_bestmodel'\n",
104 | "model_reloaded = storage.loadmodel(resultpath,modelname)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 10,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [],
114 | "source": [
115 | "def split_train_test(X_list, y_list, j):\n",
116 | " X_train = np.concatenate(X_list[0:j]+X_list[j+1:])\n",
117 | " X_test = X_list[j]\n",
118 | " y_train = np.concatenate(y_list[0:j]+y_list[j+1:])\n",
119 | " y_test = y_list[j]\n",
120 | " return X_train, y_train, X_test, y_test\n",
121 | "\n",
122 | "def split_train_small_val(X_list, y_list, j, trainsize=500, valsize=500):\n",
123 | " X = np.concatenate(X_list[0:j]+X_list[j+1:])\n",
124 | " y = np.concatenate(y_list[0:j]+y_list[j+1:])\n",
125 | " rand_ind = np.random.choice(X.shape[0], trainsize+valsize, replace=False)\n",
126 | " X_train = X[rand_ind[:trainsize]]\n",
127 | " y_train = y[rand_ind[:trainsize]]\n",
128 | " X_val = X[rand_ind[trainsize:]]\n",
129 | " y_val = y[rand_ind[trainsize:]]\n",
130 | " return X_train, y_train, X_val, y_val"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 11,
136 | "metadata": {
137 | "collapsed": false
138 | },
139 | "outputs": [],
140 | "source": [
141 | "from keras.models import model_from_json\n",
142 | "\n",
143 | "def get_fresh_copy(model, lr):\n",
144 | " model_json = model.to_json()\n",
145 | " model_copy = model_from_json(model_json)\n",
146 | " model_copy.compile(loss='categorical_crossentropy',\n",
147 | " optimizer=Adam(lr=lr),\n",
148 | " metrics=['accuracy'])\n",
149 | " #for layer in model_copy.layers:\n",
150 | " # layer.build(layer.input_shape)\n",
151 | " return model_copy"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "## Train the best model for real"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "Now that we have identified the best model architecture out of our random pool of models we can continue by training the model on the full training sample. For the purpose of speeding up the example we only train the full model on the first 1000 values. You will need to replace this by 'datasize = X_train.shape[0]' in a real world example."
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 8,
171 | "metadata": {
172 | "collapsed": false
173 | },
174 | "outputs": [],
175 | "source": [
176 | "best_model = model_reloaded"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 16,
182 | "metadata": {
183 | "collapsed": false
184 | },
185 | "outputs": [],
186 | "source": [
187 | "import json\n",
188 | "with open(resultpath+'modelcomparison.json', 'r') as outfile:\n",
189 | " model_json = json.load(outfile)"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 20,
195 | "metadata": {
196 | "collapsed": false
197 | },
198 | "outputs": [],
199 | "source": [
200 | "best_params = model_json[0]"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 22,
206 | "metadata": {
207 | "collapsed": false
208 | },
209 | "outputs": [
210 | {
211 | "name": "stdout",
212 | "output_type": "stream",
213 | "text": [
214 | "Train on 14663 samples, validate on 2155 samples\n",
215 | "Epoch 1/2\n",
216 | "14663/14663 [==============================] - 490s - loss: 1.0568 - acc: 0.8655 - val_loss: 0.9868 - val_acc: 0.8297\n",
217 | "Epoch 2/2\n",
218 | "14663/14663 [==============================] - 523s - loss: 0.5656 - acc: 0.9360 - val_loss: 0.8527 - val_acc: 0.8278\n",
219 | "Train on 14528 samples, validate on 2290 samples\n",
220 | "Epoch 1/2\n",
221 | "14528/14528 [==============================] - 496s - loss: 1.0377 - acc: 0.8771 - val_loss: 0.8812 - val_acc: 0.7638\n",
222 | "Epoch 2/2\n",
223 | "14528/14528 [==============================] - 2782s - loss: 0.5823 - acc: 0.9290 - val_loss: 0.8818 - val_acc: 0.7258\n",
224 | "Train on 15344 samples, validate on 1474 samples\n",
225 | "Epoch 1/2\n",
226 | "15344/15344 [==============================] - 1015s - loss: 1.0461 - acc: 0.8672 - val_loss: 0.2425 - val_acc: 0.9512\n",
227 | "Epoch 2/2\n",
228 | "15344/15344 [==============================] - 518s - loss: 0.5721 - acc: 0.9327 - val_loss: 0.1883 - val_acc: 0.9478\n",
229 | "Train on 14799 samples, validate on 2019 samples\n",
230 | "Epoch 1/2\n",
231 | "14799/14799 [==============================] - 229s - loss: 1.0516 - acc: 0.8664 - val_loss: 0.6108 - val_acc: 0.8920\n",
232 | "Epoch 2/2\n",
233 | "14799/14799 [==============================] - 241s - loss: 0.6011 - acc: 0.9263 - val_loss: 0.4059 - val_acc: 0.9188\n",
234 | "Train on 14438 samples, validate on 2380 samples\n",
235 | "Epoch 1/2\n",
236 | "14438/14438 [==============================] - 240s - loss: 1.0530 - acc: 0.8686 - val_loss: 0.6165 - val_acc: 0.8597\n",
237 | "Epoch 2/2\n",
238 | "14438/14438 [==============================] - 261s - loss: 0.5826 - acc: 0.9341 - val_loss: 0.6550 - val_acc: 0.8122\n",
239 | "Train on 14639 samples, validate on 2179 samples\n",
240 | "Epoch 1/2\n",
241 | "14639/14639 [==============================] - 237s - loss: 1.0891 - acc: 0.8577 - val_loss: 0.4885 - val_acc: 0.9197\n",
242 | "Epoch 2/2\n",
243 | "14639/14639 [==============================] - 244s - loss: 0.6007 - acc: 0.9292 - val_loss: 0.3850 - val_acc: 0.9101\n",
244 | "Train on 14811 samples, validate on 2007 samples\n",
245 | "Epoch 1/2\n",
246 | "14811/14811 [==============================] - 244s - loss: 1.0595 - acc: 0.8606 - val_loss: 0.3250 - val_acc: 0.9482\n",
247 | "Epoch 2/2\n",
248 | "14811/14811 [==============================] - 278s - loss: 0.5904 - acc: 0.9294 - val_loss: 0.2464 - val_acc: 0.9307\n",
249 | "Train on 14543 samples, validate on 2275 samples\n",
250 | "Epoch 1/2\n",
251 | "14543/14543 [==============================] - 251s - loss: 1.0374 - acc: 0.8742 - val_loss: 1.8568 - val_acc: 0.5200\n",
252 | "Epoch 2/2\n",
253 | "14543/14543 [==============================] - 247s - loss: 0.5543 - acc: 0.9392 - val_loss: 2.5112 - val_acc: 0.4686\n",
254 | "Train on 16779 samples, validate on 39 samples\n",
255 | "Epoch 1/2\n",
256 | "16779/16779 [==============================] - 261s - loss: 1.0542 - acc: 0.8620 - val_loss: 0.1967 - val_acc: 0.9744\n",
257 | "Epoch 2/2\n",
258 | "16779/16779 [==============================] - 292s - loss: 0.5617 - acc: 0.9311 - val_loss: 0.1018 - val_acc: 0.9744\n"
259 | ]
260 | }
261 | ],
262 | "source": [
263 | "nr_epochs = 2\n",
264 | "\n",
265 | "np.random.seed(123)\n",
266 | "histories, test_accuracies_list, models = [], [], []\n",
267 | "for j in range(len(Xs)):\n",
268 | " X_train, y_train, X_test, y_test = split_train_test(Xs, ys, j)\n",
269 | " model_copy = get_fresh_copy(best_model, best_params['learning_rate'])\n",
270 | " datasize = X_train.shape[0]\n",
271 | " \n",
272 | " history = model_copy.fit(X_train[:datasize,:,:], y_train[:datasize,:],\n",
273 | " nb_epoch=nr_epochs, validation_data=(X_test, y_test))\n",
274 | " \n",
275 | " histories.append(history)\n",
276 | " test_accuracies_list.append(history.history['val_acc'][-1] )\n",
277 | " models.append(model_copy)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 23,
283 | "metadata": {
284 | "collapsed": false
285 | },
286 | "outputs": [
287 | {
288 | "name": "stdout",
289 | "output_type": "stream",
290 | "text": [
291 | "0.835116382719\n"
292 | ]
293 | },
294 | {
295 | "data": {
296 | "text/plain": [
297 | "[0.82784222737819024,\n",
298 | " 0.72576419198356856,\n",
299 | " 0.94776119402985071,\n",
300 | " 0.91877166914314012,\n",
301 | " 0.81218487384940397,\n",
302 | " 0.91005048189977267,\n",
303 | " 0.9307424017132131,\n",
304 | " 0.46857142858452849,\n",
305 | " 0.97435897588729858]"
306 | ]
307 | },
308 | "execution_count": 23,
309 | "metadata": {},
310 | "output_type": "execute_result"
311 | }
312 | ],
313 | "source": [
314 | "print(np.mean(test_accuracies_list))\n",
315 | "test_accuracies_list"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 24,
321 | "metadata": {
322 | "collapsed": false
323 | },
324 | "outputs": [
325 | {
326 | "name": "stdout",
327 | "output_type": "stream",
328 | "text": [
329 | "fold 0\n",
330 | "fold 1\n",
331 | "fold 2\n",
332 | "fold 3\n",
333 | "fold 4\n",
334 | "fold 5\n",
335 | "fold 6\n",
336 | "fold 7\n",
337 | "fold 8\n"
338 | ]
339 | }
340 | ],
341 | "source": [
342 | "# Calculate 1-NN for each fold:\n",
343 | "nr_epochs = 2\n",
344 | "\n",
345 | "np.random.seed(123)\n",
346 | "knn_test_accuracies_list = []\n",
347 | "for j in range(len(Xs)):\n",
348 | " print(\"fold \", j)\n",
349 | " X_train, y_train, X_test, y_test = split_train_test(Xs, ys, j)\n",
350 | " acc = find_architecture.kNN_accuracy(X_train, y_train, X_test, y_test, k=1)\n",
351 | " knn_test_accuracies_list.append(acc )"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 25,
357 | "metadata": {
358 | "collapsed": false
359 | },
360 | "outputs": [
361 | {
362 | "name": "stdout",
363 | "output_type": "stream",
364 | "text": [
365 | "0.53974709837\n"
366 | ]
367 | },
368 | {
369 | "data": {
370 | "text/html": [
371 | "\n",
372 | "
\n",
373 | " \n",
374 | " \n",
375 | " | \n",
376 | " CNN | \n",
377 | " kNN | \n",
378 | "
\n",
379 | " \n",
380 | " \n",
381 | " \n",
382 | " 0 | \n",
383 | " 0.827842 | \n",
384 | " 0.611601 | \n",
385 | "
\n",
386 | " \n",
387 | " 1 | \n",
388 | " 0.725764 | \n",
389 | " 0.610044 | \n",
390 | "
\n",
391 | " \n",
392 | " 2 | \n",
393 | " 0.947761 | \n",
394 | " 0.613976 | \n",
395 | "
\n",
396 | " \n",
397 | " 3 | \n",
398 | " 0.918772 | \n",
399 | " 0.523031 | \n",
400 | "
\n",
401 | " \n",
402 | " 4 | \n",
403 | " 0.812185 | \n",
404 | " 0.615966 | \n",
405 | "
\n",
406 | " \n",
407 | " 5 | \n",
408 | " 0.910050 | \n",
409 | " 0.523176 | \n",
410 | "
\n",
411 | " \n",
412 | " 6 | \n",
413 | " 0.930742 | \n",
414 | " 0.603886 | \n",
415 | "
\n",
416 | " \n",
417 | " 7 | \n",
418 | " 0.468571 | \n",
419 | " 0.371429 | \n",
420 | "
\n",
421 | " \n",
422 | " 8 | \n",
423 | " 0.974359 | \n",
424 | " 0.384615 | \n",
425 | "
\n",
426 | " \n",
427 | "
\n",
428 | "
"
429 | ],
430 | "text/plain": [
431 | " CNN kNN\n",
432 | "0 0.827842 0.611601\n",
433 | "1 0.725764 0.610044\n",
434 | "2 0.947761 0.613976\n",
435 | "3 0.918772 0.523031\n",
436 | "4 0.812185 0.615966\n",
437 | "5 0.910050 0.523176\n",
438 | "6 0.930742 0.603886\n",
439 | "7 0.468571 0.371429\n",
440 | "8 0.974359 0.384615"
441 | ]
442 | },
443 | "execution_count": 25,
444 | "metadata": {},
445 | "output_type": "execute_result"
446 | }
447 | ],
448 | "source": [
449 | "print(np.mean(knn_test_accuracies_list))\n",
450 | "accs_compared = pd.DataFrame({'CNN': test_accuracies_list, 'kNN':knn_test_accuracies_list})\n",
451 | "accs_compared"
452 | ]
453 | },
454 | {
455 | "cell_type": "markdown",
456 | "metadata": {},
457 | "source": [
458 | "### Saving, loading and comparing reloaded model with orignal model"
459 | ]
460 | },
461 | {
462 | "cell_type": "markdown",
463 | "metadata": {},
464 | "source": [
465 | "The modoel can be saved for future use. The savemodel function will save two separate files: a json file for the architecture and a npy (numpy array) file for the weights."
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": 26,
471 | "metadata": {
472 | "collapsed": true
473 | },
474 | "outputs": [],
475 | "source": [
476 | "modelname = 'my_bestmodel'"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 27,
482 | "metadata": {
483 | "collapsed": false
484 | },
485 | "outputs": [],
486 | "source": [
487 | "for i, model in enumerate(models):\n",
488 | " storage.savemodel(model,resultpath,modelname+str(i))"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {
495 | "collapsed": true
496 | },
497 | "outputs": [],
498 | "source": []
499 | }
500 | ],
501 | "metadata": {
502 | "anaconda-cloud": {},
503 | "kernelspec": {
504 | "display_name": "Python [conda env:mcfly]",
505 | "language": "python",
506 | "name": "conda-env-mcfly-py"
507 | },
508 | "language_info": {
509 | "codemirror_mode": {
510 | "name": "ipython",
511 | "version": 3
512 | },
513 | "file_extension": ".py",
514 | "mimetype": "text/x-python",
515 | "name": "python",
516 | "nbconvert_exporter": "python",
517 | "pygments_lexer": "ipython3",
518 | "version": "3.5.2"
519 | }
520 | },
521 | "nbformat": 4,
522 | "nbformat_minor": 1
523 | }
524 |
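Each per-fold model saved above can be restored later with the same `storage` module. A minimal sketch (not a cell from the notebook), using `resultpath` and `modelname` as defined in this notebook:

```python
from mcfly import storage

resultpath = '/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/results_tutorial/'
modelname = 'my_bestmodel'

# Reload the model trained with fold 3 held out (saved above as modelname + '3')
model_fold3 = storage.loadmodel(resultpath, modelname + '3')
model_fold3.summary()
```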
--------------------------------------------------------------------------------
/notebooks/experiments/experiment_skipconnections.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Exploration of skip connection implementation"
8 | ]
9 | },
10 | {
11 | "cell_type": "raw",
12 | "metadata": {},
13 | "source": [
14 | "In this notebook we explore how skip connections can be implemented. We use the PAMAP2 dataset/tutorial as test case.\n",
15 | "\n",
16 | "As an example dataset we use the publicly available [PAMAP2 dataset](https://archive.ics.uci.edu/ml/datasets/PAMAP2+Physical+Activity+Monitoring). It contains time series data from movement sensors worn by nine individuals. The data is labelled with the activity types that these individuals did and the aim is to train and evaluate a *classifier*.\n",
17 | "\n",
18 | "Before you can start, please make sure you install mcfly (see the [mcfly installation page](https://github.com/NLeSC/mcfly))."
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Import required Python modules"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 1,
31 | "metadata": {
32 | "scrolled": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stderr",
37 | "output_type": "stream",
38 | "text": [
39 | "Using TensorFlow backend.\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "import sys\n",
45 | "import os\n",
46 | "import numpy as np\n",
47 | "import pandas as pd\n",
48 | "# mcfly\n",
49 | "from mcfly import modelgen, find_architecture, storage\n",
50 | "from keras.models import load_model\n",
51 | "np.random.seed(2)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 2,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "sys.path.insert(0, os.path.abspath('../..'))\n",
61 | "from utils import tutorial_pamap2"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 |     "## Download pre-processed data"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 |     "We have created a function for you to fetch the preprocessed data from https://zenodo.org/record/834467. Please specify the `directory_to_extract_to` in the code below and then execute the cell. This will download the preprocessed data into a `data` subdirectory of that directory. The function returns the path where the preprocessed data is stored."
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "# Specify in which directory you want to store the data:\n",
85 | "directory_to_extract_to = '.'"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 4,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "name": "stdout",
95 | "output_type": "stream",
96 | "text": [
97 | "Downloading data...\n",
98 | "Extracting data...\n",
99 | "Done\n"
100 | ]
101 | }
102 | ],
103 | "source": [
104 | "data_path = tutorial_pamap2.download_preprocessed_data(directory_to_extract_to)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 5,
110 | "metadata": {},
111 | "outputs": [
112 | {
113 | "data": {
114 | "text/plain": [
115 | "'./data/PAMAP2/preprocessed'"
116 | ]
117 | },
118 | "execution_count": 5,
119 | "metadata": {},
120 | "output_type": "execute_result"
121 | }
122 | ],
123 | "source": [
124 | "data_path"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "## Load the pre-processed data"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 |     "Load the preprocessed data as stored in Numpy files. Please note that the data has already been split up into training (train), validation (val), and test subsets. It is common practice to call the input data X and the labels y."
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 6,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "X_train, y_train_binary, X_val, y_val_binary, X_test, y_test_binary, labels = tutorial_pamap2.load_data(data_path)"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 |     "Data X and labels y are of type Numpy array. In the cell below we inspect the shape of the data. As you can see, the shape of X is expressed as a Python tuple containing: the number of samples, the length of the time series, and the number of channels for each sample. Similarly, the shape of y represents the number of samples and the number of classes (unique labels). Note that y is a binary array in which only the correct class for each sample is assigned a 1. This is called one-hot encoding."
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 7,
160 | "metadata": {},
161 | "outputs": [
162 | {
163 | "name": "stdout",
164 | "output_type": "stream",
165 | "text": [
166 | "x shape: (11397, 512, 9)\n",
167 | "y shape: (11397, 7)\n"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "print('x shape:', X_train.shape)\n",
173 | "print('y shape:', y_train_binary.shape)"
174 | ]
175 | },
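As a quick illustration of the one-hot encoding described above (a sketch, not part of the original notebook), the integer class of each sample can be recovered with argmax:

class_indices = np.argmax(y_train_binary, axis=1)  # integer class per sample
print('first five activity labels:', [labels[i] for i in class_indices[:5]])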
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 |     "The data is split into train, validation, and test subsets."
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 8,
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "train set size: 11397\n",
193 | "validation set size: 100\n",
194 | "test set size: 1000\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "print('train set size:', X_train.shape[0])\n",
200 | "print('validation set size:', X_val.shape[0])\n",
201 | "print('test set size:', X_test.shape[0])"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "Let's have a look at the distribution of the labels:"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 9,
214 | "metadata": {},
215 | "outputs": [
216 | {
217 | "data": {
218 | "text/html": [
219 | "\n",
220 | "\n",
233 | "
\n",
234 | " \n",
235 | " \n",
236 | " | \n",
237 | " frequency | \n",
238 | "
\n",
239 | " \n",
240 | " \n",
241 | " \n",
242 | " lying | \n",
243 | " 0.136615 | \n",
244 | "
\n",
245 | " \n",
246 | " sitting | \n",
247 | " 0.130736 | \n",
248 | "
\n",
249 | " \n",
250 | " standing | \n",
251 | " 0.136703 | \n",
252 | "
\n",
253 | " \n",
254 | " walking | \n",
255 | " 0.176625 | \n",
256 | "
\n",
257 | " \n",
258 | " cycling | \n",
259 | " 0.118540 | \n",
260 | "
\n",
261 | " \n",
262 | " vaccuum_cleaning | \n",
263 | " 0.125208 | \n",
264 | "
\n",
265 | " \n",
266 | " ironing | \n",
267 | " 0.175573 | \n",
268 | "
\n",
269 | " \n",
270 | "
\n",
271 | "
"
272 | ],
273 | "text/plain": [
274 | " frequency\n",
275 | "lying 0.136615\n",
276 | "sitting 0.130736\n",
277 | "standing 0.136703\n",
278 | "walking 0.176625\n",
279 | "cycling 0.118540\n",
280 | "vaccuum_cleaning 0.125208\n",
281 | "ironing 0.175573"
282 | ]
283 | },
284 | "execution_count": 9,
285 | "metadata": {},
286 | "output_type": "execute_result"
287 | }
288 | ],
289 | "source": [
290 | "frequencies = y_train_binary.mean(axis=0)\n",
291 | "frequencies_df = pd.DataFrame(frequencies, index=labels, columns=['frequency'])\n",
292 | "frequencies_df"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "### *Question 1: How many channels does this dataset have?*\n",
300 | "### *Question 2: What is the least common activity label in this dataset?*\n",
301 | "\n",
302 | " "
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "metadata": {},
308 | "source": [
309 | "## Generate models"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 |     "The first step in the development of any deep learning model is to create a model architecture. As we do not know which architecture is best for our data, we will create a set of random models to investigate which architecture is most suitable for our data and classification task. This process of creating random models, checking how good they are, and then selecting the best one is called a 'random search'. Random search is considered a robust approach to finding a good model. You will need to specify how many models you want to create with the argument 'number_of_models'. For a full overview of the optional arguments, see the function documentation of modelgen.generate_models by running `modelgen.generate_models?`.\n",
317 | "\n",
318 | "##### What number of models to select?\n",
319 |     "This number differs per dataset. Because mcfly uses random search, more models will generally give better results, but they will also take longer to evaluate. For the purpose of this tutorial we recommend starting with only 2 models. If you have enough time, you can try a larger number of models, e.g. 10 or 20."
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 10,
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "num_classes = y_train_binary.shape[1]\n",
329 | "\n",
330 | "models = modelgen.generate_models(X_train.shape,\n",
331 | " number_of_classes=num_classes,\n",
332 | " number_of_models = 2)"
333 | ]
334 | },
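Each entry returned by `generate_models` is a (model, hyperparameters, model_type) tuple, so the generated candidates can be inspected like this (a sketch, not part of the original notebook):

for i, (model, params, model_type) in enumerate(models):
    print('Model', i, '-', model_type)
    print(params)          # the randomly drawn hyperparameters
    model.summary()        # the resulting Keras architecture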
335 | {
336 | "cell_type": "code",
337 | "execution_count": 11,
338 | "metadata": {},
339 | "outputs": [
340 | {
341 | "data": {
342 | "text/plain": [
343 | "(11397, 512, 9)"
344 | ]
345 | },
346 | "execution_count": 11,
347 | "metadata": {},
348 | "output_type": "execute_result"
349 | }
350 | ],
351 | "source": [
352 | "X_train.shape"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 12,
358 | "metadata": {},
359 | "outputs": [],
360 | "source": [
361 | "from keras.layers import Input\n",
362 | "from keras.layers.convolutional import Conv2D\n",
363 | "from keras.layers import BatchNormalization, Activation, Convolution1D, Lambda, \\\n",
364 | " Convolution2D, Flatten, \\\n",
365 | " Reshape, LSTM, Dropout, TimeDistributed, BatchNormalization\n",
366 | "from keras.regularizers import l2\n",
367 | "import keras"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 13,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "x = Input(shape=(512, 3))\n",
377 | "\n",
378 | "# 1x3 conv with 3 output channels (same as input channels)\n",
379 | "y = Convolution1D(3, (3), padding='same')(x)\n",
380 | "# this returns x + y.\n",
381 | "z = keras.layers.add([x, y],name='skipconnection')"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": 14,
387 | "metadata": {},
388 | "outputs": [],
389 | "source": [
390 |     "nn = keras.models.Model(inputs=x, outputs=z)\n",
391 |     ""
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": 15,
397 | "metadata": {},
398 | "outputs": [
399 | {
400 | "name": "stdout",
401 | "output_type": "stream",
402 | "text": [
403 | "__________________________________________________________________________________________________\n",
404 | "Layer (type) Output Shape Param # Connected to \n",
405 | "==================================================================================================\n",
406 | "input_1 (InputLayer) (None, 512, 3) 0 \n",
407 | "__________________________________________________________________________________________________\n",
408 | "conv1d_4 (Conv1D) (None, 512, 3) 30 input_1[0][0] \n",
409 | "__________________________________________________________________________________________________\n",
410 | "skipconnection (Add) (None, 512, 3) 0 input_1[0][0] \n",
411 | " conv1d_4[0][0] \n",
412 | "==================================================================================================\n",
413 | "Total params: 30\n",
414 | "Trainable params: 30\n",
415 | "Non-trainable params: 0\n",
416 | "__________________________________________________________________________________________________\n"
417 | ]
418 | }
419 | ],
420 | "source": [
421 | "nn.summary()"
422 | ]
423 | },
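The cell above adds the raw input back onto a single convolution. A common next step (a sketch using the Keras layers already imported above; it is not part of the original notebook) is to wrap the skip connection into a small residual block:

def residual_block(inputs, filters, kernel_size=3):
    # Conv -> BN -> ReLU -> Conv -> BN, then add the block input back on (skip connection)
    y = Convolution1D(filters, kernel_size, padding='same')(inputs)
    y = BatchNormalization()(y)
    y = Activation('relu')(y)
    y = Convolution1D(filters, kernel_size, padding='same')(y)
    y = BatchNormalization()(y)
    out = keras.layers.add([inputs, y])
    return Activation('relu')(out)

inp = Input(shape=(512, 3))
out = residual_block(inp, filters=3)  # filters must equal the input channels for the add to work
res_model = keras.models.Model(inputs=inp, outputs=out)
res_model.summary()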
424 | {
425 | "cell_type": "code",
426 | "execution_count": null,
427 | "metadata": {},
428 | "outputs": [],
429 | "source": []
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": null,
434 | "metadata": {},
435 | "outputs": [],
436 | "source": []
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": null,
441 | "metadata": {},
442 | "outputs": [],
443 | "source": []
444 | }
445 | ],
446 | "metadata": {
447 | "anaconda-cloud": {},
448 | "kernelspec": {
449 | "display_name": "Python 3",
450 | "language": "python",
451 | "name": "python3"
452 | },
453 | "language_info": {
454 | "codemirror_mode": {
455 | "name": "ipython",
456 | "version": 3
457 | },
458 | "file_extension": ".py",
459 | "mimetype": "text/x-python",
460 | "name": "python",
461 | "nbconvert_exporter": "python",
462 | "pygments_lexer": "ipython3",
463 | "version": "3.6.6"
464 | }
465 | },
466 | "nbformat": 4,
467 | "nbformat_minor": 1
468 | }
469 |
--------------------------------------------------------------------------------
/notebooks/experiments/preproces_Guinea-Bisseau_Nigeria.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Load Guinea-Bissau data and save as numpy file"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import pandas as pd\n",
20 | "from pandas import merge \n",
21 | "from os import listdir\n",
22 | "from numpy import genfromtxt, random"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 3,
28 | "metadata": {
29 | "collapsed": false
30 | },
31 | "outputs": [],
32 | "source": [
33 | "datadir = \"/media/windows-share/EEG/EEGs_Guinea-Bissau_cleaned\"\n",
34 | "outputdir = \"/media/windows-share/EEG/EEGs_Guinea-Bissau_np\"\n",
35 | "namecontrol = 'Control'\n",
36 | "nameepilepsy = 'Epilepsy'\n",
37 | "#datadir = \"/media/windows-share/EEG/EEGs_Nigeria_cleaned\"\n",
38 | "#outputdir = \"/media/windows-share/EEG/EEGs_Nigeria_np\"\n",
39 | "#namecontrol = 'control'\n",
40 | "#nameepilepsy = 'epilepsy'\n",
41 | "filenames = listdir(datadir)\n",
42 | "D = []\n",
43 | "sf = 128\n",
44 | "nc = 14\n",
45 | "#Nfiles = len(filenames)\n",
46 | "#X = np.zeros((Nfiles,maxtslength,nc)) "
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 4,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [],
56 | "source": [
57 | "id = list(map(int,list(map(lambda file: file[file.find('id')+2:file.find('dur')-1],filenames))))\n",
58 | "dur = list(map(int,list(map(lambda file: file[file.find('dur')+3:file.find('epoch')-1],filenames))))\n",
59 | "#epoch = list(map(int,list(map(lambda file: file[file.find('epoch')+5:file.find('gro')-1],filenames))))\n",
60 | "group = list(map(str,list(map(lambda file: file[file.find('gro')+3:file.find('.csv')],filenames))))\n",
61 | "protocol = list(map(str,list(map(lambda file: file[file.find('yes')+3:file.find('id')-1],filenames))))"
62 | ]
63 | },
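The cell above parses id, duration, group and protocol out of each file name with chained string slicing. A regular-expression equivalent (a sketch; the pattern below is inferred from that slicing and is an assumption about the real file-name format) would make the intent more explicit:

import re

# Hypothetical pattern, e.g. '...yes<protocol>_id<id>_dur<dur>_epoch<n>_gro<group>.csv'
pattern = re.compile(r'yes(?P<protocol>.+?).id(?P<id>\d+).dur(?P<dur>\d+).epoch(?P<epoch>\d+).gro(?P<group>.+)\.csv')

def parse_filename(name):
    match = pattern.search(name)
    return match.groupdict() if match else None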
64 | {
65 | "cell_type": "code",
66 | "execution_count": 5,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [],
71 | "source": [
72 | "mydata = id, dur, group\n",
73 | "df = pd.DataFrame.from_items([('id',id),('dur',dur),('group',group),('filenames',filenames),\n",
74 | " ('protocol',protocol)])"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 6,
80 | "metadata": {
81 | "collapsed": true
82 | },
83 | "outputs": [],
84 | "source": [
85 | "def getids(x,y,prop,N):\n",
86 | " ix = np.sort(np.random.choice(x,round(N*prop),replace=False))\n",
87 | " iy = np.sort(np.random.choice(y,round(N*(1-prop)),replace=False))\n",
88 | " if (len(ix)+len(iy)) < 20:\n",
89 | " print(prop,N,len(x),len(y))\n",
90 | " x = [x for i,x in enumerate(x) if x not in ix] \n",
91 | " y = [x for i,x in enumerate(y) if x not in iy] \n",
92 | " icon = np.concatenate((ix,iy))\n",
93 | " return icon, x, y"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 7,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "train_10seconds_open 91\n",
108 | "(413, 1280, 14)\n",
109 | "(413, 1)\n",
110 | "valid_10seconds_open 20\n",
111 | "(141, 1280, 14)\n",
112 | "(141, 1)\n",
113 | "test_10seconds_open 20\n",
114 | "(119, 1280, 14)\n",
115 | "(119, 1)\n",
116 | "train_10seconds_closed 84\n",
117 | "(440, 1280, 14)\n",
118 | "(440, 1)\n",
119 | "valid_10seconds_closed 20\n",
120 | "(145, 1280, 14)\n",
121 | "(145, 1)\n",
122 | "test_10seconds_closed 20\n",
123 | "(162, 1280, 14)\n",
124 | "(162, 1)\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "logstructure = []\n",
130 | "for mindur in [10]: # minimum duration of an epoch in seconds\n",
131 | " for protocol in ['open','closed']:\n",
132 | " df2 = df[(df['protocol']==protocol) & (df['dur'] >= mindur)]\n",
133 | " maxtslength = mindur * sf\n",
134 | " #Identify training, test and validation group\n",
135 | " con = np.unique(df2[df2['group'] == namecontrol]['id'])\n",
136 | " epi = np.unique(df2[df2['group'] == nameepilepsy]['id'])\n",
137 | " Nid = len(con) + len(epi) #number of ids\n",
138 | " prop = 0.5 # len(con) / Nid #proportion of controls\n",
139 | " random.seed(300)\n",
140 | " ival, con, epi = getids(con,epi,prop,N=20) # validation set\n",
141 | " ites, con, epi = getids(con,epi,prop,N=20) # test set\n",
142 | " itra = np.concatenate((con, epi)) # training set\n",
143 | " #print(len(ival),len(ites),len(itra))\n",
144 |     "        # Now use the selected ids per group to load the data\n",
145 | " for subset in ['train','valid','test']:\n",
146 | " conditionname = subset+'_'+str(mindur)+'seconds_'+protocol\n",
147 | " if subset == 'train':\n",
148 | " tmp = df2[df2.id.isin(itra)]\n",
149 | " filenames = tmp['filenames']\n",
150 | " if subset == 'valid':\n",
151 | " tmp = df2[(df2.id.isin(ival))]\n",
152 | " tmp = tmp.sort_values(by=['id']).groupby('id').first() # select first available epoch\n",
153 | " filenames = tmp['filenames']\n",
154 | " if subset == 'test':\n",
155 | " tmp = df2[(df2.id.isin(ites))]\n",
156 | " tmp = tmp.sort_values(by=['id']).groupby('id').first() # select first available epoch\n",
157 | " filenames = tmp['filenames']\n",
158 | " X = np.zeros((0,maxtslength,nc)) #len(filenames)\n",
159 | " y = np.zeros((0,1)) #len(filenames)\n",
160 | " \n",
161 | " print(conditionname + ' ' + str(len(filenames)))\n",
162 | " for file in filenames:\n",
163 | " path = datadir + '/' + file\n",
164 | " D = pd.read_csv(path, sep=',',header=0,usecols=list(range(0,14)))\n",
165 | " if D.shape[0] > maxtslength:\n",
166 | " slicesize = sf * 10\n",
167 | " for slicei in range(int((len(D)/slicesize)-1)):\n",
168 | " sta = (((slicei)*slicesize))+1\n",
169 | " end = ((slicei+1)*slicesize)+1\n",
170 | " D2 = np.array(D[sta:end]) # take first part or should these be a random selection?\n",
171 | " #D = np.array(D[0:maxtslength]) # take first part or should these be a random selection?\n",
172 | " D2 = np.reshape(D2,(1,D2.shape[0],D2.shape[1]))\n",
173 | " m = D2.mean(axis=1,keepdims=True)\n",
174 | " D2 = D2 - m # subtract mean\n",
175 | " if X.shape[2] != D2.shape[2]:\n",
176 | " print(X.shape)\n",
177 | " print(D2.shape)\n",
178 | " X = np.vstack((X,D2))\n",
179 | " logstructure.append([subset,mindur,protocol,file])\n",
180 | " diagnosis = tmp.group[(tmp.filenames == file)]\n",
181 | " y = np.vstack((y,diagnosis))\n",
182 | " \n",
183 | " fnameX = outputdir + '/X_' + conditionname\n",
184 | " fnamey = outputdir + '/y_' + conditionname\n",
185 | " np.save(file=fnameX,arr=X)\n",
186 | " #y = np.array(tmp['group'])\n",
187 | " np.save(file=fnamey,arr=y) \n",
188 | " print(X.shape)\n",
189 | " print(y.shape)\n",
190 | "np.savetxt(outputdir + '/log.csv', logstructure,\n",
191 | " delimiter=\",\", fmt='%s')"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 8,
197 | "metadata": {
198 | "collapsed": false
199 | },
200 | "outputs": [
201 | {
202 | "name": "stdout",
203 | "output_type": "stream",
204 | "text": [
205 | "/media/windows-share/EEG/EEGs_Guinea-Bissau_np/X_test_10seconds_closed\n",
206 | "/media/windows-share/EEG/EEGs_Guinea-Bissau_np/y_test_10seconds_closed\n"
207 | ]
208 | }
209 | ],
210 | "source": [
211 | "print(fnameX)\n",
212 | "print(fnamey)\n",
213 | "testreadX = np.load(file=fnameX+'.npy')\n",
214 | "testready = np.load(file=fnamey+'.npy')"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 9,
220 | "metadata": {
221 | "collapsed": false
222 | },
223 | "outputs": [
224 | {
225 | "name": "stdout",
226 | "output_type": "stream",
227 | "text": [
228 | "(162, 1280, 14)\n",
229 | "(162, 1)\n"
230 | ]
231 | }
232 | ],
233 | "source": [
234 | "print(testreadX.shape)\n",
235 | "print(testready.shape)"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {
242 | "collapsed": true
243 | },
244 | "outputs": [],
245 | "source": []
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {
251 | "collapsed": true
252 | },
253 | "outputs": [],
254 | "source": []
255 | }
256 | ],
257 | "metadata": {
258 | "kernelspec": {
259 | "display_name": "Python 3",
260 | "language": "python",
261 | "name": "python3"
262 | },
263 | "language_info": {
264 | "codemirror_mode": {
265 | "name": "ipython",
266 | "version": 3
267 | },
268 | "file_extension": ".py",
269 | "mimetype": "text/x-python",
270 | "name": "python",
271 | "nbconvert_exporter": "python",
272 | "pygments_lexer": "ipython3",
273 | "version": "3.5.2"
274 | }
275 | },
276 | "nbformat": 4,
277 | "nbformat_minor": 0
278 | }
279 |
--------------------------------------------------------------------------------
/notebooks/tutorial/model/model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NLeSC/mcfly-tutorial/4b1548058c158d0efef41bfb6c7b2caa575a8858/notebooks/tutorial/model/model.h5
--------------------------------------------------------------------------------
/notebooks/tutorial/workshop.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tutorial PAMAP2 with mcfly (workshop version)"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 |     "In preparation for the workshop, please run the code below to check whether mcfly is installed correctly and to download the dataset (PAMAP2).\n",
15 | "\n",
16 | "Before you can start, please make sure you install mcfly (see the [mcfly installation page](https://github.com/NLeSC/mcfly)) and make sure your jupyter notebook has a python3 kernel."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## Import required Python modules"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "collapsed": false,
31 | "scrolled": false
32 | },
33 | "outputs": [],
34 | "source": [
35 | "import sys\n",
36 | "import os\n",
37 | "import numpy as np\n",
38 | "import pandas as pd\n",
39 | "# mcfly\n",
40 | "from mcfly import modelgen, find_architecture, storage\n",
41 | "np.random.seed(2)"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {
48 | "collapsed": false
49 | },
50 | "outputs": [],
51 | "source": [
52 | "sys.path.insert(0, os.path.abspath('../..'))\n",
53 | "from utils import tutorial_pamap2"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 |     "## Download pre-processed data"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 |     "We have created a function for you to fetch the preprocessed data. Please specify the `directory_to_extract_to` in the code below and then execute the cell. This will download the preprocessed data into a `data` subdirectory of that directory. The function returns the path where the preprocessed data is stored."
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {
74 | "collapsed": true
75 | },
76 | "outputs": [],
77 | "source": [
78 | "# Specify in which directory you want to store the data:\n",
79 | "directory_to_extract_to = '.'"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {
86 | "collapsed": false
87 | },
88 | "outputs": [],
89 | "source": [
90 | "data_path = tutorial_pamap2.download_preprocessed_data(directory_to_extract_to)"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {
97 | "collapsed": true
98 | },
99 | "outputs": [],
100 | "source": []
101 | }
102 | ],
103 | "metadata": {
104 | "anaconda-cloud": {},
105 | "kernelspec": {
106 | "display_name": "Python [conda root]",
107 | "language": "python",
108 | "name": "conda-root-py"
109 | },
110 | "language_info": {
111 | "codemirror_mode": {
112 | "name": "ipython",
113 | "version": 3
114 | },
115 | "file_extension": ".py",
116 | "mimetype": "text/x-python",
117 | "name": "python",
118 | "nbconvert_exporter": "python",
119 | "pygments_lexer": "ipython3",
120 | "version": "3.5.2"
121 | }
122 | },
123 | "nbformat": 4,
124 | "nbformat_minor": 1
125 | }
126 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | mcfly
2 | matplotlib
3 | pandas
4 | jupyter
5 | numpy
6 | scipy
7 |
--------------------------------------------------------------------------------
/scripts/Actitracker_train.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 | import sys
7 | import os
8 | import numpy as np
9 | import pandas as pd
10 | import json
11 | # mcfly
12 | from mcfly import modelgen, find_architecture, storage
13 |
14 |
15 | # In[2]:
16 |
17 | data_path = '/media/sf_VBox_Shared/timeseries/actitiracker/WISDM_at_v2.0/'
18 | preprocessed_path = os.path.join(data_path, 'preprocessed')
19 | result_path = os.path.join(data_path, 'models_test')
20 |
21 |
22 |
23 | # In[3]:
24 |
25 | X_train = np.load(os.path.join(preprocessed_path, 'X_train.npy'))
26 | X_val = np.load(os.path.join(preprocessed_path, 'X_val.npy'))
27 | X_test = np.load(os.path.join(preprocessed_path, 'X_test.npy'))
28 | y_train = np.load(os.path.join(preprocessed_path, 'y_train.npy'))
29 | y_val = np.load(os.path.join(preprocessed_path, 'y_val.npy'))
30 | y_test = np.load(os.path.join(preprocessed_path, 'y_test.npy'))
31 |
32 |
33 |
34 | with open(os.path.join(preprocessed_path, 'labels.json')) as f:
35 | labels = json.load(f)
36 |
37 |
38 | # ## Generate models
39 |
40 |
41 | num_classes = y_train.shape[1]
42 |
43 | models = modelgen.generate_models(X_train.shape,
44 | number_of_classes=num_classes,
45 | number_of_models = 15)
46 |
47 |
48 |
49 |
50 | #what is the fraction of classes in the validation set?
51 | print(pd.Series(y_val.mean(axis=0), index=labels))
52 |
53 |
54 | if not os.path.exists(result_path):
55 | os.makedirs(result_path)
56 |
57 |
58 |
59 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train,
60 | X_val, y_val,
61 | models,nr_epochs=5,
62 | subset_size=512,
63 | verbose=True,
64 | batch_size=32,
65 |                                                                            outputfile=os.path.join(result_path, 'models.json'),
66 | early_stopping=True)
67 |
68 |
69 |
70 | print('Details of the training process were stored in ',os.path.join(result_path, 'models.json'))
71 |
72 |
73 |
74 | best_model_index = np.argmax(val_accuracies)
75 | best_model, best_params, best_model_types = models[best_model_index]
76 | print('Model type and parameters of the best model:')
77 | print(best_model_types)
78 | print(best_params)
79 |
80 |
81 | nr_epochs = 3
82 | datasize = X_train.shape[0]
83 | history = best_model.fit(X_train[:datasize,:,:], y_train[:datasize,:],
84 | epochs=nr_epochs, validation_data=(X_val, y_val))
85 |
86 |
87 | best_model.save(os.path.join(result_path, 'best_model.h5'))
88 |
89 |
90 |
91 | ## Test on Testset
92 | score_test = best_model.evaluate(X_test, y_test, verbose=True)
93 | print('Score of best model: ' + str(score_test))
94 |
95 |
96 |
97 |
98 |
99 |
--------------------------------------------------------------------------------
/scripts/EEG_alcoholic_train.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[2]:
5 |
6 | import sys
7 | import os
8 | import numpy as np
9 | import pandas as pd
10 | # mcfly
11 | from mcfly import modelgen, find_architecture, storage
12 |
13 | # Parameters
14 | data_path = '/media/sf_VBox_Shared/timeseries/UCI_EEG_alcoholic/'
15 | number_of_models = 10
16 | nr_epochs = 5
17 | subset_size = 512
18 | batch_size = 32
19 | early_stopping = True
20 |
21 | # In[3]:
22 |
23 |
24 | preprocessed_path = os.path.join(data_path, 'preprocessed')
25 | result_path = os.path.join(data_path, 'models')
26 |
27 |
28 | # In[4]:
29 |
30 |
31 | X_train = np.load(os.path.join(preprocessed_path, 'X_train.npy'))
32 | X_val = np.load(os.path.join(preprocessed_path, 'X_val.npy'))
33 | X_test = np.load(os.path.join(preprocessed_path, 'X_test.npy'))
34 | y_train = np.load(os.path.join(preprocessed_path, 'y_train.npy'))
35 | y_val = np.load(os.path.join(preprocessed_path, 'y_val.npy'))
36 | y_test = np.load(os.path.join(preprocessed_path, 'y_test.npy'))
37 |
38 |
39 | # ## Generate models
40 |
41 | # In[5]:
42 |
43 | num_classes = y_train.shape[1]
44 |
45 | models = modelgen.generate_models(X_train.shape,
46 | number_of_classes=num_classes,
47 | number_of_models = number_of_models)
48 |
49 |
50 | # In[6]:
51 |
52 | #what is the fraction of a vs c in the validation set?
53 | print(y_val.mean(axis=0))
54 |
55 |
56 | # In[7]:
57 |
58 | if not os.path.exists(result_path):
59 | os.makedirs(result_path)
60 |
61 |
62 | # In[ ]:
63 |
64 | outputfile = os.path.join(result_path, 'modelcomparison.json')
65 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train,
66 | X_val, y_val,
67 | models,nr_epochs=nr_epochs,
68 | subset_size=subset_size,
69 | verbose=True,
70 | batch_size=batch_size,
71 | outputfile=outputfile,
72 | early_stopping=early_stopping)
73 | print('Details of the training process were stored in ',outputfile)
74 |
75 |
76 | # In[ ]:
77 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
/scripts/experiment_PAMAP.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # # Experiment PAMAP with mcfly
5 |
6 | # ## Import required Python modules
7 |
8 | # In[1]:
9 |
10 | import sys
11 | import os
12 | import numpy as np
13 | import pandas as pd
14 | # mcfly
15 | from mcfly import modelgen, find_architecture, storage
16 | from keras.models import load_model
17 | np.random.seed(2)
18 |
19 |
20 | # In[2]:
21 |
22 | sys.path.insert(0, os.path.abspath('../..'))
23 | from utils import tutorial_pamap2
24 |
25 |
26 | # Load the preprocessed data as stored in Numpy files. Please note that the data has already been split up into training (train), validation (val), and test subsets. It is common practice to call the input data X and the labels y.
27 |
28 | # In[3]:
29 |
30 | data_path = '/media/sf_VBox_Shared/timeseries/PAMAP_Dataset/cleaned_7act/'
31 |
32 |
33 | # In[4]:
34 |
35 | X_train, y_train_binary, X_val, y_val_binary, X_test, y_test_binary, labels = tutorial_pamap2.load_data(data_path)
36 |
37 |
38 | # In[5]:
39 |
40 | print('x shape:', X_train.shape)
41 | print('y shape:', y_train_binary.shape)
42 |
43 |
44 | # The data is split into train, validation, and test subsets.
45 |
46 | # In[6]:
47 |
48 | print('train set size:', X_train.shape[0])
49 | print('validation set size:', X_val.shape[0])
50 | print('test set size:', X_test.shape[0])
51 |
52 |
53 | # Let's have a look at the distribution of the labels:
54 |
55 | # In[7]:
56 |
57 | frequencies = y_train_binary.mean(axis=0)
58 | frequencies_df = pd.DataFrame(frequencies, index=labels, columns=['frequency'])
59 | frequencies_df
60 |
61 |
62 | # ## Generate models
63 |
64 | # In[8]:
65 |
66 | num_classes = y_train_binary.shape[1]
67 |
68 | models = modelgen.generate_models(X_train.shape,
69 | number_of_classes=num_classes,
70 | number_of_models = 5)
71 |
72 |
73 | # In[10]:
74 |
75 | models_to_print = range(len(models))
76 | for i, item in enumerate(models):
77 | if i in models_to_print:
78 | model, params, model_types = item
79 | print("-------------------------------------------------------------------------------------------------------")
80 | print("Model " + str(i))
81 | print(" ")
82 | print("Hyperparameters:")
83 | print(params)
84 | print(" ")
85 | print("Model description:")
86 | model.summary()
87 | print(" ")
88 | print("Model type:")
89 | print(model_types)
90 | print(" ")
91 |
92 |
93 | # ## Compare models
94 |
95 | # In[13]:
96 |
97 | # Define directory where the results, e.g. json file, will be stored
98 | resultpath = os.path.join(data_path, '..', 'data/models')
99 | if not os.path.exists(resultpath):
100 | os.makedirs(resultpath)
101 |
102 |
103 | # In[14]:
104 |
105 | outputfile = os.path.join(resultpath, 'modelcomparison_pamap.json')
106 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train_binary,
107 | X_val, y_val_binary,
108 | models,nr_epochs=5,
109 | subset_size=1000,
110 | verbose=True,
111 | outputfile=outputfile)
112 | print('Details of the training process were stored in ',outputfile)
113 |
114 |
115 | # In[15]:
116 |
117 | best_model_index = np.argmax(val_accuracies)
118 | best_model, best_params, best_model_types = models[best_model_index]
119 | print('Model type and parameters of the best model:')
120 | print(best_model_types)
121 | print(best_params)
122 |
123 |
124 | # ## Train the best model on the full dataset
125 |
126 | # In[16]:
127 |
128 | # Continue training the best model on the full training set
129 | nr_epochs = 1
130 | datasize = X_train.shape[0]
131 | history = best_model.fit(X_train[:datasize,:,:], y_train_binary[:datasize,:],
132 | epochs=nr_epochs, validation_data=(X_val, y_val_binary))
133 |
134 |
135 | # In[17]:
136 |
137 | modelname = 'my_bestmodel.h5'
138 | model_path = os.path.join(resultpath,modelname)
139 |
140 |
141 | # In[18]:
142 |
143 | best_model.save(model_path)
144 |
145 |
146 | # In[ ]:
147 |
148 |
149 |
150 |
--------------------------------------------------------------------------------
/scripts/experiment_PAMAP2_9fold.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # # Experiment PAMAP2 with mcfly
5 |
6 | # This experiment finds an optimal model for the PAMAP2 dataset.
7 |
8 | # ## Import required Python modules
9 |
10 | # In[1]:
11 |
12 | import sys
13 | import os
14 | import numpy as np
15 | import pandas as pd
16 | # mcfly
17 | from mcfly import modelgen, find_architecture, storage
18 |
19 |
20 | # In[11]:
21 |
22 | trainsize = 500
23 | valsize = 100
24 | nr_models = 10
25 | nr_epochs = 10
26 | subset_size = trainsize
27 |
28 |
29 | # ## Load the data
30 |
31 | # In[2]:
32 |
33 | # Define directory where the results, e.g. json file, will be stored
34 | datapath = '/data/mcfly/input'
35 | resultpath = '/data/mcfly/output'
36 | if not os.path.exists(resultpath):
37 | os.makedirs(resultpath)
38 |
39 |
40 | # In[3]:
41 |
42 | Xs = []
43 | ys = []
44 |
45 | ext = '.npy'
46 | for i in range(9):
47 | Xs.append(np.load(os.path.join(datapath,'X_'+str(i)+ext)))
48 | ys.append(np.load(os.path.join(datapath, 'y_'+str(i)+ext)))
49 |
50 |
51 | # In[4]:
52 |
53 | print(Xs[0].shape, ys[0].shape)
54 |
55 |
56 | # ## Generate models
57 |
58 | # The first step is to create a model architecture. As we do not know which architecture is best for our data, we will create a set of models to investigate which architecture is most suitable for our data and classification task. You will need to specify how many models you want to create with the argument 'number_of_models', the type of model, which can be 'CNN' or 'DeepConvLSTM', and the maximum number of layers per model type. For a full overview of the optional arguments, see the function documentation of modelgen.generate_models.
59 |
60 | # In[16]:
61 |
62 | num_classes = ys[0].shape[1]
63 | np.random.seed(123)
64 | models = modelgen.generate_models(Xs[0].shape,
65 | number_of_classes=num_classes,
66 | number_of_models = nr_models)
67 |
68 |
69 | # In[ ]:
70 |
71 |
72 |
73 |
74 | # In[19]:
75 |
76 | for i, (model, params, model_type) in enumerate(models):
77 | storage.savemodel(model,resultpath,"model_"+str(i))
78 |
79 |
80 | # ## Compare models
81 | # Now that the model architectures have been generated it is time to compare the models by training them in a subset of the training data and evaluating the models in the validation subset. This will help us to choose the best candidate model. Performance results are stored in a json file.
82 |
83 | # In[25]:
84 |
85 | def split_train_test(X_list, y_list, j):
86 | X_train = np.concatenate(X_list[0:j]+X_list[j+1:])
87 | X_test = X_list[j]
88 | y_train = np.concatenate(y_list[0:j]+y_list[j+1:])
89 | y_test = y_list[j]
90 | return X_train, y_train, X_test, y_test
91 |
92 | def split_train_small_val(X_list, y_list, j, trainsize=500, valsize=500):
93 | X = np.concatenate(X_list[0:j]+X_list[j+1:])
94 | y = np.concatenate(y_list[0:j]+y_list[j+1:])
95 | rand_ind = np.random.choice(X.shape[0], trainsize+valsize, replace=False)
96 | X_train = X[rand_ind[:trainsize]]
97 | y_train = y[rand_ind[:trainsize]]
98 | X_val = X[rand_ind[trainsize:]]
99 | y_val = y[rand_ind[trainsize:]]
100 | return X_train, y_train, X_val, y_val
101 |
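# Usage sketch (not part of the original script): for fold j=0 the helpers above
# would be called as follows, using Xs, ys, trainsize and valsize defined earlier.
#   X_train, y_train, X_test, y_test = split_train_test(Xs, ys, j=0)
#   X_tr, y_tr, X_va, y_va = split_train_small_val(Xs, ys, j=0, trainsize=trainsize, valsize=valsize)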
102 |
103 | # In[26]:
104 |
105 | from keras.optimizers import Adam
106 | from keras.models import model_from_json
107 |
108 | def get_fresh_copy(model, lr):
109 | model_json = model.to_json()
110 | model_copy = model_from_json(model_json)
111 | model_copy.compile(loss='categorical_crossentropy',
112 | optimizer=Adam(lr=lr),
113 | metrics=['accuracy'])
114 | #for layer in model_copy.layers:
115 | # layer.build(layer.input_shape)
116 | return model_copy
117 |
118 |
119 | # In[10]:
120 |
121 | models = [(get_fresh_copy(model, params['learning_rate']), params, model_type) for model, params, model_type in models]
122 |
123 |
124 | # In[12]:
125 |
126 | import time
127 | t = time.time()
128 | np.random.seed(123)
129 | histories_list, val_accuracies_list, val_losses_list = [], [], []
130 | for j in range(len(Xs)):
131 | print('fold '+str(j))
132 | models = [(get_fresh_copy(model, params['learning_rate']), params, model_type) for model, params, model_type in models]
133 | X_train, y_train, X_val, y_val = split_train_small_val(Xs, ys, j, trainsize=trainsize, valsize=valsize)
134 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train,
135 | X_val, y_val,
136 | models,
137 | nr_epochs=nr_epochs,
138 | subset_size=subset_size,
139 | verbose=True,
140 | outputfile=os.path.join(resultpath,
141 | 'experiment'+str(j)+'.json'),
142 | early_stopping=True)
143 | histories_list.append(histories)
144 | val_accuracies_list.append(val_accuracies)
145 |     val_losses_list.append(val_losses)
146 | print(time.time()-t)
147 |
148 |
149 | # In[6]:
150 |
151 | # Read them all back in
152 | import json
153 | model_jsons = []
154 | for j in range(len(Xs)):
155 | with open(os.path.join(resultpath, 'experiment'+str(j)+'.json'), 'r') as outfile:
156 | model_jsons.append(json.load(outfile))
157 |
158 |
159 | # In[12]:
160 |
161 | val_accuracies = np.array([[mod['val_acc'][-1] for mod in fold] for fold in model_jsons])
162 |
163 |
164 | # In[13]:
165 |
166 | val_acc = np.array([np.array([mod['val_acc'][-1] for mod in fold], dtype='float') for fold in model_jsons])
167 | train_acc = np.array([np.array([mod['train_acc'][-1] for mod in fold], dtype='float') for fold in model_jsons])
168 | train_loss = np.array([np.array([mod['train_loss'][-1] for mod in fold], dtype='float') for fold in model_jsons])
169 | val_loss = np.array([np.array([mod['val_loss'][-1] for mod in fold], dtype='float') for fold in model_jsons])
170 |
171 |
172 | # In[14]:
173 |
174 | val_accuracies_avg = val_acc.mean(axis=0)
175 | print('val_accuracies_avg:', val_accuracies_avg)
176 |
177 |
178 | # In[23]:
179 |
180 | best_model_index = np.argmax(val_accuracies_avg)
181 | best_model = storage.loadmodel(resultpath, 'model_'+str(best_model_index))
182 |
183 |
184 | # In[28]:
185 |
186 | best_params = model_jsons[0][best_model_index]
187 |
188 |
189 | # ## Train the best model for real
190 |
191 | # Now that we have identified the best model architecture out of our random pool of models, we can continue by training the model on the full training sample for each fold.
192 |
193 | # In[ ]:
194 |
195 | nr_epochs = 2
196 |
197 | np.random.seed(123)
198 | histories, test_accuracies_list, models = [], [], []
199 | for j in range(len(Xs)):
200 | X_train, y_train, X_test, y_test = split_train_test(Xs, ys, j)
201 | model_copy = get_fresh_copy(best_model, best_params['learning_rate'])
202 | datasize = X_train.shape[0]
203 |
204 | history = model_copy.fit(X_train[:datasize,:,:], y_train[:datasize,:],
205 |                     epochs=nr_epochs, validation_data=(X_test, y_test))
206 |
207 | histories.append(history)
208 | test_accuracies_list.append(history.history['val_acc'][-1] )
209 | models.append(model_copy)
210 |
211 |
212 | # In[ ]:
213 |
214 | modelname = 'my_bestmodel'
215 |
216 |
217 | # In[ ]:
218 |
219 | for i, model in enumerate(models):
220 | storage.savemodel(model,resultpath,modelname+str(i))
221 |
222 |
223 | # In[ ]:
224 |
225 | print('accuracies: ', test_accuracies_list)
226 |
227 |
228 | # In[ ]:
229 |
230 | print(np.mean(test_accuracies_list))
231 |
232 |
233 | # In[ ]:
234 |
235 |
236 |
237 |
--------------------------------------------------------------------------------
/scripts/pamap2.py:
--------------------------------------------------------------------------------
1 | """
2 | Script to do a model comparison
3 | Run this script from the root of repository:
4 |
5 | `python scripts/pamap2.py`
6 | """
7 | import sys
8 | import os
9 | import numpy as np
10 | import pandas as pd
11 | from mcfly import modelgen, find_architecture, storage
12 |
13 | np.random.seed(2)
14 | sys.path.insert(0, os.path.abspath('.'))
15 | print(sys.path)
16 | from utils import tutorial_pamap2
17 |
18 | # ## Settings
19 | # Specify in which directory you want to store the data:
20 | directory_to_extract_to = 'notebooks/tutorial/'
21 | number_of_models = 2
22 | subset_size = 10
23 | nr_epochs = 1
24 |
25 | # ## Download data and pre-proces data
26 | data_path = tutorial_pamap2.download_preprocessed_data(directory_to_extract_to)
27 | X_train, y_train_binary, X_val, y_val_binary, X_test, y_test_binary, labels = tutorial_pamap2.load_data(data_path)
28 |
29 | # The data is split into train, validation, and test subsets.
30 |
31 | print('train set size:', X_train.shape[0])
32 | print('validation set size:', X_val.shape[0])
33 | print('test set size:', X_test.shape[0])
34 |
35 | # ## Generate models
36 |
37 | num_classes = y_train_binary.shape[1]
38 | models = modelgen.generate_models(X_train.shape,
39 | number_of_classes=num_classes,
40 | number_of_models=number_of_models)
41 |
42 | # Define output path
43 | resultpath = os.path.join(directory_to_extract_to, 'data/models')
44 | if not os.path.exists(resultpath):
45 | os.makedirs(resultpath)
46 | outputfile = os.path.join(resultpath, 'modelcomparison.json')
47 |
48 | histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(X_train, y_train_binary,
49 | X_val, y_val_binary,
50 | models, nr_epochs=nr_epochs,
51 | subset_size=subset_size,
52 | verbose=True,
53 | outputfile=outputfile)
54 | print('Details of the training process were stored in ', outputfile)
55 |
56 | # # Inspect model performance (table)
57 | modelcomparisons = pd.DataFrame({'model': [str(params) for model, params, model_types in models],
58 | 'train_acc': [history.history['acc'][-1] for history in histories],
59 | 'train_loss': [history.history['loss'][-1] for history in histories],
60 | 'val_acc': [history.history['val_acc'][-1] for history in histories],
61 | 'val_loss': [history.history['val_loss'][-1] for history in histories]
62 | })
63 | modelcomparisons.to_csv(os.path.join(resultpath, 'modelcomparisons.csv'))
64 |
65 | print(modelcomparisons)
66 |
67 | # # Choose the best model and save it
68 |
69 |
70 | best_model_index = np.argmax(val_accuracies)
71 | best_model, best_params, best_model_types = models[best_model_index]
72 | print('Model type and parameters of the best model:')
73 | print(best_model_types)
74 | print(best_params)
75 | modelname = 'my_bestmodel'
76 | storage.savemodel(best_model, resultpath, modelname)
77 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NLeSC/mcfly-tutorial/4b1548058c158d0efef41bfb6c7b2caa575a8858/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_tutorial_pamap2.py:
--------------------------------------------------------------------------------
1 | from utils import tutorial_pamap2
2 | import numpy as np
3 | import pandas as pd
4 | import os.path
5 | import unittest
6 |
7 |
8 | class TutorialPAMAP2Suite(unittest.TestCase):
9 | """Basic test cases."""
10 |
11 | def test_split_activities(self):
12 | """
13 | Test whether split_activities produces a Numpy array
14 | """
15 | labels = np.ones(3000)
16 | labels[range(150)] = 2
17 | X = np.ones((3000,9))
18 | splittedX, splitted_y = tutorial_pamap2.split_activities(labels,X,[0], borders=50)
19 | assert splittedX[0].shape == (50, 9)
20 | assert splittedX[1].shape == (2750, 9)
21 |
22 |
23 | def test_sliding_window(self):
24 | """ Test whether sliding_window correctly updates x_train to the
25 | right size"""
26 | frame_length = 512
27 | step = 100
28 | x_trainlist = [np.zeros((25187,9)) for b in range(78)]
29 | y_trainlist = [np.zeros((12,9)) for b in range(78)]
30 | x_train, y_train = tutorial_pamap2.sliding_window(frame_length, step, x_trainlist, y_trainlist)
31 | test = len(x_train) == 19266
32 | assert test
33 |
34 | def test_transform_y(self):
35 | """ Test whether function produces Numpy array of expected size """
36 | mapclasses = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, \
37 | 12: 7, 13: 8, 16: 9, 17: 10, 24: 11}
38 | nr_classes = 12
39 | y = list([1,2,5,7,13,16,24,1,2,5,7,13,16,24]) #14 values
40 | transformedy = tutorial_pamap2.transform_y(y, mapclasses, nr_classes)
41 | test = transformedy.shape == (14,12)
42 | assert test
43 |
44 | def test_addheader(self):
45 | """ Test whether addheader produces dataframe of same shape as input
46 | """
47 | datasets = [pd.DataFrame(index=range(100),columns=range(54)) for b in range(10)]
48 | datasetsnew = tutorial_pamap2.addheader(datasets)
49 | test = datasetsnew[0].shape == datasets[0].shape
50 | assert test
51 |
52 | def test_numpify_and_store(self):
53 | """ Test whether numpify_and_store produces npy-file """
54 | Nsamples = 9
55 | Ntimesteps = 10
56 | Ncolumns = 3
57 | X = [[[0 for a in range(Ncolumns)] for b in range(Ntimesteps)] \
58 | for c in range(Nsamples)]
59 | y = [[0 for a in range(Ntimesteps)] for b in range(Nsamples)]
60 | xname = 'xname'
61 | yname = 'yname'
62 | outdatapath = os.getcwd()
63 | tutorial_pamap2.numpify_and_store(X, y, xname, yname, outdatapath, \
64 | shuffle=True)
65 | filename = os.path.join(outdatapath, xname+ '.npy')
66 | test = os.path.isfile(filename)
67 | if test == True:
68 | os.remove(filename)
69 | os.remove(os.path.join(outdatapath, yname + '.npy'))
70 | assert test
71 |
72 | def test_split_data(self):
73 | """ Test whether function produces numpy arrays
74 | of the correct dimensions """
75 | Xlists = tuple([[np.zeros((200,9)) for b in range(14)] for c in range(9)])
76 | ybinarylists = [np.zeros((14,12)) for c in range(9)]
77 | indices = slice(7, 9)
78 | x_test, y_test = tutorial_pamap2.split_data(Xlists, ybinarylists, \
79 | indices)
80 | test = y_test[0].shape == (12,) and x_test[0].shape == (200, 9)
81 | assert test
82 |
83 | def test_load_model(self):
84 | from tensorflow.keras.models import load_model
85 | model = load_model('./notebooks/tutorial/model/model.h5')
86 | assert len(model.layers) > 1
87 |
88 | if __name__ == '__main__':
89 | unittest.main()
90 |
--------------------------------------------------------------------------------
/tests/test_tutorial_weather.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import unittest
3 | from pathlib import Path
4 |
5 | from utils.tutorial_weather import load_data
6 |
7 |
8 | class TutorialWeatherSuite(unittest.TestCase):
9 | """ Weather data set test cases."""
10 | temp_test_dir = 'temp_weather_test'
11 |
12 | def test_data_downloading_has_correct_shape(self):
13 | n_features = 89
14 | n_train_instances = 767
15 | n_test_instances = 329
16 |
17 | X_train, X_test, y_train, y_test = load_data(self.temp_test_dir)
18 |
19 | assert X_train.shape == (n_train_instances, n_features)
20 | assert X_test.shape == (n_test_instances, n_features)
21 | assert y_train.shape == (n_train_instances,)
22 | assert y_test.shape == (n_test_instances,)
23 |
24 | def setUp(self) -> None:
25 | Path(self.temp_test_dir).mkdir()
26 |
27 | def tearDown(self) -> None:
28 | shutil.rmtree(Path(self.temp_test_dir))
29 |
30 |
31 | if __name__ == '__main__':
32 | unittest.main()
33 |
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .tutorial_pamap2 import *
--------------------------------------------------------------------------------
/utils/tutorial_pamap2.py:
--------------------------------------------------------------------------------
1 | """
2 | Summary:
3 | Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and
4 | preproces the data.
5 | Example function calls in 'Tutorial mcfly on PAMAP2.ipynb'
6 | """
7 | import numpy as np
8 | import pandas as pd
9 | from os import listdir
10 | import os.path
11 | import zipfile
12 | from tensorflow.keras.utils import to_categorical
13 | import six.moves.urllib as urllib
14 | import json
15 |
16 |
17 | def split_activities(labels, X, exclude_activities, borders=10 * 100):
18 | """
19 | Splits up the data per activity and exclude activity=0.
20 | Also remove borders for each activity.
21 | Returns lists with subdatasets
22 |
23 | Parameters
24 | ----------
25 | labels : numpy array
26 | Activity labels
27 | X : numpy array
28 | Data points
29 | borders : int
30 | Nr of timesteps to remove from the borders of an activity
31 | exclude_activities : list or tuple
32 |         activities to exclude from the data
33 |
34 | Returns
35 | -------
36 | X_list
37 | y_list
38 | """
39 | tot_len = len(labels)
40 | startpoints = np.where([1] + [labels[i] != labels[i - 1]
41 | for i in range(1, tot_len)])[0]
42 | endpoints = np.append(startpoints[1:] - 1, tot_len - 1)
43 | acts = [labels[s] for s, e in zip(startpoints, endpoints)]
44 | # Also split up the data, and only keep the non-zero activities
45 | xysplit = [(X[s + borders:e - borders + 1, :], a)
46 | for s, e, a in zip(startpoints, endpoints, acts)
47 | if a not in exclude_activities and e-borders+1 >= 0 and s+borders < tot_len]
48 | xysplit = [(Xs, y) for Xs, y in xysplit if len(Xs) > 0]
49 | Xlist = [Xs for Xs, y in xysplit]
50 | ylist = [y for X, y in xysplit]
51 | return Xlist, ylist
52 |
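# Usage sketch (assumption, for illustration only): 3000 timesteps of 9-channel
# data where the first 150 timesteps are activity 2 and the rest activity 1.
#   labels = np.ones(3000); labels[:150] = 2
#   X = np.zeros((3000, 9))
#   X_list, y_list = split_activities(labels, X, exclude_activities=[0], borders=50)
#   # -> two segments, with 50-timestep borders removed from each activity block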
53 |
54 | def sliding_window(frame_length, step, Xsampleslist, ysampleslist):
55 | """
56 | Splits time series in ysampleslist and Xsampleslist
57 | into segments by applying a sliding overlapping window
58 | of size equal to frame_length with steps equal to step
59 | it does this for all the samples and appends all the output together.
60 | So, the participant distinction is not kept
61 |
62 | Parameters
63 | ----------
64 | frame_length : int
65 | Length of sliding window
66 | step : int
67 | Stepsize between windows
68 |     Xsampleslist : list
69 |         Samples to take sliding windows from
70 |     ysampleslist : list
71 |         Samples to take sliding windows from
72 | 
73 |     Returns
74 |     -------
75 |     Xsamples, ysamples : lists of window fragments across all samples
76 | """
77 | Xsamples = []
78 | ysamples = []
79 | for j in range(len(Xsampleslist)):
80 | X = Xsampleslist[j]
81 | ybinary = ysampleslist[j]
82 | for i in range(0, X.shape[0] - frame_length, step):
83 | xsub = X[i:i + frame_length, :]
84 | ysub = ybinary
85 | Xsamples.append(xsub)
86 | ysamples.append(ysub)
87 | return Xsamples, ysamples
88 |
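# Usage sketch (assumption, for illustration only): one sample of 1000 timesteps,
# cut into windows of 512 timesteps with a step of 100 timesteps.
#   X_windows, y_windows = sliding_window(frame_length=512, step=100,
#                                         Xsampleslist=[np.zeros((1000, 9))],
#                                         ysampleslist=[np.array([1, 0])])
#   # -> 5 windows of shape (512, 9), each paired with the label [1, 0]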
89 |
90 | def transform_y(y, mapclasses, nr_classes):
91 | """
92 | Transforms y, a list with one sequence of A timesteps
93 | and B unique classes into a binary Numpy matrix of
94 | shape (A, B)
95 |
96 | Parameters
97 | ----------
98 | y : list or array
99 | List of classes
100 | mapclasses : dict
101 | dictionary that maps the classes to numbers
102 | nr_classes : int
103 | total number of classes
104 | """
105 | ymapped = np.array([mapclasses[c] for c in y], dtype='int')
106 | ybinary = to_categorical(ymapped, nr_classes)
107 | return ybinary
108 |
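# Usage sketch (assumption, for illustration only): map raw activity ids to a
# one-hot matrix with 3 classes.
#   mapclasses = {1: 0, 2: 1, 7: 2}
#   transform_y([1, 7, 2, 1], mapclasses, nr_classes=3)
#   # -> array of shape (4, 3) with a single 1 per row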
109 | def get_header():
110 | axes = ['x', 'y', 'z']
111 | IMUsensor_columns = ['temperature'] + \
112 | ['acc_16g_' + i for i in axes] + \
113 | ['acc_6g_' + i for i in axes] + \
114 | ['gyroscope_' + i for i in axes] + \
115 | ['magnometer_' + i for i in axes] + \
116 | ['orientation_' + str(i) for i in range(4)]
117 | header = ["timestamp", "activityID", "heartrate"] + ["hand_" + s
118 | for s in IMUsensor_columns] \
119 | + ["chest_" + s for s in IMUsensor_columns] + ["ankle_" + s
120 | for s in IMUsensor_columns]
121 | return header
122 |
123 | def addheader(datasets):
124 | """
125 | The columns of the pandas data frame are numbers
126 | this function adds the column labels
127 |
128 | Parameters
129 | ----------
130 | datasets : list
131 | List of pandas dataframes
132 | """
133 | header = get_header()
134 | for i in range(0, len(datasets)):
135 | datasets[i].columns = header
136 | return datasets
137 |
138 |
139 | def numpify_and_store(X, y, X_name, y_name, outdatapath, shuffle=False):
140 | """
141 | Converts python lists x 3D and y 1D into numpy arrays
142 | and stores the numpy array in directory outdatapath
143 | shuffle is optional and shuffles the samples
144 |
145 | Parameters
146 | ----------
147 | X : list
148 | list with data
149 | y : list
150 | list with data
151 | X_name : str
152 | name to store the x arrays
153 | y_name : str
154 | name to store the y arrays
155 | outdatapath : str
156 | path to the directory to store the data
157 | shuffle : bool
158 | whether to shuffle the data before storing
159 | """
160 | X = np.array(X)
161 | y = np.array(y)
162 | # Shuffle the train set
163 | if shuffle is True:
164 | np.random.seed(123)
165 | neworder = np.random.permutation(X.shape[0])
166 | X = X[neworder, :, :]
167 | y = y[neworder, :]
168 | # Save binary file
169 | xpath = os.path.join(outdatapath, X_name)
170 | ypath = os.path.join(outdatapath, y_name)
171 | np.save(xpath, X)
172 | np.save(ypath, y)
173 | print('Stored ' + xpath, y_name)
174 |
175 |
176 | def fetch_data(directory_to_extract_to):
177 | """
178 | Fetch the data and extract the contents of the zip file
179 | to the directory_to_extract_to.
180 | First check whether this was done before, if yes, then skip
181 |
182 | Parameters
183 | ----------
184 | directory_to_extract_to : str
185 | directory to create subfolder 'PAMAP2'
186 |
187 | Returns
188 | -------
189 | targetdir: str
190 | directory where the data is extracted
191 | """
192 | targetdir = os.path.join(directory_to_extract_to, "PAMAP2")
193 | if os.path.exists(targetdir):
194 | print('Data previously downloaded and stored in ' + targetdir)
195 | else:
196 | os.makedirs(targetdir) # create target directory
197 | # Download the PAMAP2 data, this is 688 Mb
198 | path_to_zip_file = os.path.join(directory_to_extract_to, 'PAMAP2_Dataset.zip')
199 | test_file_exist = os.path.isfile(path_to_zip_file)
200 | if test_file_exist is False:
201 | url = str('https://archive.ics.uci.edu/ml/' +
202 | 'machine-learning-databases/00231/PAMAP2_Dataset.zip')
203 | # retrieve data from url
204 | local_fn, headers = urllib.request.urlretrieve(url,
205 | filename=path_to_zip_file)
206 | print('Download complete and stored in: ' + path_to_zip_file)
207 | else:
208 | print('The data was previously downloaded and stored in ' +
209 | path_to_zip_file)
210 | # unzip
211 |
212 | with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
213 | zip_ref.extractall(targetdir)
214 | os.remove(path_to_zip_file)
215 | return targetdir
216 |
217 |
218 | def map_class(datasets_filled, exclude_activities):
219 | ysetall = [set(np.array(data.activityID)) - set(exclude_activities)
220 | for data in datasets_filled]
221 | class_ids = list(set.union(*[set(y) for y in ysetall]))
222 | class_labels = [ACTIVITIES_MAP[i] for i in class_ids]
223 | nr_classes = len(class_ids)
224 | mapclasses = {class_ids[i]: i for i in range(len(class_ids))}
225 | return class_labels, nr_classes, mapclasses
226 |
227 |
228 | def split_data(Xlists, ybinarylists, indices):
229 | """ Function takes subset from list given indices
230 |
231 | Parameters
232 | ----------
233 | Xlists: tuple
234 | tuple (samples) of lists (windows) of numpy-arrays (time, variable)
235 |     ybinarylists : list
236 |         list (samples) of numpy-arrays (window, class)
237 |     indices : int or slice
238 |         indices of the slice of data (samples) to be taken
239 |
240 | Returns
241 | -------
242 | x_setlist : list
243 | list (windows across samples) of numpy-arrays (time, variable)
244 | y_setlist: list
245 | list (windows across samples) of numpy-arrays (class, )
246 | """
247 |     tty = str(type(indices))
248 |     # string comparison covers both the python2 ("<type 'slice'>") and
249 |     # python3 ("<class 'slice'>") spellings of the slice type
250 |     if tty == "<type 'slice'>" or tty == "<class 'slice'>":
251 | x_setlist = [X for Xlist in Xlists[indices] for X in Xlist]
252 | y_setlist = [y for ylist in ybinarylists[indices] for y in ylist]
253 | else:
254 | x_setlist = [X for X in Xlists[indices]]
255 | y_setlist = [y for y in ybinarylists[indices]]
256 | return x_setlist, y_setlist
257 |
258 | def split_data_random(X, y, val_size, test_size):
259 | X = np.array(X)
260 | y = np.array(y)
261 | size = len(X)
262 | train_size = size - val_size - test_size
263 | indices = np.random.permutation(size)
264 | X_train = X[indices[:train_size]]
265 | y_train = y[indices[:train_size]]
266 | X_val = X[indices[train_size:train_size+val_size]]
267 | y_val = y[indices[train_size:train_size+val_size]]
268 | X_test = X[indices[train_size+val_size:]]
269 | y_test = y[indices[train_size+val_size:]]
270 | return X_train, y_train, X_val, y_val, X_test, y_test
271 |
272 | def preprocess(targetdir, outdatapath, columns_to_use, exclude_activities, fold,
273 | val_test_size=None):
274 | """ Function to preprocess the PAMAP2 data after it is fetched
275 |
276 | Parameters
277 | ----------
278 | targetdir : str
279 | subdirectory of directory_to_extract_to, targetdir
280 | is defined by function fetch_data
281 | outdatapath : str
282 | a subdirectory of directory_to_extract_to, outdatapath
283 |         is the directory where the Numpy output will be stored.
284 | columns_to_use : list
285 | list of column names to use
286 | exclude_activities : list or tuple
287 |         activities to exclude from the dataset
288 | fold : boolean
289 |         Whether to store each fold separately ('False' creates
290 | Train, Test and Validation sets)
291 |
292 | Returns
293 | -------
294 | None
295 | """
296 | datadir = os.path.join(targetdir, 'PAMAP2_Dataset', 'Protocol')
297 | filenames = listdir(datadir)
298 | filenames.sort()
299 | print('Start pre-processing all ' + str(len(filenames)) + ' files...')
300 | # load the files and put them in a list of pandas dataframes:
301 | datasets = [pd.read_csv(os.path.join(datadir, fn), header=None, sep=' ')
302 | for fn in filenames]
303 | datasets = addheader(datasets) # add headers to the datasets
304 | # Interpolate dataset to get same sample rate between channels
305 | datasets_filled = [d.interpolate() for d in datasets]
306 | # Create mapping for class labels
307 | class_labels, nr_classes, mapclasses = map_class(datasets_filled, exclude_activities)
308 | # Save class labels
309 | with open(os.path.join(outdatapath, 'labels.json'), 'w') as fp:
310 | json.dump(class_labels, fp)
311 | # Create input (x) and output (y) sets
312 | xall = [np.array(data[columns_to_use]) for data in datasets_filled]
313 | yall = [np.array(data.activityID) for data in datasets_filled]
314 | xylists = [split_activities(y, x, exclude_activities) for x, y in zip(xall, yall)]
315 | Xlists, ylists = zip(*xylists)
316 | ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]
317 |     frame_length = int(5.12 * 100)  # 5.12 s windows at 100 Hz = 512 samples
318 |     step = 1 * 100  # slide the window by 1 s (100 samples)
319 | if not fold:
320 | if val_test_size is None:
321 | # Split in train, test and val
322 | x_vallist, y_vallist = split_data(Xlists, ybinarylists, indices=6)
323 | test_range = slice(7, len(datasets_filled))
324 | x_testlist, y_testlist = split_data(Xlists, ybinarylists, test_range)
325 | x_trainlist, y_trainlist = split_data(Xlists, ybinarylists,
326 | indices=slice(0, 6))
327 | # Take sliding-window frames, target is label of last time step,
328 | # and store as numpy file
329 | x_train, y_train = sliding_window(frame_length, step, x_trainlist,
330 | y_trainlist)
331 | x_val, y_val = sliding_window(frame_length, step, x_vallist,
332 | y_vallist)
333 | x_test, y_test = sliding_window(frame_length, step, x_testlist,
334 | y_testlist)
335 |
336 | else:
337 | val_size, test_size = val_test_size
338 | X_list, y_list = split_data(Xlists, ybinarylists,
339 | slice(0, len(datasets_filled)))
340 | X, y = sliding_window(frame_length, step, X_list,
341 | y_list)
342 | x_train, y_train, x_val, y_val, x_test, y_test = split_data_random(X, y, val_size, test_size)
343 |
344 | numpify_and_store(x_train, y_train, X_name='X_train', y_name='y_train',
345 | outdatapath=outdatapath, shuffle=True)
346 | numpify_and_store(x_val, y_val, X_name='X_val', y_name='y_val',
347 | outdatapath=outdatapath, shuffle=False)
348 | numpify_and_store(x_test, y_test, X_name='X_test', y_name='y_test',
349 | outdatapath=outdatapath, shuffle=False)
350 | else:
351 | for i in range(len(Xlists)):
352 | X_i, y_i = split_data(Xlists, ybinarylists, i)
353 | X, y = sliding_window(frame_length, step, X_i, y_i)
354 | numpify_and_store(X, y, X_name='X_'+str(i), y_name='y_'+str(i),
355 | outdatapath=outdatapath, shuffle=True)
356 |
357 |
358 |     print('Processed data successfully stored in ' + outdatapath)
359 | return None
360 |
361 |
362 | def fetch_and_preprocess(directory_to_extract_to,
363 | columns_to_use=None,
364 | output_dir='preprocessed',
365 | exclude_activities=[0],
366 | fold=False,
367 | val_test_size=None):
368 | """High level function to fetch_and_preprocess the PAMAP2 dataset.
369 |
370 | Parameters
371 | ----------
372 | directory_to_extract_to : str
373 | the directory where the data will be stored
374 | columns_to_use : list
375 | the columns to use
376 |     output_dir : str
377 |         name of the directory to write the output data to
378 |     exclude_activities : list or tuple
379 |         activities to exclude from the dataset
380 |     fold : boolean
381 |         Whether to store each fold separately ('False' creates
382 | Train, Test and Validation sets)
383 |
384 | Returns
385 | -------
386 | outdatapath: str
387 | The directory in which the numpy files are stored
388 | """
389 | if columns_to_use is None:
390 | columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
391 | 'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',
392 | 'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z']
393 | targetdir = fetch_data(directory_to_extract_to)
394 | outdatapath = os.path.join(targetdir, output_dir)
395 | if not os.path.exists(outdatapath):
396 | os.makedirs(outdatapath)
397 | if os.path.isfile(os.path.join(outdatapath, 'X_train.npy')):
398 | print('Data previously pre-processed and np-files saved to ' +
399 | outdatapath)
400 | else:
401 | preprocess(targetdir, outdatapath, columns_to_use, exclude_activities, fold, val_test_size)
402 | return outdatapath
403 |
404 |
405 | def load_data(outputpath):
406 | """Load the numpy data as stored in directory outputpath.
407 |
408 | Parameters
409 | ----------
410 | outputpath : str
411 | directory where the numpy files are stored
412 |
413 | Returns
414 | -------
415 | x_train
416 | y_train_binary
417 | x_val
418 | y_val_binary
419 | x_test
420 | y_test_binary
421 | """
422 | ext = '.npy'
423 | x_train = np.load(os.path.join(outputpath, 'X_train' + ext))
424 | y_train_binary = np.load(os.path.join(outputpath, 'y_train' + ext))
425 | x_val = np.load(os.path.join(outputpath, 'X_val' + ext))
426 | y_val_binary = np.load(os.path.join(outputpath, 'y_val' + ext))
427 | x_test = np.load(os.path.join(outputpath, 'X_test' + ext))
428 | y_test_binary = np.load(os.path.join(outputpath, 'y_test' + ext))
429 | with open(os.path.join(outputpath, 'labels.json'), 'r') as fn:
430 | labels = json.load(fn)
431 | return x_train, y_train_binary, x_val, y_val_binary, \
432 | x_test, y_test_binary, labels
433 |
434 |
435 | def download_preprocessed_data(directory_to_extract_to):
436 | """Load already preprocessed data from zenodo.
437 |
438 | Args:
439 | ----
440 | directory_to_extract_to: str
441 | Define directory to extract dataset to (if not yet present).
442 | """
443 | data_path = os.path.join(directory_to_extract_to,
444 | 'data', 'PAMAP2', 'preprocessed')
445 |
446 | if not os.path.isdir(data_path):
447 | path_to_zip_file = os.path.join(directory_to_extract_to, 'data.zip')
448 |
449 | # Download zip file with data
450 | if not os.path.isfile(path_to_zip_file):
451 | print("Downloading data...")
452 | local_fn, headers = urllib.request.urlretrieve(
453 | 'https://zenodo.org/record/834467/files/data03.zip',
454 | filename=path_to_zip_file)
455 | else:
456 | print("Data already downloaded")
457 | # Extract the zip file
458 | with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
459 | print("Extracting data...")
460 | zip_ref.extractall(directory_to_extract_to)
461 | os.rename(os.path.join(directory_to_extract_to, 'data03'),
462 | os.path.join(directory_to_extract_to, 'data'))
463 | print("Done")
464 | else:
465 | print("Data already downloaded and extracted.")
466 |
467 | return data_path
468 |
469 |
470 | ACTIVITIES_MAP = {
471 | 0: 'no_activity',
472 | 1: 'lying',
473 | 2: 'sitting',
474 | 3: 'standing',
475 | 4: 'walking',
476 | 5: 'running',
477 | 6: 'cycling',
478 | 7: 'nordic_walking',
479 | 9: 'watching_tv',
480 | 10: 'computer_work',
481 | 11: 'car_driving',
482 | 12: 'ascending_stairs',
483 | 13: 'descending_stairs',
484 | 16: 'vaccuum_cleaning',
485 | 17: 'ironing',
486 | 18: 'folding_laundry',
487 | 19: 'house_cleaning',
488 | 20: 'playing_soccer',
489 | 24: 'rope_jumping'
490 | }
491 |
--------------------------------------------------------------------------------
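
A minimal usage sketch for the module above; the import path utils.tutorial_pamap2 and the './data' target directory are illustrative assumptions, not part of the original file:

    from utils import tutorial_pamap2

    # Download the raw PAMAP2 archive if needed, cut it into 5.12 s sliding
    # windows and store X_train / X_val / X_test plus labels.json as numpy files.
    outdatapath = tutorial_pamap2.fetch_and_preprocess('./data')

    # Load the stored arrays back into memory.
    x_train, y_train_binary, x_val, y_val_binary, x_test, y_test_binary, labels = \
        tutorial_pamap2.load_data(outdatapath)
    print(x_train.shape, len(labels))
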
/utils/tutorial_racketsports.py:
--------------------------------------------------------------------------------
1 | """Data precprocessing or loading for RacketSports dataset.
2 |
3 | Summary:
4 | Contains script to preprocess RacketSports dataset and function to load the
5 | already preprocessed dataset.
6 |
7 | This dataset is rather simple, which makes it well suited for quick training
8 | of mcfly models.
9 | """
10 | import numpy as np
11 | import os.path
12 | import zipfile
13 | import six.moves.urllib as urllib
14 |
15 |
16 | def download_preprocessed_data(directory_to_extract_to):
17 | """Load already preprocessed data from zenodo.
18 |
19 | Args:
20 | ----
21 | directory_to_extract_to: str
22 | Define directory to extract dataset to (if not yet present).
23 | """
24 | data_path = os.path.join(directory_to_extract_to,
25 | 'RacketSports', 'preprocessed')
26 |
27 | if not os.path.isdir(data_path):
28 | path_to_zip_file = os.path.join(directory_to_extract_to, 'RacketSports.zip')
29 |
30 | # Download zip file with data
31 | if not os.path.isfile(path_to_zip_file):
32 | print("Downloading data...")
33 | local_fn, headers = urllib.request.urlretrieve(
34 | 'https://zenodo.org/record/3743603/files/RacketSports.zip',
35 | filename=path_to_zip_file)
36 | else:
37 | print("Data already downloaded")
38 | # Extract the zip file
39 | with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
40 | print("Extracting data...")
41 | zip_ref.extractall(directory_to_extract_to)
42 | print("Done")
43 | else:
44 | print("Data already downloaded and extracted.")
45 |
46 | return data_path
47 |
48 |
49 | def fetch_and_preprocess(directory_to_extract_to,
50 | output_dir='preprocessed'):
51 | """High level function to fetch_and_preprocess the RacketSports dataset.
52 |
53 | Parameters
54 | ----------
55 | directory_to_extract_to : str
56 | the directory where the data will be stored
57 |     output_dir : str
58 |         name of the directory to write the output data to
59 |
60 | Returns
61 | -------
62 | outdatapath: str
63 | The directory in which the numpy files are stored
64 | """
65 | targetdir = fetch_data(directory_to_extract_to)
66 | outdatapath = os.path.join(targetdir, output_dir)
67 | if not os.path.exists(outdatapath):
68 | os.makedirs(outdatapath)
69 | if os.path.isfile(os.path.join(outdatapath, 'X_train.npy')):
70 | print('Data previously pre-processed and np-files saved to ' +
71 | outdatapath)
72 | else:
73 | preprocess(targetdir, outdatapath)
74 | return outdatapath
75 |
76 |
77 | def preprocess(targetdir, outdatapath):
78 | """ Function to preprocess the RacketSports data after it is fetched
79 |
80 | Parameters
81 | ----------
82 | targetdir : str
83 | subdirectory of directory_to_extract_to, targetdir
84 | is defined by function fetch_data
85 | outdatapath : str
86 | a subdirectory of directory_to_extract_to, outdatapath
87 |         is the directory where the Numpy output will be stored.
88 |
89 | Returns
90 | -------
91 | None
92 | """
93 |     datadir = targetdir  # the arff files sit directly in the extracted folder
94 | filenames = os.listdir(datadir)
95 | filenames.sort()
96 | print('Start pre-processing all ' + str(len(filenames)) + ' files...')
97 |
98 |     # Load and split the data
99 | file_train = os.path.join(datadir, 'RacketSports_TRAIN.arff')
100 | file_test = os.path.join(datadir, 'RacketSports_TEST.arff')
101 | X_train, y_train = load_racket_arff(file_train)
102 | X_test, X_val, y_test, y_val = load_and_split(file_test, random_seed=1)
103 |
104 | store_data(X_train, y_train, X_name='X_train', y_name='y_train',
105 | outdatapath=outdatapath, shuffle=True)
106 | store_data(X_val, y_val, X_name='X_val', y_name='y_val',
107 | outdatapath=outdatapath, shuffle=False)
108 | store_data(X_test, y_test, X_name='X_test', y_name='y_test',
109 | outdatapath=outdatapath, shuffle=False)
110 |
111 |     print('Processed data successfully stored in ' + outdatapath)
112 | return None
113 |
114 |
115 | def fetch_data(directory_to_extract_to):
116 | """
117 | Fetch the data and extract the contents of the zip file
118 | to the directory_to_extract_to.
119 |     First check whether this was done before; if so, skip the download.
120 |
121 | Parameters
122 | ----------
123 | directory_to_extract_to : str
124 |         directory in which to create the subfolder 'RacketSports'
125 |
126 | Returns
127 | -------
128 | targetdir: str
129 | directory where the data is extracted
130 | """
131 | targetdir = os.path.join(directory_to_extract_to, "RacketSports")
132 | if os.path.exists(targetdir):
133 | print('Data previously downloaded and stored in ' + targetdir)
134 | else:
135 | os.makedirs(targetdir) # create target directory
136 |         # Download the RacketSports data
137 | path_to_zip_file = os.path.join(directory_to_extract_to, 'RacketSports.zip')
138 | test_file_exist = os.path.isfile(path_to_zip_file)
139 | if test_file_exist is False:
140 | url = str('http://www.timeseriesclassification.com/' +
141 | 'Downloads/RacketSports.zip')
142 | # retrieve data from url
143 | local_fn, headers = urllib.request.urlretrieve(url,
144 | filename=path_to_zip_file)
145 | print('Download complete and stored in: ' + path_to_zip_file)
146 | else:
147 | print('The data was previously downloaded and stored in ' +
148 | path_to_zip_file)
149 | # unzip
150 |
151 | with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
152 | zip_ref.extractall(targetdir)
153 | os.remove(path_to_zip_file)
154 | return targetdir
155 |
156 |
157 | def load_racket_arff(filename):
158 | """Load data from arff file."""
159 | start = 0
160 | data = []
161 | labels = []
162 | with open(filename) as fp:
163 | line = fp.readline()
164 | count = 0
165 | while line:
166 | if start == 1:
167 |                 lines = line.split('\\n')  # dimensions are separated by a literal two-character '\n'
168 | data_line = []
169 | for l in lines:
170 | data_line_sub = []
171 | for entry in l.split(','):
172 | if entry.startswith('B') or entry.startswith('S'):
173 | labels.append(entry.replace("'", "").replace('\n', ''))
174 | else:
175 | data_line_sub.append(float(entry.replace("'", "")))
176 | data_line.append(data_line_sub)
177 | data.append(data_line)
178 |
179 | if line.startswith('@data'):
180 | start = 1
181 |
182 | line = fp.readline()
183 | count += 1
184 |
185 | return np.swapaxes(np.array(data), 1, 2), labels
186 |
187 |
188 | def load_and_split(file_test, random_seed=1):
189 | """Load data and split into train, test, validation."""
190 | # Load data from arff files
191 | X_test0, y_test0 = load_racket_arff(file_test)
192 |
193 | # Split dataset
194 | np.random.seed(random_seed)
195 | y_val = []
196 | y_test = []
197 | IDs_val = []
198 | IDs_test = []
199 |
200 | for label in list(set(y_test0)):
201 | idx = np.where(np.array(y_test0) == label)[0]
202 | idx1 = np.random.choice(idx, len(idx)//2, replace=False)
203 | idx2 = list(set(idx) - set(idx1))
204 | IDs_val.extend(idx1)
205 | IDs_test.extend(idx2)
206 | y_val.extend(len(idx1) * [label])
207 | y_test.extend(len(idx2) * [label])
208 |
209 | print(label, y_test0.count(label))
210 |
211 | X_test = X_test0[IDs_test, :, :]
212 | X_val = X_test0[IDs_val, :, :]
213 | return X_test, X_val, y_test, y_val
214 |
215 |
216 | def store_data(X, y, X_name, y_name, outdatapath, shuffle=False):
217 | """
218 |     Convert the python lists X (3D) and y (1D) into numpy arrays
219 |     and store them in the directory outdatapath.
220 |     Shuffling the samples before storing is optional.
221 |
222 | Parameters
223 | ----------
224 | X : list
225 | list with data
226 | y : list
227 | list with data
228 | X_name : str
229 | name to store the x arrays
230 | y_name : str
231 | name to store the y arrays
232 | outdatapath : str
233 | path to the directory to store the data
234 | shuffle : bool
235 | whether to shuffle the data before storing
236 | """
237 | X = np.array(X)
238 | y = np.array(y)
239 | # Shuffle the train set
240 | if shuffle:
241 | np.random.seed(123)
242 | neworder = np.random.permutation(X.shape[0])
243 | X = X[neworder, :, :]
244 | y = y[neworder]
245 | # Save binary file
246 | xpath = os.path.join(outdatapath, X_name)
247 | ypath = os.path.join(outdatapath, y_name)
248 | np.save(xpath, X)
249 | np.save(ypath, y)
250 |     print('Stored ' + xpath + ' and ' + ypath)
251 |
252 |
253 | def load_data(outputpath):
254 | """Load the numpy data as stored in directory outputpath.
255 |
256 | Parameters
257 | ----------
258 | outputpath : str
259 | directory where the numpy files are stored
260 |
261 | Returns
262 | -------
263 | x_train
264 | y_train_binary
265 | x_val
266 | y_val_binary
267 | x_test
268 | y_test_binary
269 | """
270 | ext = '.npy'
271 | X_train = np.load(os.path.join(outputpath, 'X_train' + ext))
272 | y_train = np.load(os.path.join(outputpath, 'y_train' + ext))
273 | X_val = np.load(os.path.join(outputpath, 'X_val' + ext))
274 | y_val = np.load(os.path.join(outputpath, 'y_val' + ext))
275 | X_test = np.load(os.path.join(outputpath, 'X_test' + ext))
276 | y_test = np.load(os.path.join(outputpath, 'y_test' + ext))
277 | return X_train, y_train, X_val, y_val, X_test, y_test
278 |
--------------------------------------------------------------------------------
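
A minimal usage sketch for tutorial_racketsports along the same lines; the import path utils.tutorial_racketsports and the './data' directory are again illustrative assumptions:

    from utils import tutorial_racketsports

    # Download the raw arff files if needed, parse and split them, and store
    # the result as numpy files.
    outdatapath = tutorial_racketsports.fetch_and_preprocess('./data')

    # Load the stored arrays back into memory.
    X_train, y_train, X_val, y_val, X_test, y_test = tutorial_racketsports.load_data(outdatapath)
    print(X_train.shape, set(y_train))

Alternatively, download_preprocessed_data('./data') fetches the already preprocessed numpy files from Zenodo and returns the path they were extracted to.
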
/utils/tutorial_vu.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | import os
4 | import os.path
5 | import numpy as np
6 | import scipy.io
7 | import xlrd
8 |
9 | import logging
10 |
11 | logging.basicConfig(level=logging.INFO, format='%(message)s')
12 | logger = logging.getLogger(__name__)
13 |
14 | ONE_TIME_FALL_DATASET = 'one' # Value to pass to load the one time fallers + controls data set
15 | MULTI_TIME_FALL_DATASET = 'multi' # Value to pass to load the multi time fallers + controls data set
16 |
17 |
18 | class DataLoader:
19 | base_path = './Data espen/'
20 | train_fraction = 0.80 # fraction of subjects used for train set. number of segments per subject is variable.
21 | validation_fraction = 0.10 # fraction of subjects used for validation. number of segments per subject is variable.
22 |
23 | CONTROL_LABEL = 0
24 | ONE_TIME_FALLER_LABEL = 1
25 | MULTI_TIME_FALLER_LABEL = 2
26 |
27 | def load(self, dataset_selection=ONE_TIME_FALL_DATASET):
28 | """
29 | Gets subject ids from excel file and loads acc and vel data from mat file for each subject. Return a train,
30 | validation and test set. Each set consists of the data X and the label y.
31 | :param dataset_selection:
32 | Determines whether datasets from the single time fallers and their controls or the multi time fallers with
33 | their controls should be loaded. Can be either 'one' or 'multi'.
34 | :return: train_X, train_y, validation_X,
35 | validation_y, test_X, test_y
36 | """
37 | multi_time_fallers, multi_time_fallers_controls, one_time_fallers, one_time_fallers_controls = self.read_ids_from_excel()
38 | logger.debug('')
39 |
40 | if dataset_selection == ONE_TIME_FALL_DATASET:
41 | train_X, train_y, validation_X, validation_y, test_X, test_y = self.get_split_shuffled_data_set(
42 | self.ONE_TIME_FALLER_LABEL, self.CONTROL_LABEL, one_time_fallers, one_time_fallers_controls)
43 | elif dataset_selection == MULTI_TIME_FALL_DATASET:
44 | train_X, train_y, validation_X, validation_y, test_X, test_y = self.get_split_shuffled_data_set(
45 | self.MULTI_TIME_FALLER_LABEL, self.CONTROL_LABEL, multi_time_fallers, multi_time_fallers_controls)
46 |
47 | logger.info('Loaded train samples with shape {} and train labels with shape {}.'
48 | .format(train_X.shape, train_y.shape))
49 |         logger.info('Loaded validation samples with shape {} and validation labels with shape {}.'
50 | .format(validation_X.shape, validation_y.shape))
51 | logger.info('Loaded test samples with shape {} and test labels with shape {}.'
52 | .format(test_X.shape, test_y.shape))
53 | logger.info('Of {} instances loaded, {}% is used for training, {}% for validation, {}% for testing.'
54 | .format(len(train_y) + len(test_y) + len(validation_y),
55 | np.round(100.0 * len(train_y) / (len(train_y) + len(test_y) + len(validation_y)), 1),
56 | np.round(100.0 * len(validation_y) / (len(train_y) + len(test_y) + len(validation_y)), 1),
57 | np.round(100.0 * len(test_y) / (len(train_y) + len(test_y) + len(validation_y)), 1)))
58 |
59 | return train_X, train_y, validation_X, validation_y, test_X, test_y
60 |
61 | def read_ids_from_excel(self):
62 | sheet = xlrd.open_workbook(os.path.join(self.base_path, 'File_number_Fall_class.xlsx')).sheet_by_index(0)
63 | one_time_fallers = self.get_ids_from_column(1, sheet)
64 | one_time_fallers_controls = self.get_ids_from_column(3, sheet)
65 | multi_time_fallers = self.get_ids_from_column(6, sheet)
66 | multi_time_fallers_controls = self.get_ids_from_column(8, sheet)
67 | return multi_time_fallers, multi_time_fallers_controls, one_time_fallers, one_time_fallers_controls
68 |
69 | def get_ids_from_column(self, column, sheet):
70 | return list(
71 | [int(sheet.cell_value(i, column)) for i in range(2, sheet.nrows) if sheet.cell_value(i, column) != ''])
72 |
73 | def get_split_shuffled_data_set(self, label, control_label, fallers, controls):
74 | indices = list(range(len(fallers)))
75 | np.random.shuffle(indices)
76 |
77 | n_train_instances = int(self.train_fraction * len(indices))
78 | n_validation_instances = int(self.validation_fraction * len(indices))
79 | logger.info('Loading training data.')
80 | train_X, train_y = self.get_data_set(fallers,
81 | controls,
82 | indices[:n_train_instances],
83 | label,
84 | control_label)
85 | logger.info('Loading validation data.')
86 | validation_X, validation_y = self.get_data_set(fallers,
87 | controls,
88 | indices[
89 | n_train_instances:n_train_instances + n_validation_instances],
90 | label,
91 | control_label)
92 | logger.info('Loading test data.')
93 | test_X, test_y = self.get_data_set(fallers,
94 | controls,
95 | indices[n_train_instances + n_validation_instances:],
96 | label,
97 | control_label)
98 | return train_X, train_y, validation_X, validation_y, test_X, test_y
99 |
100 | def get_data_set(self, fallers, controls, indices, label, control_label):
101 | train_instance_sets = []
102 | train_label_sets = []
103 | for index in indices:
104 | fall_id = fallers[index]
105 | fall_X, fall_y = self.get_user_data_and_labels_for_id(fall_id, label)
106 | train_instance_sets.append(fall_X)
107 | train_label_sets.append(fall_y)
108 |
109 | control_id = controls[index]
110 | control_X, control_y = self.get_user_data_and_labels_for_id(control_id, control_label)
111 | train_instance_sets.append(control_X)
112 | train_label_sets.append(control_y)
113 | train_set = np.concatenate(train_instance_sets, axis=0)
114 | train_labels = np.concatenate(train_label_sets)
115 | return train_set, train_labels
116 |
117 | def get_user_data_and_labels_for_id(self, id, label):
118 | filename = 'Acc_Vel_gait_30sec_{}.mat'.format(id)
119 | logger.info('Processing file {}'.format(filename))
120 | user_data = self.load_user_data(filename)
121 | user_labels = [label for _ in user_data]
122 | return user_data, user_labels
123 |
124 | def load_user_data(self, filename):
125 | path = os.path.join(self.base_path, filename)
126 | data = scipy.io.loadmat(path)
127 | acc = np.array([data['Acc_gait_30sec'][0][i] for i in range(len(data['Acc_gait_30sec'][0]))])
128 | vel = np.array([data['Vel_gait_30sec'][0][i] for i in range(len(data['Vel_gait_30sec'][0]))])
129 | userdata = np.concatenate((acc, vel), axis=2)
130 | return userdata
131 |
132 |
133 | def load_one_time_fall_dataset():
134 | """
135 |     Loads a dataset containing the one time fallers and their matched controls. Fallers are distributed over train,
136 |     validation and test set. Controls are kept in the same set as their matched subjects. All segments of a specific
137 |     subject, control or faller, end up in the same set. Gets subject ids from excel file and loads acc and vel data
138 | from mat file for each subject. Return a train, validation and test set. Each set consists of the data X and the
139 | label y.
140 | :return: train_X, train_y, validation_X, validation_y, test_X, test_y
141 | """
142 | return DataLoader().load(dataset_selection=ONE_TIME_FALL_DATASET)
143 |
144 |
145 | def load_multi_time_fall_dataset():
146 | """
147 |     Loads a dataset containing the multiple time fallers and their matched controls. Fallers are distributed over
148 |     train, validation and test set. Controls are kept in the same set as their matched subjects. All segments of a
149 |     specific subject, control or faller, end up in the same set. Gets subject ids from excel file and loads acc and
150 | vel data from mat file for each subject. Return a train, validation and test set. Each set consists of the data X
151 | and the label y.
152 | :return: train_X, train_y, validation_X, validation_y, test_X, test_y
153 | """
154 | return DataLoader().load(dataset_selection=MULTI_TIME_FALL_DATASET)
155 |
--------------------------------------------------------------------------------
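
A minimal usage sketch for tutorial_vu; it assumes the proprietary './Data espen/' folder referenced by DataLoader.base_path is present, which this sketch cannot guarantee:

    from utils import tutorial_vu

    # One-time fallers and their matched controls, split by subject into
    # train / validation / test sets (labels: 1 = faller, 0 = control).
    train_X, train_y, val_X, val_y, test_X, test_y = tutorial_vu.load_one_time_fall_dataset()
    print(train_X.shape, val_X.shape, test_X.shape)
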
/utils/tutorial_weather.py:
--------------------------------------------------------------------------------
1 | import typing
2 | import urllib
3 | from pathlib import Path
4 |
5 | from urllib.request import urlretrieve
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 |
9 |
10 | def load_data(path: str = '.'):
11 | """
12 |     Load the weather dataset (DOI: 10.5281/zenodo.4770936). If it is not present at the specified path, it will be downloaded.
13 | Parameters
14 | ----------
15 | path : str
16 | The local path to the data set folder.
17 |
18 | Returns
19 | -------
20 | X_train
21 | X_test
22 | y_train
23 | y_test
24 | """
25 | data_path = download_preprocessed_data(path)
26 | data = pd.read_csv(data_path)
27 |     nr_rows = 365 * 3  # use roughly three years of daily observations
28 | X_data = data.loc[:nr_rows].drop(columns=['DATE', 'MONTH'])
29 |
30 |     days_ahead = 1  # predict the target one day ahead
31 | y_data = data.loc[days_ahead:(nr_rows + days_ahead)]["MAASTRICHT_sunshine"]
32 | X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=0)
33 |
34 | return X_train, X_test, y_train, y_test
35 |
36 |
37 | def download_preprocessed_data(directory_to_extract_to: typing.Union[str, Path]):
38 | data_path = Path(directory_to_extract_to) / 'weather'
39 |     data_path.mkdir(parents=True, exist_ok=True)
40 | data_set_light_path = data_path / 'weather_prediction_dataset_light.csv'
41 | if not data_set_light_path.exists():
42 | _, _ = urllib.request.urlretrieve(
43 | 'https://zenodo.org/record/7053722/files/weather_prediction_dataset_light.csv',
44 | filename=data_set_light_path)
45 | return data_set_light_path
46 |
--------------------------------------------------------------------------------
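
A minimal usage sketch for tutorial_weather; the import path utils.tutorial_weather and the './data' directory are illustrative assumptions:

    from utils import tutorial_weather

    # Downloads the light weather CSV from Zenodo on first use, then returns a
    # random 70/30 train/test split for predicting next-day sunshine in Maastricht.
    X_train, X_test, y_train, y_test = tutorial_weather.load_data('./data')
    print(X_train.shape, y_train.head())
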