├── src └── pyphm │ ├── __init__.py │ └── datasets │ ├── __init__.py │ ├── auxilary_metadata │ ├── __init__.py │ ├── milling_labels_with_tool_class.csv │ └── airbus_dfvalid_groundtruth.csv │ ├── pyphm.py │ ├── airbus.py │ ├── ims.py │ ├── milling.py │ └── utils.py ├── tests ├── integration │ ├── __init__.py │ ├── fixtures │ │ ├── milling │ │ │ ├── mill.mat │ │ │ ├── milling_truncated_results.csv.gz │ │ │ └── milling_labels_with_tool_class_truncated.csv │ │ └── ims │ │ │ ├── ims_truncated_results.csv.gz │ │ │ └── 1st_test │ │ │ ├── 2003.10.22.12.06.24 │ │ │ └── 2003.10.22.12.09.13 │ ├── test_integration_ims.py │ └── test_integration_milling.py └── conftest.py ├── notebooks ├── scratch │ ├── test._mill.ipynb │ ├── test.py │ ├── get_hash.ipynb │ ├── import_package_resources.ipynb │ ├── milling_examp.ipynb │ ├── test.ipynb │ ├── airbus_download.ipynb │ └── ims_download.ipynb └── images │ ├── logo.png │ ├── vae.png │ ├── cut_signals.png │ ├── flank_wear.png │ ├── thresholds.png │ ├── violin_plot.png │ ├── face_milling.png │ ├── simple_trend.png │ ├── latent_space_cnc.png │ ├── trend_spash_image.png │ ├── vae_training_step3.jpg │ ├── vae_training_random_search.png │ ├── prauc_params_cnc.svg │ ├── logo.svg │ └── prauc_cnc.svg ├── .gitattributes ├── requirements.txt ├── setup.py ├── env_pyphm.yml ├── .github └── workflows │ └── main.yml ├── pyproject.toml ├── LICENSE ├── setup.cfg ├── README.md ├── .gitignore └── references └── sources.bib /src/pyphm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pyphm/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/scratch/test._mill.ipynb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pyphm/datasets/auxilary_metadata/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-documentation 2 | -------------------------------------------------------------------------------- /notebooks/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/logo.png -------------------------------------------------------------------------------- /notebooks/images/vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/vae.png -------------------------------------------------------------------------------- /notebooks/images/cut_signals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/cut_signals.png -------------------------------------------------------------------------------- /notebooks/images/flank_wear.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/flank_wear.png -------------------------------------------------------------------------------- /notebooks/images/thresholds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/thresholds.png -------------------------------------------------------------------------------- /notebooks/images/violin_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/violin_plot.png -------------------------------------------------------------------------------- /notebooks/images/face_milling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/face_milling.png -------------------------------------------------------------------------------- /notebooks/images/simple_trend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/simple_trend.png -------------------------------------------------------------------------------- /notebooks/images/latent_space_cnc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/latent_space_cnc.png -------------------------------------------------------------------------------- /notebooks/images/trend_spash_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/trend_spash_image.png -------------------------------------------------------------------------------- /notebooks/images/vae_training_step3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/vae_training_step3.jpg -------------------------------------------------------------------------------- /tests/integration/fixtures/milling/mill.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/tests/integration/fixtures/milling/mill.mat -------------------------------------------------------------------------------- /notebooks/images/vae_training_random_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/notebooks/images/vae_training_random_search.png -------------------------------------------------------------------------------- /tests/integration/fixtures/ims/ims_truncated_results.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/tests/integration/fixtures/ims/ims_truncated_results.csv.gz -------------------------------------------------------------------------------- /tests/integration/fixtures/milling/milling_truncated_results.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvhahn/PyPHM/HEAD/tests/integration/fixtures/milling/milling_truncated_results.csv.gz 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # local package 2 | -e . 3 | 4 | # external requirements 5 | pandas 6 | numpy 7 | py7zr 8 | rarfile 9 | tqdm 10 | scipy 11 | requests 12 | h5py 13 | tables 14 | gdown -------------------------------------------------------------------------------- /tests/integration/fixtures/milling/milling_labels_with_tool_class_truncated.csv: -------------------------------------------------------------------------------- 1 | case,run,VB,time,DOC,feed,material,cut_no,tool_class,window_start,window_end 2 | 1,1,0,2,1.5,0.5,1,0,0,2496,6976 3 | 1,2,,4,1.5,0.5,1,1,0,2496,6976 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | if __name__ == "__main__": 4 | setuptools.setup() 5 | 6 | # from setuptools import setup, find_packages 7 | 8 | # setup( 9 | # name="pyphm", 10 | # version="0.1.0", 11 | # packages=find_packages(), 12 | # ) 13 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dummy conftest.py for pyphm. 3 | 4 | If you don't know what this is for, just leave it empty. 5 | Read more about conftest.py under: 6 | - https://docs.pytest.org/en/stable/fixture.html 7 | - https://docs.pytest.org/en/stable/writing_plugins.html 8 | """ 9 | 10 | # import pytest 11 | -------------------------------------------------------------------------------- /env_pyphm.yml: -------------------------------------------------------------------------------- 1 | name: pyphm 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.11 6 | - conda 7 | - mamba 8 | - jupyterlab 9 | - ipywidgets 10 | - scipy 11 | - matplotlib 12 | - seaborn 13 | - pandas 14 | - scikit-learn 15 | - py7zr 16 | - rarfile 17 | - pytables 18 | - requests 19 | - gdown=4.6.0 20 | 21 | 22 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI/CD 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | strategy: 16 | matrix: 17 | python-version: [3.7, 3.8, 3.9] 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | - name: Set up Python all python versions 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | architecture: x64 27 | 28 | - name: Install dependencies 29 | run: pip install -r requirements.txt 30 | 31 | - name: Run Tests 32 | run: python -m unittest discover -s tests -------------------------------------------------------------------------------- /src/pyphm/datasets/pyphm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from pathlib import Path 4 | from typing import Any, Callable, List, Optional, Tuple 5 | from .utils import download_and_extract_archive, extract_archive, check_integrity 6 | 7 | 8 | class PHMDataset: 9 | """ 10 | Base class for making PyPHM data sets. 11 | 12 | Args: 13 | root (string): Root directory to place all the data sets. 
14 | 15 | dataset_folder_name (string): Name of folder containing raw data. 16 | This folder will be created in the root directory if not present. 17 | 18 | """ 19 | 20 | def __init__( 21 | self, 22 | root: Path, 23 | dataset_folder_name: str, 24 | ) -> None: 25 | 26 | self.root = Path(root) 27 | self.dataset_folder_name = dataset_folder_name 28 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pyphm" 3 | version = "0.0.5" 4 | description = "Machinery data, made easy" 5 | requires-python = ">=3.6" 6 | readme = "README.md" 7 | authors = [ 8 | { name = "Tim von Hahn", email = "t.vonhahn@queensu.ca" }, 9 | ] 10 | classifiers = [ 11 | "Programming Language :: Python :: 3", 12 | "License :: OSI Approved :: MIT License", 13 | "Operating System :: OS Independent", 14 | ] 15 | dependencies = [ 16 | "pandas", 17 | "numpy", 18 | "py7zr", 19 | "rarfile", 20 | "tqdm", 21 | "scipy", 22 | "requests", 23 | "h5py", 24 | "tables", 25 | "gdown" 26 | ] 27 | 28 | [project.urls] 29 | Homepage = "https://github.com/tvhahn/PyPHM" 30 | Repository = "https://github.com/tvhahn/PyPHM" 31 | Documentation = "https://github.com/tvhahn/PyPHM" 32 | 33 | [project.optional-dependencies] 34 | doc = ["sphinx~=4.4.0", "myst-parser"] 35 | -------------------------------------------------------------------------------- /notebooks/scratch/test.py: -------------------------------------------------------------------------------- 1 | resources = [ 2 | { 3 | "name": "aws", 4 | "url": "https://phm-datasets.s3.amazonaws.com/NASA/", 5 | "files": [ 6 | { 7 | "filename": "3.+Milling.zip", 8 | "md5": "4da3afb0aa50cb3dcdd8e20ed1ed1c7c", 9 | }, 10 | { 11 | "filename": "another_file.zip", 12 | "md5": "some_other_md5_checksum", 13 | }, 14 | ], 15 | }, 16 | { 17 | "name": "google_drive", 18 | "url": "https://drive.google.com/file/d/1_4Hm8RO_7Av1LzGtFnhx6cIN-zi-W40j/view?usp=sharing", 19 | "files": [ 20 | { 21 | "filename": "mill.zip", 22 | "md5": "81d821fdef812183a7d38b6f83f7cefa", 23 | }, 24 | { 25 | "filename": "another_file.zip", 26 | "md5": "some_other_md5_checksum", 27 | }, 28 | ], 29 | }, 30 | # Additional sources can be added here in the same format. 31 | ] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | The MIT License (MIT) 3 | Copyright (c) 2022, Tim von Hahn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | 11 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = pyphm 3 | version = 0.0.5 4 | author = Tim von Hahn 5 | author_email = t.vonhahn@queensu.ca 6 | description = Machinery data, made easy 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/tvhahn/PyPHM 10 | project_urls = 11 | Bug Tracker = https://github.com/tvhahn/PyPHM/issues 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | License :: OSI Approved :: MIT License 15 | Operating System :: OS Independent 16 | 17 | [options] 18 | package_dir = 19 | = src 20 | packages = find: 21 | python_requires = >=3.7 22 | include_package_data = True 23 | 24 | # Add here dependencies of your project (semicolon/line-separated) 25 | install_requires = 26 | pandas 27 | wheel 28 | scipy 29 | numpy 30 | py7zr 31 | rarfile 32 | tqdm 33 | requests 34 | versioned-hdf5 35 | h5py 36 | tables 37 | gdown 38 | 39 | [options.package_data] 40 | * = *.csv, *.mat 41 | 42 | [options.packages.find] 43 | where = src 44 | 45 | 46 | -------------------------------------------------------------------------------- /tests/integration/fixtures/ims/1st_test/2003.10.22.12.06.24: -------------------------------------------------------------------------------- 1 | -0.022 -0.039 -0.183 -0.054 -0.105 -0.134 -0.129 -0.142 2 | -0.105 -0.017 -0.164 -0.183 -0.049 0.029 -0.115 -0.122 3 | -0.183 -0.098 -0.195 -0.125 -0.005 -0.007 -0.171 -0.071 4 | -0.178 -0.161 -0.159 -0.178 -0.100 -0.115 -0.112 -0.078 5 | -0.208 -0.129 -0.261 -0.098 -0.151 -0.205 -0.063 -0.066 6 | -0.232 -0.061 -0.281 -0.125 0.046 -0.088 -0.078 -0.078 7 | -0.112 -0.132 -0.181 -0.186 -0.132 -0.051 -0.132 -0.076 8 | -0.054 -0.107 -0.173 -0.134 -0.164 0.002 -0.146 -0.125 9 | -0.159 -0.032 -0.161 -0.181 -0.110 -0.044 -0.173 -0.137 10 | -0.225 -0.044 -0.090 -0.159 -0.100 -0.151 -0.139 -0.076 11 | -0.093 -0.117 -0.039 -0.161 -0.132 -0.161 -0.090 -0.098 12 | -0.002 -0.161 -0.042 -0.054 -0.095 -0.232 -0.137 -0.042 13 | 0.000 -0.117 -0.081 -0.088 -0.142 -0.183 -0.117 -0.171 14 | -0.154 -0.142 -0.027 -0.093 -0.183 -0.251 -0.095 -0.083 15 | -0.129 -0.068 0.083 -0.071 -0.129 -0.117 -0.183 -0.071 16 | -0.015 -0.049 0.044 -0.088 -0.188 -0.081 -0.183 -0.020 17 | -0.015 -0.046 0.005 -0.061 -0.049 -0.098 -0.139 -0.085 18 | -0.090 -0.105 0.020 -0.012 -0.181 -0.186 -0.107 -0.037 19 | -0.088 -0.012 0.037 -0.093 -0.078 -0.105 -0.134 -0.039 20 | -0.127 -0.081 -0.051 -0.073 -0.100 -0.105 -0.115 -0.051 21 | -------------------------------------------------------------------------------- /tests/integration/fixtures/ims/1st_test/2003.10.22.12.09.13: -------------------------------------------------------------------------------- 1 | -0.117 -0.076 -0.127 -0.144 -0.083 -0.002 -0.098 -0.051 2 | -0.132 -0.068 -0.117 -0.083 -0.132 -0.076 -0.117 -0.085 3 | -0.186 -0.120 -0.217 -0.212 -0.081 -0.112 -0.132 -0.054 4 | -0.098 -0.125 -0.117 -0.093 -0.022 -0.112 -0.090 -0.164 5 | -0.137 -0.120 -0.188 -0.142 -0.129 -0.046 -0.098 -0.129 6 | -0.103 -0.078 -0.127 -0.156 -0.110 -0.061 -0.061 -0.129 7 | -0.120 -0.046 -0.085 -0.056 -0.149 -0.042 -0.103 -0.039 8 | -0.110 -0.068 
-0.076 -0.078 -0.168 -0.134 -0.146 -0.168 9 | -0.088 -0.110 -0.022 -0.044 -0.225 -0.083 -0.100 -0.044 10 | -0.120 -0.073 -0.034 -0.076 -0.217 -0.073 -0.107 -0.088 11 | -0.159 -0.129 0.034 -0.022 -0.090 -0.139 -0.107 -0.049 12 | -0.073 -0.090 -0.032 -0.044 -0.076 -0.132 -0.134 -0.049 13 | -0.105 -0.122 -0.073 0.015 -0.078 -0.107 -0.195 -0.027 14 | -0.139 -0.056 0.000 -0.154 -0.068 -0.146 -0.193 0.032 15 | -0.129 -0.095 -0.012 -0.078 0.034 -0.127 -0.110 0.046 16 | -0.134 -0.159 -0.139 -0.210 -0.112 -0.107 -0.112 -0.005 17 | -0.071 -0.129 -0.134 -0.024 -0.156 -0.042 -0.132 -0.049 18 | -0.183 -0.093 -0.090 -0.112 -0.054 -0.088 -0.127 -0.127 19 | -0.278 -0.010 -0.007 -0.007 0.066 -0.103 -0.078 -0.071 20 | -0.154 -0.046 -0.198 -0.129 -0.078 -0.046 -0.093 -0.051 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![PyPHM Logo](./notebooks/images/logo.png) 2 | 3 | # Machinery data, made easy 4 | ![example workflow](https://github.com/tvhahn/PyPHM/actions/workflows/main.yml/badge.svg) [![arXiv](https://img.shields.io/badge/arXiv-2205.15489-b31b1b.svg)](https://arxiv.org/abs/2205.15489) 5 | 6 | 7 | Datasets specific to PHM (prognostics and health management). Use Python to easily download and prepare the data, before feature engineering or model training. 8 | 9 | Current datasets: 10 | - **UC-Berkeley Milling Dataset**: [example notebook](https://github.com/tvhahn/PyPHM/blob/master/notebooks/milling_example.ipynb) ([open in Colab](https://colab.research.google.com/github/tvhahn/PyPHM/blob/master/notebooks/milling_example.ipynb)); [dataset source](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#milling) 11 | - **IMS Bearing Dataset**: [dataset source](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#bearing) 12 | - **Airbus Helicopter Accelerometer Dataset**: [dataset source](https://www.research-collection.ethz.ch/handle/20.500.11850/415151) 13 | - More coming soon! 14 | 15 | 16 | ## Alpha Notice 17 | PyPHM is in active development. Expect considerable changes in the near future. 18 | 19 | Our goals are to create: 20 | 21 | * A package that implements **common data preprocessing methods** used by others. 22 | * A package with a **coherent and thoughtful API**. 23 | * Thorough **documentation**, with plenty of **examples**. 24 | * A package that is well **tested**, with the use of **type hints**. 25 | * A package built with **continuous integration and continuous deployment**. 26 | 27 | 28 | ## Installation 29 | Install with pip: `pip install pyphm` 30 | 31 | Install from github repository: clone with git `clone https://github.com/tvhahn/PyPHM.git`. Then run `python -m pip install -e .` to install the package on your local machine. 
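Once installed, a dataset can be downloaded and windowed in a few lines. Below is a minimal sketch adapted from `notebooks/scratch/milling_examp.ipynb` and `tests/integration/test_integration_milling.py`. The argument names follow the integration test (the scratch notebook uses `window_size` where the test uses `window_len`), and it assumes that `download=True` fetches the raw data and that the labels csv bundled with the package is used when `path_csv_labels` is not given — expect details to shift while the package is in alpha:

```python
from pathlib import Path
from pyphm.datasets.milling import MillingPrepMethodA

root = Path("./data")  # raw data is placed under this folder

# Prepare the UC Berkeley milling dataset with 64-sample windows and a stride of 64
# (set download=False if the raw mill.mat archive is already in place).
mill = MillingPrepMethodA(root, window_len=64, stride=64, cut_drop_list=[], download=True)

# Flat dataframe of windowed signals with labels (cut_id, cut_no, case, time,
# the six signal channels, and tool_class) ...
df = mill.create_xy_dataframe()

# ... or numpy arrays: x shaped (n_windows, window_len, 6 signals) and
# y shaped (n_windows, window_len, 3 label columns), as in the example notebook.
x, y = mill.create_xy_arrays()
```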
32 | 33 | Run tests: `python -m unittest discover -s tests` 34 | 35 | -------------------------------------------------------------------------------- /tests/integration/test_integration_ims.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from pathlib import Path 4 | import pandas as pd 5 | from pandas.testing import assert_frame_equal 6 | from pyphm.datasets.ims import ImsDataLoad 7 | 8 | 9 | class TestIms(unittest.TestCase): 10 | 11 | @classmethod 12 | def setUpClass(cls): 13 | pass 14 | 15 | def setUp(self): 16 | # path to mill_truncated.mat 17 | self.root = ( 18 | Path(__file__).parent / "fixtures" 19 | ) 20 | 21 | # path to ims_truncated_results.csv.gz 22 | self.results_path = ( 23 | self.root / "ims/ims_truncated_results.csv.gz" 24 | ) 25 | 26 | def tearDown(self): 27 | pass 28 | 29 | def test_milling_data_prep(self): 30 | """Test that the milling data prep works as expected.""" 31 | 32 | # load the data and instantiate the data prep class 33 | ims = ImsDataLoad(self.root, download=False) 34 | 35 | # create the results dataframe 36 | df = ims.load_run_as_df(1, n_jobs=1) 37 | 38 | # load the ground truth results dataframe 39 | col_names_ordered = ["id", "run", "file", "time_step"] + ims.col_1st_names 40 | 41 | col_dtype = [ 42 | str, 43 | int, 44 | str, 45 | np.float32, 46 | np.float32, 47 | np.float32, 48 | np.float32, 49 | np.float32, 50 | np.float32, 51 | np.float32, 52 | np.float32, 53 | np.float32, 54 | ] 55 | 56 | col_dtype_dict = dict(zip(col_names_ordered, col_dtype)) 57 | 58 | # load the ground truth results dataframe 59 | df_gt = pd.read_csv( 60 | self.results_path, 61 | compression="gzip", 62 | ).astype(col_dtype_dict) 63 | 64 | # compare the results 65 | assert_frame_equal(df, df_gt) 66 | 67 | 68 | if __name__ == "__main__": 69 | 70 | unittest.main() 71 | -------------------------------------------------------------------------------- /notebooks/scratch/get_hash.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "The autoreload extension is already loaded. 
To reload it, use:\n", 13 | " %reload_ext autoreload\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import hashlib\n", 19 | "from pathlib import Path\n", 20 | "import pandas as pd\n", 21 | "from pyphm.datasets.utils import calculate_md5, check_md5\n", 22 | "\n", 23 | "\n", 24 | "%load_ext autoreload\n", 25 | "%autoreload 2" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 7, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "/home/tim/Documents/PyPHM\n", 38 | "/home/tim/Documents/PyPHM/data/raw/milling\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "root_dir = Path.cwd().parent.parent\n", 44 | "print(root_dir)\n", 45 | "path_data_raw_folder = Path(root_dir / 'data/raw/milling/' )\n", 46 | "print(path_data_raw_folder)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 8, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "4da3afb0aa50cb3dcdd8e20ed1ed1c7c\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "hash_md5 = calculate_md5(path_data_raw_folder / \"3.+Milling.zip\")\n", 64 | "print(hash_md5)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "interpreter": { 77 | "hash": "bb5c389ed065b0664b086eb1393fdb5729447cbf21b18fded646434c15c951b5" 78 | }, 79 | "kernelspec": { 80 | "display_name": "Python 3.8.12 ('featstore')", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.11.7" 95 | }, 96 | "orig_nbformat": 4 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 2 100 | } 101 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # folders 132 | data/ 133 | -------------------------------------------------------------------------------- /tests/integration/test_integration_milling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from pathlib import Path 4 | import pandas as pd 5 | from pandas.testing import assert_frame_equal 6 | from pyphm.datasets.milling import MillingPrepMethodA 7 | 8 | 9 | class TestMilling(unittest.TestCase): 10 | 11 | @classmethod 12 | def setUpClass(cls): 13 | pass 14 | 15 | 16 | def setUp(self): 17 | # path to mill_truncated.mat 18 | self.root = ( 19 | Path(__file__).parent / "fixtures" 20 | ) 21 | 22 | # path to milling_labels_with_tool_class_truncated.csv 23 | self.labels_path = ( 24 | self.root 25 | / "milling/milling_labels_with_tool_class_truncated.csv" 26 | ) 27 | 28 | # path to milling_truncated_results.csv.gz 29 | self.results_path = ( 30 | self.root / "milling/milling_truncated_results.csv.gz" 31 | ) 32 | 33 | def test_load_run_as_df(self): 34 | """Test the loading of an individual run as a dataframe.""" 35 | 36 | # load the data and instantiate the data prep class 37 | mill = MillingPrepMethodA( 38 | self.root, 39 | window_len=64, 40 | stride=64, 41 | cut_drop_list=[], 42 | path_csv_labels=self.labels_path, 43 | download=False, 44 | ) 45 | 46 | # create the results dataframe 47 | df = mill.create_xy_dataframe() 48 | 49 | # load the ground truth results dataframe 50 | col_names_ordered = [ 51 | "cut_id", 52 | "cut_no", 53 | "case", 54 | "time", 55 | "ae_spindle", 56 | "ae_table", 57 | "vib_spindle", 58 | "vib_table", 59 | "smcdc", 60 | "smcac", 61 | "tool_class", 62 | ] 63 | 64 | col_dtype = [ 65 | str, 66 | 
int, 67 | int, 68 | np.float32, 69 | np.float32, 70 | np.float32, 71 | np.float32, 72 | np.float32, 73 | np.float32, 74 | np.float32, 75 | int, 76 | ] 77 | 78 | col_dtype_dict = dict(zip(col_names_ordered, col_dtype)) 79 | 80 | # load the ground truth results dataframe 81 | df_gt = pd.read_csv( 82 | self.results_path, 83 | compression="gzip", 84 | ).astype(col_dtype_dict) 85 | 86 | # compare the results 87 | assert_frame_equal(df, df_gt) 88 | 89 | 90 | if __name__ == "__main__": 91 | 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /notebooks/images/prauc_params_cnc.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | image/svg+xml5 4 | 2 5 | 4 6 | 6 7 | 3 8 | all 9 | Sub-Cut Number 10 | 0.406 11 | 0.358 12 | 0.292 13 | 0.262 14 | 0.233 15 | 0.149 16 | PR-AUC Score in the Latent Space for the Most Common Manufactured Part 17 | 18 | -------------------------------------------------------------------------------- /notebooks/images/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 22 | 24 | 46 | 48 | 49 | 51 | image/svg+xml 52 | 54 | 55 | 56 | 57 | 58 | 63 | 68 | PyPHM 81 | 93 | 94 | -------------------------------------------------------------------------------- /notebooks/scratch/import_package_resources.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pkg_resources\n", 10 | "from pathlib import Path\n", 11 | "import pandas as pd\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 7, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "meta_data_path = Path(pkg_resources.resource_filename('pyphm', 'datasets/auxilary_metadata/'))" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 8, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "WindowsPath('C:/Users/Tim/Anaconda3/envs/featstore/lib/site-packages/pyphm/datasets/auxilary_metadata')" 32 | ] 33 | }, 34 | "execution_count": 8, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "meta_data_path" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 10, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | "
caserunVBtimeDOCfeedmaterialcut_notool_classwindow_startwindow_end
0110.0021.50.510024966976
112NaN41.50.511024966976
213NaN61.50.512024966976
3140.1171.50.513024966976
415NaN111.50.514024966976
\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " case run VB time DOC feed material cut_no tool_class \\\n", 159 | "0 1 1 0.00 2 1.5 0.5 1 0 0 \n", 160 | "1 1 2 NaN 4 1.5 0.5 1 1 0 \n", 161 | "2 1 3 NaN 6 1.5 0.5 1 2 0 \n", 162 | "3 1 4 0.11 7 1.5 0.5 1 3 0 \n", 163 | "4 1 5 NaN 11 1.5 0.5 1 4 0 \n", 164 | "\n", 165 | " window_start window_end \n", 166 | "0 2496 6976 \n", 167 | "1 2496 6976 \n", 168 | "2 2496 6976 \n", 169 | "3 2496 6976 \n", 170 | "4 2496 6976 " 171 | ] 172 | }, 173 | "execution_count": 10, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "df = pd.read_csv(meta_data_path / 'milling_labels_with_tool_class.csv')\n", 180 | "df.head()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "interpreter": { 193 | "hash": "bb5c389ed065b0664b086eb1393fdb5729447cbf21b18fded646434c15c951b5" 194 | }, 195 | "kernelspec": { 196 | "display_name": "Python 3.8.12 ('featstore')", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.8.12" 211 | }, 212 | "orig_nbformat": 4 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 2 216 | } 217 | -------------------------------------------------------------------------------- /src/pyphm/datasets/auxilary_metadata/milling_labels_with_tool_class.csv: -------------------------------------------------------------------------------- 1 | case,run,VB,time,DOC,feed,material,cut_no,tool_class,window_start,window_end 2 | 1,1,0,2,1.5,0.5,1,0,0,2496,6976 3 | 1,2,,4,1.5,0.5,1,1,0,2496,6976 4 | 1,3,,6,1.5,0.5,1,2,0,2496,6976 5 | 1,4,0.11,7,1.5,0.5,1,3,0,2496,6976 6 | 1,5,,11,1.5,0.5,1,4,0,2496,6976 7 | 1,6,0.2,15,1.5,0.5,1,5,1,2496,6976 8 | 1,7,0.24,19,1.5,0.5,1,6,1,2496,6976 9 | 1,8,0.29,22,1.5,0.5,1,7,1,2496,6976 10 | 1,9,0.28,26,1.5,0.5,1,8,1,2496,6976 11 | 1,10,0.29,29,1.5,0.5,1,9,1,2496,6976 12 | 1,11,0.38,32,1.5,0.5,1,10,1,2496,6976 13 | 1,12,0.4,35,1.5,0.5,1,11,1,2496,6976 14 | 1,13,0.43,38,1.5,0.5,1,12,1,2496,6976 15 | 1,14,0.45,41,1.5,0.5,1,13,1,2496,6976 16 | 1,15,0.5,44,1.5,0.5,1,14,1,2496,6976 17 | 1,16,,46,1.5,0.5,1,15,1,2496,6976 18 | 1,17,0.44,48,1.5,0.5,1,16,1,2496,6976 19 | 2,1,0.08,3,0.75,0.5,1,17,0,64,128 20 | 2,2,0.14,9,0.75,0.5,1,18,0,2496,6976 21 | 2,3,0.14,12,0.75,0.5,1,19,0,2496,6976 22 | 2,4,0.14,15,0.75,0.5,1,20,0,2496,6976 23 | 2,5,0.15,22,0.75,0.5,1,21,0,4224,6976 24 | 2,6,,24,0.75,0.5,1,22,0,5056,6976 25 | 2,7,0.18,27,0.75,0.5,1,23,0,2496,6976 26 | 2,8,0.22,33,0.75,0.5,1,24,1,2496,6976 27 | 2,9,0.26,39,0.75,0.5,1,25,1,2496,6976 28 | 2,10,0.31,45,0.75,0.5,1,26,1,3520,8000 29 | 2,11,0.38,51,0.75,0.5,1,27,1,2496,6976 30 | 2,12,0.43,59,0.75,0.5,1,28,1,2496,6976 31 | 2,13,0.48,66,0.75,0.5,1,29,1,2496,6976 32 | 2,14,0.55,72,0.75,0.5,1,30,1,3520,8000 33 | 3,1,0,0,0.75,0.25,1,31,0,4480,8960 34 | 3,2,0.13,3,0.75,0.25,1,32,0,4480,8960 35 | 3,3,0.13,9,0.75,0.25,1,33,0,4480,8960 36 | 3,5,0.17,21,0.75,0.25,1,34,0,4480,8960 37 | 3,6,0.19,27,0.75,0.25,1,35,0,3520,8960 38 | 3,7,0.2,33,0.75,0.25,1,36,1,3520,8960 39 | 3,8,0.23,39,0.75,0.25,1,37,1,3520,8960 40 | 3,9,0.23,45,0.75,0.25,1,38,1,4480,8960 41 | 3,10,0.26,51,0.75,0.25,1,39,1,3520,8960 42 | 
3,11,0.28,57,0.75,0.25,1,40,1,4160,8960 43 | 3,12,0.33,63,0.75,0.25,1,41,1,4160,8960 44 | 3,14,0.36,69,0.75,0.25,1,42,1,4480,8960 45 | 3,15,0.44,75,0.75,0.25,1,43,1,4480,8960 46 | 3,16,0.55,81,0.75,0.25,1,44,1,4480,8960 47 | 4,1,0.08,3,1.5,0.25,1,45,0,4480,8960 48 | 4,2,0.13,9,1.5,0.25,1,46,0,4480,8960 49 | 4,3,0.2,15,1.5,0.25,1,47,1,4160,8960 50 | 4,4,0.31,21,1.5,0.25,1,48,1,4160,8960 51 | 4,5,0.35,27,1.5,0.25,1,49,1,4160,8960 52 | 4,6,0.4,34,1.5,0.25,1,50,1,4160,8960 53 | 4,7,0.49,39,1.5,0.25,1,51,1,4160,8960 54 | 9,1,0,1,1.5,0.5,1,52,0,2112,6720 55 | 9,2,0.1,3,1.5,0.5,1,53,0,2112,6720 56 | 9,3,0.14,9,1.5,0.5,1,54,0,2112,6464 57 | 9,4,0.19,16,1.5,0.5,1,55,0,2496,6720 58 | 9,5,0.27,22,1.5,0.5,1,56,1,2496,6720 59 | 9,6,0.38,28,1.5,0.5,1,57,1,2496,6720 60 | 9,7,0.47,34,1.5,0.5,1,58,1,2496,6720 61 | 9,8,0.64,40,1.5,0.5,1,59,1,2496,6720 62 | 9,9,0.81,46,1.5,0.5,1,60,2,2112,6464 63 | 10,1,0,0,1.5,0.25,1,61,0,4480,8960 64 | 10,2,0.04,4,1.5,0.25,1,62,0,4480,8960 65 | 10,3,0.08,9,1.5,0.25,1,63,0,4480,8960 66 | 10,4,0.16,15,1.5,0.25,1,64,0,4160,8960 67 | 10,5,0.25,21,1.5,0.25,1,65,1,4160,8960 68 | 10,6,0.36,27,1.5,0.25,1,66,1,4160,8960 69 | 10,7,0.43,33,1.5,0.25,1,67,1,4160,8960 70 | 10,8,0.47,39,1.5,0.25,1,68,1,4160,8960 71 | 10,9,0.53,45,1.5,0.25,1,69,1,4160,8960 72 | 10,10,0.7,57,1.5,0.25,1,70,2,5056,8960 73 | 11,1,0,1,0.75,0.25,1,71,0,4160,8960 74 | 11,2,0.04,3,0.75,0.25,1,72,0,4160,8960 75 | 11,3,0.07,10,0.75,0.25,1,73,0,4160,8960 76 | 11,4,0.07,12,0.75,0.25,1,74,0,4160,8960 77 | 11,5,0.08,14,0.75,0.25,1,75,0,4160,8960 78 | 11,6,0.09,17,0.75,0.25,1,76,0,4160,8960 79 | 11,7,,19,0.75,0.25,1,77,0,4160,8960 80 | 11,8,0.12,21,0.75,0.25,1,78,0,4160,8960 81 | 11,9,0.16,27,0.75,0.25,1,79,0,4160,8960 82 | 11,10,0.18,33,0.75,0.25,1,80,0,4160,8960 83 | 11,11,0.2,39,0.75,0.25,1,81,1,4160,8960 84 | 11,12,0.23,45,0.75,0.25,1,82,1,4160,8960 85 | 11,13,0.26,51,0.75,0.25,1,83,1,4160,8960 86 | 11,14,,54,0.75,0.25,1,84,1,4160,8960 87 | 11,15,0.31,57,0.75,0.25,1,85,1,4160,8960 88 | 11,16,0.37,63,0.75,0.25,1,86,1,4160,8960 89 | 11,17,,67,0.75,0.25,1,87,1,4160,8960 90 | 11,18,0.42,72,0.75,0.25,1,88,1,4160,8960 91 | 11,19,0.47,80,0.75,0.25,1,89,1,4160,8960 92 | 11,20,0.57,86,0.75,0.25,1,90,1,4160,8960 93 | 11,21,0.65,93,0.75,0.25,1,91,1,4160,8960 94 | 11,22,0.68,100,0.75,0.25,1,92,1,4160,8960 95 | 11,23,0.76,105,0.75,0.25,1,93,2,4160,8960 96 | 12,1,,1,0.75,0.5,1,94,0,64,128 97 | 12,2,0.05,3,0.75,0.5,1,95,0,2496,6720 98 | 12,3,0.08,6,0.75,0.5,1,96,0,2496,6464 99 | 12,4,,12,0.75,0.5,1,97,0,2496,6464 100 | 12,5,0.12,19,0.75,0.5,1,98,0,3008,6464 101 | 12,6,0.17,24,0.75,0.5,1,99,0,2496,6720 102 | 12,7,0.2,30,0.75,0.5,1,100,1,2496,6720 103 | 12,8,0.24,36,0.75,0.5,1,101,1,2496,6720 104 | 12,9,0.32,42,0.75,0.5,1,102,1,2496,6720 105 | 12,10,,45,0.75,0.5,1,103,1,2496,6720 106 | 12,11,0.4,49,0.75,0.5,1,104,1,2496,6464 107 | 12,12,0.45,55,0.75,0.5,1,105,1,2496,3904 108 | 12,13,0.49,61,0.75,0.5,1,106,1,2496,6720 109 | 12,14,0.58,67,0.75,0.5,1,107,1,2496,6720 110 | 12,15,0.65,74,0.75,0.5,1,108,1,2496,6464 111 | 5,1,0,0,1.5,0.5,2,109,0,2496,6464 112 | 5,2,0.16,3,1.5,0.5,2,110,0,2496,6720 113 | 5,3,0.29,6,1.5,0.5,2,111,1,2496,6976 114 | 5,4,0.44,9,1.5,0.5,2,112,1,2496,6976 115 | 5,5,0.53,12,1.5,0.5,2,113,1,2496,6976 116 | 5,6,0.74,15,1.5,0.5,2,114,2,2496,6720 117 | 6,1,0,0,1.5,0.25,2,115,0,4160,8960 118 | 7,1,0,1,0.75,0.25,2,116,0,4160,8960 119 | 7,2,0.09,3,0.75,0.25,2,117,0,4160,8960 120 | 7,3,0.13,6,0.75,0.25,2,118,0,4160,8960 121 | 7,4,0.22,10,0.75,0.25,2,119,1,4160,8960 122 | 7,5,0.24,13,0.75,0.25,2,120,1,4480,8960 123 | 
7,6,0.34,15,0.75,0.25,2,121,1,4160,8960 124 | 7,7,0.46,19,0.75,0.25,2,122,1,4160,8960 125 | 7,8,,21,0.75,0.25,2,123,1,4480,8960 126 | 8,1,0,0,0.75,0.5,2,124,0,2496,6720 127 | 8,2,0.18,3,0.75,0.5,2,125,0,2496,6720 128 | 8,3,0.3,6,0.75,0.5,2,126,1,2496,6720 129 | 8,4,,8,0.75,0.5,2,127,1,2496,6720 130 | 8,5,0.44,9,0.75,0.5,2,128,1,2496,6720 131 | 8,6,0.62,12,0.75,0.5,2,129,1,2496,6720 132 | 13,1,,1,0.75,0.25,2,130,0,4480,8960 133 | 13,2,,2,0.75,0.25,2,131,0,4480,8960 134 | 13,3,0.1,4,0.75,0.25,2,132,0,4480,8960 135 | 13,4,0.13,7,0.75,0.25,2,133,0,4480,8960 136 | 13,5,0.17,11,0.75,0.25,2,134,0,4160,8960 137 | 13,6,0.32,16,0.75,0.25,2,135,1,4160,8960 138 | 13,7,0.38,19,0.75,0.25,2,136,1,4160,8960 139 | 13,8,0.49,22,0.75,0.25,2,137,1,4160,8960 140 | 13,9,0.56,25,0.75,0.25,2,138,1,4160,8960 141 | 13,10,0.68,29,0.75,0.25,2,139,1,4160,8960 142 | 13,11,0.83,32,0.75,0.25,2,140,2,4160,8960 143 | 13,12,0.92,35,0.75,0.25,2,141,2,4160,8960 144 | 13,13,1.07,38,0.75,0.25,2,142,2,4160,8960 145 | 13,14,1.3,42,0.75,0.25,2,143,2,4160,8960 146 | 13,15,1.53,45,0.75,0.25,2,144,2,4160,8960 147 | 14,1,,1,0.75,0.5,2,145,0,2496,6976 148 | 14,2,0.09,3,0.75,0.5,2,146,0,2496,6464 149 | 14,3,0.17,6,0.75,0.5,2,147,0,2496,6720 150 | 14,4,0.24,9,0.75,0.5,2,148,1,2496,6720 151 | 14,5,,11,0.75,0.5,2,149,1,2496,6720 152 | 14,6,0.35,12,0.75,0.5,2,150,1,2496,6464 153 | 14,8,0.6,18,0.75,0.5,2,151,1,2496,6464 154 | 14,9,0.81,21,0.75,0.5,2,152,2,2496,6464 155 | 14,10,1.14,24,0.75,0.5,2,153,2,2496,6464 156 | 15,1,,1,1.5,0.25,2,154,0,4160,8960 157 | 15,2,0.15,3,1.5,0.25,2,155,0,4160,8960 158 | 15,3,0.28,6,1.5,0.25,2,156,1,4160,8960 159 | 15,4,0.37,9,1.5,0.25,2,157,1,4160,8960 160 | 15,5,0.48,13,1.5,0.25,2,158,1,4160,8960 161 | 15,6,0.56,16,1.5,0.25,2,159,1,4480,8960 162 | 15,7,0.7,19,1.5,0.25,2,160,2,4160,8960 163 | 16,1,,1,1.5,0.5,2,161,0,2496,6720 164 | 16,2,,2,1.5,0.5,2,162,0,2496,6720 165 | 16,3,0.24,3,1.5,0.5,2,163,1,2496,6720 166 | 16,4,,4,1.5,0.5,2,164,1,2496,6720 167 | 16,5,0.4,6,1.5,0.5,2,165,1,2496,6720 168 | 16,6,0.62,9,1.5,0.5,2,166,1,2496,6720 169 | -------------------------------------------------------------------------------- /notebooks/scratch/milling_examp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyphm.datasets.milling import MillingDataLoad, MillingPrepMethodA\n", 10 | "import pandas as pd\n", 11 | "from pathlib import Path\n", 12 | "\n", 13 | "%load_ext autoreload\n", 14 | "%autoreload 2" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "root_dir: /home/tim/Documents/PyPHM\n", 27 | "path_data_raw_folder: /home/tim/Documents/PyPHM/data\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "root_dir = Path.cwd().parent\n", 33 | "print('root_dir: ', root_dir)\n", 34 | "path_data_raw_folder = Path(root_dir / 'data' )\n", 35 | "print('path_data_raw_folder: ', path_data_raw_folder)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "mill = MillingPrepMethodA(root=path_data_raw_folder, dataset_folder_name='milling', window_size=64, stride=64, cut_drop_list=[17, 94], download=False)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | 
"data": { 54 | "text/plain": [ 55 | "PosixPath('/home/tim/Documents/PyPHM/data/milling')" 56 | ] 57 | }, 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "mill.dataset_folder_path" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "x.shape (11570, 64, 6)\n", 77 | "y.shape (11570, 64, 3)\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "x, y = mill.create_xy_arrays()\n", 83 | "print(\"x.shape\", x.shape)\n", 84 | "print(\"y.shape\", y.shape)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 9, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/html": [ 95 | "
\n", 96 | "\n", 109 | "\n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | "
cut_idcut_nocasetimeae_spindleae_tablevib_spindlevib_tablesmcdcsmcactool_class
00_0010.0000.2197270.2728270.7336432.1166996.8408200.1245120
10_0010.0040.2465820.3222660.7788092.2778326.660156-0.5615230
20_0010.0080.2941890.2838130.7580572.3437506.508789-2.0996090
30_0010.0120.3234860.2600100.7263182.4487306.542969-2.7319340
40_0010.0160.2905270.2532960.6530762.5463876.621094-3.5058590
\n", 199 | "
" 200 | ], 201 | "text/plain": [ 202 | " cut_id cut_no case time ae_spindle ae_table vib_spindle vib_table \\\n", 203 | "0 0_0 0 1 0.000 0.219727 0.272827 0.733643 2.116699 \n", 204 | "1 0_0 0 1 0.004 0.246582 0.322266 0.778809 2.277832 \n", 205 | "2 0_0 0 1 0.008 0.294189 0.283813 0.758057 2.343750 \n", 206 | "3 0_0 0 1 0.012 0.323486 0.260010 0.726318 2.448730 \n", 207 | "4 0_0 0 1 0.016 0.290527 0.253296 0.653076 2.546387 \n", 208 | "\n", 209 | " smcdc smcac tool_class \n", 210 | "0 6.840820 0.124512 0 \n", 211 | "1 6.660156 -0.561523 0 \n", 212 | "2 6.508789 -2.099609 0 \n", 213 | "3 6.542969 -2.731934 0 \n", 214 | "4 6.621094 -3.505859 0 " 215 | ] 216 | }, 217 | "execution_count": 9, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "df = mill.create_xy_dataframe()\n", 224 | "df.head()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [] 233 | } 234 | ], 235 | "metadata": { 236 | "interpreter": { 237 | "hash": "daff1afd4d675d5e247c0a95a5de0c03bd87d8f7edee7cb37c539016070f1c16" 238 | }, 239 | "kernelspec": { 240 | "display_name": "Python 3.8.12 64-bit ('featstore': conda)", 241 | "language": "python", 242 | "name": "python3" 243 | }, 244 | "language_info": { 245 | "codemirror_mode": { 246 | "name": "ipython", 247 | "version": 3 248 | }, 249 | "file_extension": ".py", 250 | "mimetype": "text/x-python", 251 | "name": "python", 252 | "nbconvert_exporter": "python", 253 | "pygments_lexer": "ipython3", 254 | "version": "3.8.12" 255 | }, 256 | "orig_nbformat": 4 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 2 260 | } 261 | -------------------------------------------------------------------------------- /src/pyphm/datasets/auxilary_metadata/airbus_dfvalid_groundtruth.csv: -------------------------------------------------------------------------------- 1 | seqID,anomaly 2 | 0,0.0 3 | 1,1.0 4 | 2,0.0 5 | 3,0.0 6 | 4,1.0 7 | 5,1.0 8 | 6,0.0 9 | 7,0.0 10 | 8,1.0 11 | 9,0.0 12 | 10,0.0 13 | 11,0.0 14 | 12,1.0 15 | 13,1.0 16 | 14,1.0 17 | 15,0.0 18 | 16,1.0 19 | 17,1.0 20 | 18,0.0 21 | 19,0.0 22 | 20,0.0 23 | 21,1.0 24 | 22,0.0 25 | 23,0.0 26 | 24,1.0 27 | 25,1.0 28 | 26,0.0 29 | 27,1.0 30 | 28,1.0 31 | 29,0.0 32 | 30,0.0 33 | 31,0.0 34 | 32,1.0 35 | 33,0.0 36 | 34,0.0 37 | 35,0.0 38 | 36,1.0 39 | 37,0.0 40 | 38,0.0 41 | 39,0.0 42 | 40,0.0 43 | 41,0.0 44 | 42,1.0 45 | 43,1.0 46 | 44,1.0 47 | 45,0.0 48 | 46,0.0 49 | 47,0.0 50 | 48,0.0 51 | 49,1.0 52 | 50,0.0 53 | 51,1.0 54 | 52,0.0 55 | 53,1.0 56 | 54,1.0 57 | 55,1.0 58 | 56,0.0 59 | 57,1.0 60 | 58,1.0 61 | 59,1.0 62 | 60,0.0 63 | 61,1.0 64 | 62,0.0 65 | 63,1.0 66 | 64,0.0 67 | 65,0.0 68 | 66,1.0 69 | 67,1.0 70 | 68,0.0 71 | 69,1.0 72 | 70,0.0 73 | 71,0.0 74 | 72,0.0 75 | 73,1.0 76 | 74,1.0 77 | 75,1.0 78 | 76,1.0 79 | 77,1.0 80 | 78,1.0 81 | 79,1.0 82 | 80,1.0 83 | 81,1.0 84 | 82,1.0 85 | 83,0.0 86 | 84,0.0 87 | 85,1.0 88 | 86,0.0 89 | 87,1.0 90 | 88,1.0 91 | 89,0.0 92 | 90,1.0 93 | 91,0.0 94 | 92,0.0 95 | 93,0.0 96 | 94,0.0 97 | 95,0.0 98 | 96,0.0 99 | 97,1.0 100 | 98,1.0 101 | 99,1.0 102 | 100,1.0 103 | 101,1.0 104 | 102,1.0 105 | 103,0.0 106 | 104,1.0 107 | 105,0.0 108 | 106,0.0 109 | 107,0.0 110 | 108,0.0 111 | 109,0.0 112 | 110,0.0 113 | 111,1.0 114 | 112,1.0 115 | 113,0.0 116 | 114,0.0 117 | 115,1.0 118 | 116,1.0 119 | 117,0.0 120 | 118,1.0 121 | 119,0.0 122 | 120,1.0 123 | 121,0.0 124 | 122,1.0 125 | 123,1.0 126 | 124,0.0 127 | 125,1.0 128 | 126,1.0 129 | 127,0.0 130 | 128,1.0 131 | 129,0.0 
132 | 130,0.0 133 | 131,1.0 134 | 132,1.0 135 | 133,0.0 136 | 134,1.0 137 | 135,0.0 138 | 136,0.0 139 | 137,0.0 140 | 138,0.0 141 | 139,0.0 142 | 140,0.0 143 | 141,1.0 144 | 142,1.0 145 | 143,1.0 146 | 144,1.0 147 | 145,1.0 148 | 146,1.0 149 | 147,1.0 150 | 148,0.0 151 | 149,0.0 152 | 150,0.0 153 | 151,0.0 154 | 152,0.0 155 | 153,0.0 156 | 154,1.0 157 | 155,0.0 158 | 156,0.0 159 | 157,0.0 160 | 158,0.0 161 | 159,0.0 162 | 160,0.0 163 | 161,1.0 164 | 162,1.0 165 | 163,1.0 166 | 164,0.0 167 | 165,1.0 168 | 166,1.0 169 | 167,1.0 170 | 168,0.0 171 | 169,1.0 172 | 170,1.0 173 | 171,1.0 174 | 172,0.0 175 | 173,1.0 176 | 174,0.0 177 | 175,1.0 178 | 176,0.0 179 | 177,0.0 180 | 178,1.0 181 | 179,1.0 182 | 180,0.0 183 | 181,0.0 184 | 182,0.0 185 | 183,1.0 186 | 184,1.0 187 | 185,0.0 188 | 186,0.0 189 | 187,0.0 190 | 188,0.0 191 | 189,1.0 192 | 190,1.0 193 | 191,1.0 194 | 192,1.0 195 | 193,0.0 196 | 194,0.0 197 | 195,0.0 198 | 196,0.0 199 | 197,1.0 200 | 198,1.0 201 | 199,1.0 202 | 200,0.0 203 | 201,0.0 204 | 202,0.0 205 | 203,0.0 206 | 204,0.0 207 | 205,0.0 208 | 206,1.0 209 | 207,1.0 210 | 208,1.0 211 | 209,1.0 212 | 210,1.0 213 | 211,1.0 214 | 212,0.0 215 | 213,1.0 216 | 214,0.0 217 | 215,0.0 218 | 216,0.0 219 | 217,1.0 220 | 218,0.0 221 | 219,0.0 222 | 220,0.0 223 | 221,1.0 224 | 222,1.0 225 | 223,0.0 226 | 224,0.0 227 | 225,0.0 228 | 226,0.0 229 | 227,0.0 230 | 228,0.0 231 | 229,1.0 232 | 230,0.0 233 | 231,0.0 234 | 232,1.0 235 | 233,0.0 236 | 234,0.0 237 | 235,0.0 238 | 236,0.0 239 | 237,1.0 240 | 238,1.0 241 | 239,1.0 242 | 240,1.0 243 | 241,0.0 244 | 242,1.0 245 | 243,1.0 246 | 244,1.0 247 | 245,0.0 248 | 246,0.0 249 | 247,1.0 250 | 248,0.0 251 | 249,1.0 252 | 250,0.0 253 | 251,0.0 254 | 252,1.0 255 | 253,0.0 256 | 254,0.0 257 | 255,1.0 258 | 256,0.0 259 | 257,0.0 260 | 258,1.0 261 | 259,1.0 262 | 260,0.0 263 | 261,0.0 264 | 262,1.0 265 | 263,0.0 266 | 264,1.0 267 | 265,0.0 268 | 266,1.0 269 | 267,0.0 270 | 268,0.0 271 | 269,1.0 272 | 270,0.0 273 | 271,0.0 274 | 272,1.0 275 | 273,0.0 276 | 274,1.0 277 | 275,0.0 278 | 276,0.0 279 | 277,0.0 280 | 278,0.0 281 | 279,1.0 282 | 280,1.0 283 | 281,0.0 284 | 282,1.0 285 | 283,1.0 286 | 284,1.0 287 | 285,1.0 288 | 286,1.0 289 | 287,1.0 290 | 288,0.0 291 | 289,0.0 292 | 290,0.0 293 | 291,0.0 294 | 292,0.0 295 | 293,0.0 296 | 294,1.0 297 | 295,0.0 298 | 296,0.0 299 | 297,0.0 300 | 298,1.0 301 | 299,0.0 302 | 300,1.0 303 | 301,1.0 304 | 302,1.0 305 | 303,1.0 306 | 304,1.0 307 | 305,1.0 308 | 306,0.0 309 | 307,1.0 310 | 308,0.0 311 | 309,1.0 312 | 310,1.0 313 | 311,0.0 314 | 312,0.0 315 | 313,0.0 316 | 314,1.0 317 | 315,1.0 318 | 316,0.0 319 | 317,0.0 320 | 318,0.0 321 | 319,1.0 322 | 320,1.0 323 | 321,0.0 324 | 322,0.0 325 | 323,1.0 326 | 324,0.0 327 | 325,0.0 328 | 326,0.0 329 | 327,0.0 330 | 328,0.0 331 | 329,1.0 332 | 330,0.0 333 | 331,0.0 334 | 332,1.0 335 | 333,0.0 336 | 334,1.0 337 | 335,0.0 338 | 336,0.0 339 | 337,0.0 340 | 338,1.0 341 | 339,1.0 342 | 340,0.0 343 | 341,0.0 344 | 342,0.0 345 | 343,1.0 346 | 344,0.0 347 | 345,1.0 348 | 346,0.0 349 | 347,1.0 350 | 348,0.0 351 | 349,1.0 352 | 350,0.0 353 | 351,1.0 354 | 352,1.0 355 | 353,1.0 356 | 354,1.0 357 | 355,1.0 358 | 356,0.0 359 | 357,0.0 360 | 358,1.0 361 | 359,1.0 362 | 360,0.0 363 | 361,0.0 364 | 362,1.0 365 | 363,0.0 366 | 364,0.0 367 | 365,0.0 368 | 366,1.0 369 | 367,0.0 370 | 368,1.0 371 | 369,1.0 372 | 370,1.0 373 | 371,1.0 374 | 372,0.0 375 | 373,0.0 376 | 374,0.0 377 | 375,1.0 378 | 376,1.0 379 | 377,1.0 380 | 378,0.0 381 | 379,0.0 382 | 380,1.0 383 | 381,1.0 384 | 382,0.0 385 | 
383,0.0 386 | 384,1.0 387 | 385,0.0 388 | 386,0.0 389 | 387,0.0 390 | 388,0.0 391 | 389,1.0 392 | 390,0.0 393 | 391,0.0 394 | 392,1.0 395 | 393,0.0 396 | 394,1.0 397 | 395,1.0 398 | 396,1.0 399 | 397,0.0 400 | 398,0.0 401 | 399,0.0 402 | 400,1.0 403 | 401,1.0 404 | 402,1.0 405 | 403,1.0 406 | 404,1.0 407 | 405,0.0 408 | 406,1.0 409 | 407,1.0 410 | 408,1.0 411 | 409,0.0 412 | 410,1.0 413 | 411,1.0 414 | 412,0.0 415 | 413,0.0 416 | 414,1.0 417 | 415,0.0 418 | 416,0.0 419 | 417,1.0 420 | 418,0.0 421 | 419,0.0 422 | 420,1.0 423 | 421,0.0 424 | 422,0.0 425 | 423,1.0 426 | 424,1.0 427 | 425,0.0 428 | 426,0.0 429 | 427,1.0 430 | 428,0.0 431 | 429,0.0 432 | 430,0.0 433 | 431,1.0 434 | 432,0.0 435 | 433,1.0 436 | 434,0.0 437 | 435,0.0 438 | 436,1.0 439 | 437,0.0 440 | 438,1.0 441 | 439,1.0 442 | 440,1.0 443 | 441,1.0 444 | 442,1.0 445 | 443,0.0 446 | 444,1.0 447 | 445,1.0 448 | 446,1.0 449 | 447,1.0 450 | 448,0.0 451 | 449,0.0 452 | 450,0.0 453 | 451,1.0 454 | 452,1.0 455 | 453,1.0 456 | 454,1.0 457 | 455,1.0 458 | 456,0.0 459 | 457,1.0 460 | 458,1.0 461 | 459,0.0 462 | 460,0.0 463 | 461,1.0 464 | 462,1.0 465 | 463,1.0 466 | 464,0.0 467 | 465,1.0 468 | 466,1.0 469 | 467,0.0 470 | 468,0.0 471 | 469,1.0 472 | 470,1.0 473 | 471,0.0 474 | 472,1.0 475 | 473,0.0 476 | 474,1.0 477 | 475,1.0 478 | 476,0.0 479 | 477,0.0 480 | 478,1.0 481 | 479,1.0 482 | 480,1.0 483 | 481,0.0 484 | 482,0.0 485 | 483,1.0 486 | 484,1.0 487 | 485,0.0 488 | 486,1.0 489 | 487,1.0 490 | 488,1.0 491 | 489,0.0 492 | 490,1.0 493 | 491,0.0 494 | 492,1.0 495 | 493,1.0 496 | 494,1.0 497 | 495,0.0 498 | 496,0.0 499 | 497,1.0 500 | 498,1.0 501 | 499,0.0 502 | 500,0.0 503 | 501,0.0 504 | 502,1.0 505 | 503,1.0 506 | 504,1.0 507 | 505,1.0 508 | 506,1.0 509 | 507,0.0 510 | 508,0.0 511 | 509,1.0 512 | 510,1.0 513 | 511,0.0 514 | 512,1.0 515 | 513,1.0 516 | 514,0.0 517 | 515,0.0 518 | 516,1.0 519 | 517,1.0 520 | 518,1.0 521 | 519,0.0 522 | 520,0.0 523 | 521,0.0 524 | 522,0.0 525 | 523,0.0 526 | 524,0.0 527 | 525,1.0 528 | 526,1.0 529 | 527,1.0 530 | 528,0.0 531 | 529,0.0 532 | 530,1.0 533 | 531,0.0 534 | 532,0.0 535 | 533,0.0 536 | 534,1.0 537 | 535,1.0 538 | 536,1.0 539 | 537,1.0 540 | 538,1.0 541 | 539,0.0 542 | 540,1.0 543 | 541,1.0 544 | 542,1.0 545 | 543,1.0 546 | 544,0.0 547 | 545,1.0 548 | 546,1.0 549 | 547,1.0 550 | 548,0.0 551 | 549,1.0 552 | 550,0.0 553 | 551,1.0 554 | 552,0.0 555 | 553,1.0 556 | 554,1.0 557 | 555,1.0 558 | 556,0.0 559 | 557,1.0 560 | 558,0.0 561 | 559,0.0 562 | 560,1.0 563 | 561,0.0 564 | 562,0.0 565 | 563,1.0 566 | 564,0.0 567 | 565,0.0 568 | 566,0.0 569 | 567,1.0 570 | 568,1.0 571 | 569,1.0 572 | 570,0.0 573 | 571,1.0 574 | 572,0.0 575 | 573,0.0 576 | 574,0.0 577 | 575,0.0 578 | 576,1.0 579 | 577,1.0 580 | 578,1.0 581 | 579,1.0 582 | 580,1.0 583 | 581,1.0 584 | 582,1.0 585 | 583,1.0 586 | 584,0.0 587 | 585,1.0 588 | 586,0.0 589 | 587,1.0 590 | 588,0.0 591 | 589,1.0 592 | 590,1.0 593 | 591,0.0 594 | 592,0.0 595 | 593,1.0 596 | -------------------------------------------------------------------------------- /src/pyphm/datasets/airbus.py: -------------------------------------------------------------------------------- 1 | import scipy.io as sio 2 | import numpy as np 3 | import pandas as pd 4 | from pathlib import Path 5 | from .pyphm import PHMDataset 6 | from typing import Any, Callable, List, Optional, Tuple 7 | import pkg_resources 8 | from .utils import ( 9 | download_and_extract_archive, 10 | extract_archive, 11 | check_integrity, 12 | download_url, 13 | ) 14 | import os 15 | from urllib.error import URLError 16 
| 17 | """ 18 | Contains the data prep class for the Airbus Helicopter Accelerometer Dataset. 19 | 20 | Also contains helper functions associated with the dataset. 21 | """ 22 | 23 | 24 | ############################################################################### 25 | # Data Prep Classes 26 | ############################################################################### 27 | class AirbusDataLoad(PHMDataset): 28 | """ 29 | Airbus Helicopter Accelerometer Dataset from .h5 file, and download if necessary. 30 | 31 | Args: 32 | root (string): Root directory to place all the data sets. 33 | 34 | dataset_folder_name (string): Name of folder containing raw data. 35 | This folder will be created in the root directory if not present. 36 | 37 | download (bool): If True, the data will be downloaded from ETH Zurich. 38 | 39 | """ 40 | 41 | mirrors = [ 42 | "https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/", 43 | ] 44 | 45 | resources = [ 46 | ("dftrain.h5", None), 47 | ("dfvalid.h5", None), 48 | ("dfvalid_groundtruth.csv", None), 49 | ] 50 | 51 | def __init__( 52 | self, 53 | root: Path, 54 | dataset_folder_name: str = "airbus", 55 | download: bool = False, 56 | path_df_labels: Path = None, 57 | ) -> None: 58 | super().__init__(root, dataset_folder_name) 59 | 60 | self.dataset_folder_path = self.root / self.dataset_folder_name 61 | 62 | if path_df_labels is not None: 63 | self.path_df_labels = path_df_labels 64 | else: 65 | # path of pyphm source directory using pathlib 66 | self.path_df_labels = Path(pkg_resources.resource_filename('pyphm', 'datasets/auxilary_metadata/airbus_dfvalid_groundtruth.csv')) 67 | 68 | if download: 69 | self.download() 70 | 71 | def _check_exists(self) -> bool: 72 | return all( 73 | check_integrity(self.dataset_folder_path / file_name) 74 | for file_name, _ in self.resources 75 | ) 76 | 77 | def download(self) -> None: 78 | """Download the Airbus Helicopter Accelerometer Dataset if it doesn't exist already.""" 79 | 80 | if self._check_exists(): 81 | return 82 | 83 | # pathlib makdir if not exists 84 | self.dataset_folder_path.mkdir(parents=True, exist_ok=True) 85 | 86 | # download files 87 | for filename, md5 in self.resources: 88 | for mirror in self.mirrors: 89 | url = f"{mirror}{filename}" 90 | try: 91 | print(f"Downloading {url}") 92 | 93 | download_url(url, self.dataset_folder_path, filename, md5) 94 | 95 | except URLError as error: 96 | print(f"Failed to download (trying next):\n{error}") 97 | continue 98 | finally: 99 | print() 100 | break 101 | else: 102 | raise RuntimeError(f"Error downloading {filename}") 103 | 104 | def load_df( 105 | self, 106 | train_or_val: str = "train", 107 | ) -> None: 108 | """Load the h5 file as df.""" 109 | 110 | if train_or_val == "train": 111 | df = pd.read_hdf(self.dataset_folder_path / "dftrain.h5", "dftrain") 112 | 113 | # add y column of all zeros (indicating no anomaly) 114 | df["y"] = 0 115 | 116 | else: # val dataset 117 | df = pd.read_hdf(self.dataset_folder_path / "dfvalid.h5", "dfvalid") 118 | 119 | # load the dfvalid_groundtruth.csv as dataframe 120 | df_labels = pd.read_csv( 121 | self.path_df_labels, 122 | dtype={"seqID": int, "anomaly": int}, 123 | ) 124 | 125 | # append the anomaly label to the df_val dataframe 126 | df = df.merge(df_labels, left_index=True, right_on="seqID") 127 | 128 | # drop the seqID column and rename the anomaly column to y 129 | df = df.drop("seqID", axis=1).rename(columns={"anomaly": "y"}) 130 | 131 | return df 132 | 133 | 134 | class 
AirbusPrepMethodA(AirbusDataLoad): 135 | """ 136 | Class used to prepare the Airbus Helicopter Accelerometer Dataset before feature engining or machine learning. 137 | Method is described in the paper: 138 | 139 | `Temporal signals to images: Monitoring the condition of industrial assets with deep learning image processing algorithms` 140 | by Garcia et al., 2021 - https://arxiv.org/abs/2005.07031 141 | 142 | Args: 143 | root (string): Root directory to place all the data sets. (likely the raw data folder) 144 | 145 | dataset_folder_name (string): Name of folder (within root) containing raw data. 146 | This folder will be created in the root directory if not present. 147 | 148 | download (bool): If True, the data will be downloaded from the ETH Zurich website. 149 | 150 | path_df_labels (Path, optional): Path to the csv with the labels. If not provided, it 151 | will default to airbus_dfvalid_groundtruth.csv in the auxilary_metadata folder. 152 | 153 | window_size (int): Size of the window to be used for the sliding window. 154 | 155 | stride (int): Size of the stride to be used for the sliding window. 156 | 157 | """ 158 | 159 | def __init__( 160 | self, 161 | root: Path, 162 | dataset_folder_name: str = "airbus", 163 | download: bool = False, 164 | path_df_labels: Path = None, 165 | window_size: int = 64, 166 | stride: int = 64, 167 | ) -> None: 168 | super().__init__(root, dataset_folder_name, download, path_df_labels) 169 | 170 | self.window_size = window_size # size of the window 171 | self.stride = stride # stride between windows 172 | 173 | def create_xy_arrays(self, train_or_val: str = "train"): 174 | """Create the x and y arrays used in deep learning. 175 | 176 | Parameters 177 | ---------- 178 | train_or_val : str 179 | Either 'train' or 'val' to indicate which dataset to use. Default is 'train'. 180 | 181 | Returns 182 | ------- 183 | x : ndarray 184 | Array of the signals (samples). Shape: (n_samples, n_windows, window_size) 185 | 186 | y : ndarray 187 | Array of the labels/meta-data for each signals. Shape: (n_samples, n_windows, window_size, label_columns) 188 | The label_columns (in order) are: 189 | time_increments (int) -- the index of each time increment in the window. e.g. (0, 1, 2, ...) 190 | sample_index (int) -- the index of each sample 191 | window_index (int) -- the index of each window 192 | label (int) -- the label of each windowed sample (0 for normal, 1 for anomaly) 193 | 194 | """ 195 | 196 | # load the dataframe 197 | df = self.load_df(train_or_val) 198 | 199 | x = df.drop("y", axis=1).to_numpy() 200 | y = df["y"].to_numpy() 201 | 202 | # instantiate the "temporary" lists to store the windows and labels 203 | window_list = [] 204 | y_sample_win_label_list = [] 205 | 206 | n_samples = x.shape[0] 207 | len_sample = x.shape[1] 208 | 209 | # fit the strided windows into the temporary list until the length 210 | # of the window does not equal the proper length (better way to do this???) 
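        # Worked example (illustrative sketch only; the figures assume the default
        # window_size=64 and stride=64 and the Airbus training set): the loaded train
        # dataframe has 61441 columns (including the added "y"), so len_sample = 61440
        # and the loop below collects 61440 // 64 = 960 non-overlapping windows per
        # sample, giving x a final shape of (n_samples, 960, 64). In general the number
        # of full windows is (len_sample - self.window_size) // self.stride + 1, so an
        # equivalent, break-free version of the loop could be sketched as:
        #
        #     n_windows = (len_sample - self.window_size) // self.stride + 1
        #     window_list = [
        #         x[:, i * self.stride : i * self.stride + self.window_size]
        #         for i in range(n_windows)
        #     ]
        #
        # The loop with the explicit length check is kept below.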
211 | for window_i in range(len_sample): 212 | windowed_signal = x[ 213 | :, window_i * self.stride : window_i * self.stride + self.window_size 214 | ] 215 | 216 | # if the windowed signal is the proper length, add it to the list 217 | if windowed_signal.shape == (n_samples, self.window_size): 218 | window_list.append(windowed_signal) 219 | 220 | y_sample_win_label_list.append( 221 | [ 222 | (int(sample_indices), int(window_indices), int(ys)) 223 | for sample_indices, window_indices, ys in list( 224 | zip(list(range(0, n_samples)), [window_i] * n_samples, y) 225 | ) 226 | ] 227 | ) 228 | 229 | else: 230 | break 231 | 232 | x = np.array(window_list).reshape(n_samples, -1, self.window_size) 233 | 234 | y_sample_win_label_array = np.array(y_sample_win_label_list)[:, :, np.newaxis].repeat( 235 | self.window_size, axis=2 236 | ) 237 | 238 | time_index = ( 239 | np.arange(0, self.window_size, 1)[np.newaxis, np.newaxis, :] 240 | .repeat(n_samples, axis=1) 241 | .repeat(x.shape[1], axis=0)[:, :, :, np.newaxis] 242 | ) 243 | 244 | y_time_sample_win_label_array = np.concatenate( 245 | (time_index, y_sample_win_label_array), axis=3 246 | ).reshape(n_samples, -1, self.window_size, 4) 247 | # window_id_array = np.expand_dims(np.array(window_id_list).reshape(-1), axis=1) 248 | # window_label_array = np.expand_dims(np.array(window_label_list).reshape(-1), axis=1) 249 | 250 | # x = np.vstack(window_list,) 251 | 252 | # y = np.hstack((window_label_array, window_id_array)) 253 | # return np.vstack(x), np.vstack(y_time_sig_win_label_array) 254 | return x, y_time_sample_win_label_array 255 | 256 | def create_xy_dataframe(self, train_or_val: str = "train"): 257 | """ 258 | Create a flat dataframe (2D array) of the x and y arrays. 259 | 260 | Amenable for use with TSFresh for feature engineering. 261 | 262 | Returns 263 | ------- 264 | df : pd.DataFrame 265 | Single flat dataframe containing each sample and its labels. 
266 | columns: ['x', 'time_index', 'sample_index', 'window_index', 'y'] 267 | 268 | """ 269 | 270 | x, y = self.create_xy_arrays(train_or_val) # create the x and y arrays 271 | 272 | df = pd.DataFrame(np.vstack(x).reshape(-1,1), columns=['x']) 273 | 274 | # add the time_index, sample_index, window_index, and label columns 275 | # to the dataframe 276 | df = df.assign(time_index=np.vstack(y[:,:,:,0]).reshape(-1,1)) 277 | df = df.assign(sample_index=np.vstack(y[:,:,:,1]).reshape(-1,1)) 278 | df = df.assign(win_index=np.vstack(y[:,:,:,2]).reshape(-1,1)) 279 | df = df.assign(y=np.vstack(y[:,:,:,3]).reshape(-1,1)) 280 | 281 | return df 282 | -------------------------------------------------------------------------------- /notebooks/scratch/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "ModuleNotFoundError", 10 | "evalue": "No module named 'pyphm'", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 14 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 15 | "Cell \u001b[0;32mIn[1], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[1;32m 2\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/home/tim/Documents/PyPHM\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyphm\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _urlretrieve\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyphm\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmilling\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MillingDataLoad, MillingPrepMethodA\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpathlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Path\n", 16 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pyphm'" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "import sys\n", 22 | "sys.path.append(r'/home/tim/Documents/PyPHM')\n", 23 | "from pyphm.datasets.utils import _urlretrieve\n", 24 | "from pyphm.datasets.milling import MillingDataLoad, MillingPrepMethodA\n", 25 | "from pathlib import Path\n", 26 | "import hashlib\n", 27 | "\n", 28 | "%load_ext autoreload\n", 29 | "%autoreload 2" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "/home/tim/Documents/PyPHM\n", 42 | "/home/tim/Documents/PyPHM/data\n", 43 | "\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "root_dir = Path.cwd().parent\n", 49 | "print(root_dir)\n", 50 | "path_data_raw_folder = Path(root_dir / 'data' )\n", 51 | "print(path_data_raw_folder)\n", 52 | "print(type(path_data_raw_folder))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 4, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 
64 | "type(root) = \n", 65 | "Loading data!!!!\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "mill = MillingDataLoad(path_data_raw_folder, download=False)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "type(root) = \n", 83 | "Loading data!!!!\n", 84 | "type field names: \n", 85 | "type signal names: \n", 86 | "('case', 'run', 'VB', 'time', 'DOC', 'feed', 'material', 'smcAC', 'smcDC', 'vib_table', 'vib_spindle', 'AE_table', 'AE_spindle')\n", 87 | "('AE_spindle', 'AE_table', 'vib_spindle', 'vib_table', 'smcDC', 'smcAC')\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "mill = MillingPrepMethodA(path_data_raw_folder, download=False)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "x.shape (11570, 64, 6)\n", 105 | "y.shape (11570, 64, 3)\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "x, y = mill.create_xy_arrays()\n", 111 | "print(\"x.shape\", x.shape)\n", 112 | "print(\"y.shape\", y.shape)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "array(['0', '0_0', '0.0'], dtype='\n", 144 | "\n", 157 | "\n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | "
\n", 247 | "" 248 | ], 249 | "text/plain": [ 250 | " cut_id cut_no case time ae_spindle ae_table vib_spindle vib_table \\\n", 251 | "0 0_0 0 1 0.000 0.219727 0.272827 0.733643 2.116699 \n", 252 | "1 0_0 0 1 0.004 0.246582 0.322266 0.778809 2.277832 \n", 253 | "2 0_0 0 1 0.008 0.294189 0.283813 0.758057 2.343750 \n", 254 | "3 0_0 0 1 0.012 0.323486 0.260010 0.726318 2.448730 \n", 255 | "4 0_0 0 1 0.016 0.290527 0.253296 0.653076 2.546387 \n", 256 | "\n", 257 | " smcdc smcac tool_class \n", 258 | "0 6.840820 0.124512 0 \n", 259 | "1 6.660156 -0.561523 0 \n", 260 | "2 6.508789 -2.099609 0 \n", 261 | "3 6.542969 -2.731934 0 \n", 262 | "4 6.621094 -3.505859 0 " 263 | ] 264 | }, 265 | "execution_count": 39, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "df = mill.create_xy_dataframe()\n", 272 | "df.head()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 40, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/plain": [ 283 | "(740480, 11)" 284 | ] 285 | }, 286 | "execution_count": 40, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "df.shape" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 10, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "(11570, 64, 3)" 304 | ] 305 | }, 306 | "execution_count": 10, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "y.shape" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 8, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "(11570, 64, 6)" 324 | ] 325 | }, 326 | "execution_count": 8, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "x.shape" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 8, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "# sys.path.append(root_dir / 'pyphm')\n", 349 | "from pyphm.datasets.utils import _urlretrieve" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 9, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "d3ca5a418c2ed0887d68bc3f91991f12\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "def file_as_bytes(file):\n", 374 | " with file:\n", 375 | " return file.read()\n", 376 | "\n", 377 | "print(hashlib.md5(file_as_bytes(open(path_data_raw_folder / 'IMS.7z', 'rb'))).hexdigest())" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "# _urlretrieve('https://files.realpython.com/media/Python-Imports_Watermarked.ae72c8a00197.jpg', 'test.jpg')" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "import sys\n", 396 | "sys.path" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "import pyphm" 406 | ] 407 | }, 408 | { 409 | 
"cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [] 414 | } 415 | ], 416 | "metadata": { 417 | "interpreter": { 418 | "hash": "a445fd1dd59e042f3702a5878c89afe1dbbe900f3b58e4a7756e0c9feaaac4f1" 419 | }, 420 | "kernelspec": { 421 | "display_name": "Python 3.8.12 64-bit ('ganzoo': conda)", 422 | "language": "python", 423 | "name": "python3" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 3 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | "nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython3", 435 | "version": "3.11.7" 436 | }, 437 | "orig_nbformat": 4 438 | }, 439 | "nbformat": 4, 440 | "nbformat_minor": 2 441 | } 442 | -------------------------------------------------------------------------------- /src/pyphm/datasets/ims.py: -------------------------------------------------------------------------------- 1 | import scipy.io as sio 2 | import numpy as np 3 | import pandas as pd 4 | from pathlib import Path 5 | from .pyphm import PHMDataset 6 | import datetime 7 | import time 8 | import multiprocessing as mp 9 | from typing import Any, Callable, List, Optional, Tuple 10 | from .utils import ( 11 | download_and_extract_archive, 12 | extract_archive, 13 | check_integrity, 14 | ) 15 | import os 16 | from urllib.error import URLError 17 | 18 | 19 | class ImsDataLoad(PHMDataset): 20 | """ 21 | Load the IMS bearing data set from .csv files, and download if necessary. 22 | 23 | Args: 24 | root (string): Root directory to place all the data sets. 25 | 26 | dataset_folder_name (string): Name of folder containing raw data. 27 | This folder will be created in the root directory if not present. 28 | 29 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository. 30 | 31 | """ 32 | 33 | mirrors = [ 34 | "https://drive.google.com/file/d/1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx/view?usp=sharing", 35 | "https://ti.arc.nasa.gov/m/project/prognostic-repository/", 36 | ] 37 | 38 | resources = [ 39 | ("IMS.7z", "d3ca5a418c2ed0887d68bc3f91991f12"), 40 | ] 41 | 42 | col_1st_names = [ 43 | "b1_ch1", 44 | "b1_ch2", 45 | "b2_ch3", 46 | "b2_ch4", 47 | "b3_ch5", 48 | "b3_ch6", 49 | "b4_ch7", 50 | "b4_ch8", 51 | ] 52 | col_2nd_names = col_3rd_names = ["b1_ch1", "b1_ch2", "b2_ch3", "b2_ch4"] 53 | 54 | def __init__( 55 | self, 56 | root: Path, 57 | dataset_folder_name: str = "ims", 58 | download: bool = False, 59 | dataset_path: Path = None, 60 | data: np.ndarray = None, 61 | sample_freq: float = 20480.0, 62 | ) -> None: 63 | super().__init__(root, dataset_folder_name) 64 | 65 | self.dataset_path = self.root / self.dataset_folder_name 66 | 67 | if download: 68 | self.download() 69 | 70 | if not self._check_exists(): 71 | raise RuntimeError( 72 | "Dataset not found. 
You can use download=True to download it" 73 | ) 74 | 75 | # set the paths for the three experiment run folders 76 | self.path_1st_folder = self.dataset_path / "1st_test" 77 | self.path_2nd_folder = self.dataset_path / "2nd_test" 78 | 79 | # the third test is labelled as the "4th_test" in the IMS.7z archive 80 | self.path_3rd_folder = self.dataset_path / "4th_test/txt" 81 | 82 | self.sample_freq = sample_freq 83 | 84 | def _check_exists(self) -> bool: 85 | return all( 86 | check_integrity(self.dataset_path / file_name) 87 | for file_name, _ in self.resources 88 | ) 89 | 90 | def download(self) -> None: 91 | """Download the UC Berkeley milling data if it doesn't exist already.""" 92 | 93 | if self._check_exists(): 94 | print("IMS.7z already exists.") 95 | return 96 | 97 | # pathlib makdir if not exists 98 | self.dataset_path.mkdir(parents=True, exist_ok=True) 99 | 100 | # download files 101 | for filename, md5 in self.resources: 102 | for mirror in self.mirrors: 103 | url = f"{mirror}{filename}" 104 | try: 105 | print(f"Downloading {url}") 106 | download_and_extract_archive( 107 | url, download_root=self.dataset_path, filename=filename, md5=md5 108 | ) 109 | 110 | # sequentially extract the .rar files 111 | rar_list = ["1st_test.rar", "2nd_test.rar", "3rd_test.rar"] 112 | for rar_file in rar_list: 113 | print(f"Extracting {rar_file}") 114 | extract_archive( 115 | self.dataset_path / rar_file, remove_finished=True 116 | ) 117 | 118 | except URLError as error: 119 | print(f"Failed to download (trying next):\n{error}") 120 | continue 121 | finally: 122 | print() 123 | break 124 | else: 125 | raise RuntimeError(f"Error downloading {filename}") 126 | 127 | def extract(self) -> None: 128 | """Extract the data set if it has already been dowloaded.""" 129 | 130 | if not self._check_exists(): 131 | print("IMS.7z does not exist. 
Please download.") 132 | return 133 | 134 | print("Extracting IMS.7z") 135 | 136 | # start with the .7z file 137 | extract_archive(self.dataset_path / "IMS.7z", remove_finished=False) 138 | 139 | # sequentially extract the .rar files 140 | rar_list = ["1st_test.rar", "2nd_test.rar", "3rd_test.rar"] 141 | for rar_file in rar_list: 142 | print(f"Extracting {rar_file}") 143 | extract_archive(self.dataset_path / rar_file, remove_finished=True) 144 | 145 | @staticmethod 146 | def process_raw_csv_to_dict(file_info_dict) -> None: 147 | """Load an individual sample (.csv file) of the IMS data set.""" 148 | 149 | path_run_folder = file_info_dict["path_run_folder"] 150 | file_name = file_info_dict["file_name"] 151 | run_no = file_info_dict["run_no"] 152 | sample_index = file_info_dict["sample_index"] 153 | 154 | # load the .csv file 155 | signals_array = np.loadtxt(path_run_folder / file_name, delimiter="\t") 156 | 157 | # get the start time (for the first sample) and convert to unix timestamp 158 | start_time_unix = time.mktime( 159 | datetime.datetime.strptime(file_name, "%Y.%m.%d.%H.%M.%S").timetuple() 160 | ) 161 | 162 | # create dictionary with the signals_array, id_list, run_list, file_list, time_step_array 163 | data_dict = { 164 | "signals_array": signals_array, 165 | "id": f"{run_no}_{sample_index}", 166 | "run_no": run_no, 167 | "file_name": file_name, 168 | "sample_index": sample_index, 169 | "start_time_unix": start_time_unix, 170 | } 171 | 172 | return data_dict 173 | 174 | def load_run_as_dict( 175 | self, 176 | run_no: int, 177 | n_jobs: int = None, 178 | ) -> None: 179 | if run_no == 1: 180 | col_names = self.col_1st_names 181 | path_run_folder = self.path_1st_folder 182 | elif run_no == 2: 183 | col_names = self.col_2nd_names 184 | path_run_folder = self.path_2nd_folder 185 | else: 186 | col_names = self.col_3rd_names 187 | path_run_folder = self.path_3rd_folder 188 | 189 | # create a list of dictionaries containing the metadata for each file 190 | file_info_list = [] 191 | for i, file_name in enumerate(sorted(os.listdir(path_run_folder))): 192 | file_info_list.append( 193 | { 194 | "path_run_folder": path_run_folder, 195 | "file_name": file_name, 196 | "col_names": col_names, 197 | "run_no": run_no, 198 | "sample_index": i, 199 | } 200 | ) 201 | 202 | # get number of cpu cores 203 | if n_jobs is None: 204 | n_jobs = mp.cpu_count() - 2 205 | if n_jobs < 1: 206 | n_jobs = 1 207 | print("n_jobs:", n_jobs) 208 | with mp.Pool(processes=n_jobs) as pool: 209 | 210 | # from https://stackoverflow.com/a/36590187 211 | data_list = pool.map(self.process_raw_csv_to_dict, file_info_list) 212 | 213 | # store the data from data_list as a dictionary, with the key being the file name 214 | data_dict = {} 215 | for data_dict_i in data_list: 216 | data_dict[data_dict_i["file_name"]] = data_dict_i 217 | return data_dict 218 | 219 | @staticmethod 220 | def process_raw_csv_to_df(file_info_dict) -> None: 221 | """Load an individual sample (.csv file) of the IMS data set.""" 222 | 223 | path_run_folder = file_info_dict["path_run_folder"] 224 | file_name = file_info_dict["file_name"] 225 | sample_freq = file_info_dict["sample_freq"] 226 | col_names = file_info_dict["col_names"] 227 | run_no = file_info_dict["run_no"] 228 | sample_index = file_info_dict["sample_index"] 229 | 230 | # load the .csv file 231 | signals_array = np.loadtxt(path_run_folder / file_name, delimiter="\t") 232 | 233 | id_list = [f"{run_no}_{sample_index}"] * len(signals_array) 234 | run_list = [run_no] * len(signals_array) 235 | 
file_list = [file_name] * len(signals_array) 236 | time_step_array = np.linspace( 237 | 0.0, len(signals_array) / sample_freq, len(signals_array) 238 | ) 239 | 240 | df = pd.DataFrame(np.vstack(signals_array), columns=col_names, dtype=np.float32) 241 | df["id"] = id_list 242 | df["run"] = run_list 243 | df["file"] = file_list 244 | df["time_step"] = np.hstack(time_step_array) 245 | 246 | return df.astype({"id": str, "run": int, "file": str, "time_step": np.float32}) 247 | 248 | def load_run_as_df( 249 | self, 250 | run_no: int, 251 | n_jobs: int = None, 252 | ) -> None: 253 | """Load the three runs as individual dataframes.""" 254 | 255 | if run_no == 1: 256 | col_names = self.col_1st_names 257 | path_run_folder = self.path_1st_folder 258 | elif run_no == 2: 259 | col_names = self.col_2nd_names 260 | path_run_folder = self.path_2nd_folder 261 | else: 262 | col_names = self.col_3rd_names 263 | path_run_folder = self.path_3rd_folder 264 | 265 | # get list of every file in the folder and sort by ascending date 266 | file_list = sorted(os.listdir(path_run_folder)) 267 | 268 | # create a list of dictionaries containing the metadata for each file 269 | file_info_list = [] 270 | for i, file_name in enumerate(sorted(os.listdir(path_run_folder))): 271 | file_info_list.append( 272 | { 273 | "path_run_folder": path_run_folder, 274 | "file_name": file_name, 275 | "sample_freq": self.sample_freq, 276 | "col_names": col_names, 277 | "run_no": run_no, 278 | "sample_index": i, 279 | } 280 | ) 281 | 282 | # get number of cpu cores 283 | if n_jobs is None: 284 | n_jobs = mp.cpu_count() - 2 285 | if n_jobs < 1: 286 | n_jobs = 1 287 | 288 | # load the dataframes in parallel 289 | with mp.Pool(processes=n_jobs) as pool: 290 | 291 | # from https://stackoverflow.com/a/36590187 292 | df_run = pool.map(self.process_raw_csv_to_df, file_info_list) 293 | df = pd.concat(df_run, ignore_index=True) 294 | 295 | col_names_ordered = ["id", "run", "file", "time_step"] + col_names 296 | 297 | return df[col_names_ordered] 298 | 299 | 300 | class ImsPrepMethodA(ImsDataLoad): 301 | """ 302 | Class used to prepare the IMS bearing dataset before feature engining or machine learning. 303 | 304 | Args: 305 | root (string): Root directory to place all the data sets. (likely the raw data folder) 306 | 307 | dataset_folder_name (string): Name of folder containing raw data. 308 | This folder will be created in the root directory if not present. 309 | 310 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository. 311 | 312 | path_df_labels (Path, optional): Path to the dataframe with the labels (as a string). 313 | If not provided, the dataframe must be created. 314 | 315 | window_size (int): Size of the window to be used for the sliding window. 316 | 317 | stride (int): Size of the stride to be used for the sliding window. 318 | 319 | cut_drop_list (list, optional): List of cut numbers to drop. cut_no 17 and 94 are erroneous. 
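        Example (illustrative sketch; the data root below is hypothetical and assumes
        the IMS archive has already been downloaded and extracted):

            >>> from pathlib import Path
            >>> from pyphm.datasets.ims import ImsPrepMethodA
            >>> ims = ImsPrepMethodA(root=Path("data/raw"), download=False)
            >>> x, y = ims.create_xy_arrays(run_no=1)  # y columns: id, run, file, sample index, start time (unix)
            >>> df = ims.create_xy_df(run_no=1)        # same run as a single flat dataframe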
320 | """ 321 | 322 | def __init__( 323 | self, 324 | root: Path, 325 | dataset_folder_name: str = "ims", 326 | download: bool = False, 327 | ) -> None: 328 | super().__init__( 329 | root, 330 | dataset_folder_name, 331 | download, 332 | ) 333 | 334 | def create_xy_arrays( 335 | self, 336 | run_no: int = 1, 337 | n_jobs: int = None, 338 | ) -> None: 339 | 340 | # create a list to store the x and y arrays 341 | x = [] # instantiate X's 342 | y_ids_runs_files_times_ctimes = [] # instantiate y's 343 | 344 | # create the data dict storing the signals and metadata 345 | data_dict = self.load_run_as_dict(run_no, n_jobs) 346 | 347 | # get all the file names from the data_dict and sort them 348 | file_names = sorted(data_dict.keys()) 349 | 350 | for i, file_name in enumerate(file_names): 351 | 352 | x.append(data_dict[file_name]["signals_array"]) 353 | y_ids_runs_files_times_ctimes.append( 354 | [ 355 | data_dict[file_name]["id"], 356 | data_dict[file_name]["run_no"], 357 | data_dict[file_name]["file_name"], 358 | data_dict[file_name]["sample_index"], 359 | data_dict[file_name]["start_time_unix"], 360 | ] 361 | ) 362 | 363 | x = np.stack(x) 364 | n_samples = x.shape[0] 365 | n_signals = x.shape[2] 366 | 367 | return x, np.stack(y_ids_runs_files_times_ctimes).reshape(-1, 5) 368 | 369 | def create_xy_df( 370 | self, 371 | run_no: int = 1, 372 | n_jobs: int = None, 373 | ) -> None: 374 | return self.load_run_as_df(run_no, n_jobs) 375 | -------------------------------------------------------------------------------- /notebooks/images/prauc_cnc.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | image/svg+xml0.0 4 | 0.2 5 | 0.4 6 | 0.6 7 | 0.8 8 | 1.0 9 | Recall 10 | 0.0 11 | 0.2 12 | 0.4 13 | 0.6 14 | 0.8 15 | 1.0 16 | Precision 17 | Precision-Recall Area-Under-Curve = 0.044 18 | Precision-Recall Curve 19 | Best model 20 | No skill model 21 | 0.0 22 | 0.2 23 | 0.4 24 | 0.6 25 | 0.8 26 | 1.0 27 | False Positive Rate 28 | 0.0 29 | 0.2 30 | 0.4 31 | 0.6 32 | 0.8 33 | 1.0 34 | True Positive Rate 35 | ROC Area-Under-Curve = 0.617 36 | ROC Curve 37 | 38 | -------------------------------------------------------------------------------- /notebooks/scratch/airbus_download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyphm.datasets.utils import _urlretrieve, download_url\n", 10 | "from pathlib import Path\n", 11 | "from pyphm.datasets.airbus import AirbusDataLoad\n", 12 | "\n", 13 | "%load_ext autoreload\n", 14 | "%autoreload 2" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 6, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "/home/tim/Documents/PyPHM\n", 27 | "/home/tim/Documents/PyPHM/data/raw\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "root_dir = Path.cwd().parent.parent\n", 33 | "print(root_dir)\n", 34 | "path_data_raw_folder = Path(root_dir / 'data/raw/' )\n", 35 | "print(path_data_raw_folder)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 7, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "air = AirbusDataLoad(path_data_raw_folder, download=True)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 8, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | 
"(1677, 61441)\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "df_train = air.load_df(train_or_val=\"train\")\n", 62 | "print(df_train.shape)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 9, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "(594, 61441)\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "df_val = air.load_df(train_or_val=\"val\")\n", 80 | "print(df_val.shape)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 12, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "Downloading https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/dftrain.h5 to c:\\_Python\\PyPHM\\notebooks\\dftrain.h5\n" 100 | ] 101 | }, 102 | { 103 | "name": "stderr", 104 | "output_type": "stream", 105 | "text": [ 106 | " 4%|▍ | 36639744/825280760 [00:04<01:32, 8542721.26it/s] \n" 107 | ] 108 | }, 109 | { 110 | "ename": "KeyboardInterrupt", 111 | "evalue": "", 112 | "output_type": "error", 113 | "traceback": [ 114 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 115 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 116 | "\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_20668/1413174493.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 7\u001b[0m ]\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mdownload_url\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/dftrain.h5\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mroot\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mpath_cwd\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 117 | "\u001b[1;32mc:\\_python\\pyphm\\pyphm\\datasets\\utils.py\u001b[0m in \u001b[0;36mdownload_url\u001b[1;34m(url, root, filename, md5, max_redirect_hops)\u001b[0m\n\u001b[0;32m 176\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 177\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Downloading \"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0murl\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;34m\" to \"\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mfpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 178\u001b[1;33m \u001b[0m_urlretrieve\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 179\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mURLError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mOSError\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# type: ignore[attr-defined]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 180\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m==\u001b[0m 
\u001b[1;34m\"https\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 118 | "\u001b[1;32mc:\\_python\\pyphm\\pyphm\\datasets\\utils.py\u001b[0m in \u001b[0;36m_urlretrieve\u001b[1;34m(url, filename, chunk_size)\u001b[0m\n\u001b[0;32m 69\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mRequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m\"User-Agent\"\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mUSER_AGENT\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 70\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlength\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpbar\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 71\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[1;32min\u001b[0m \u001b[0miter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 72\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mchunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 73\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 119 | "\u001b[1;32mc:\\_python\\pyphm\\pyphm\\datasets\\utils.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 69\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mRequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m\"User-Agent\"\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mUSER_AGENT\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 70\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlength\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpbar\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 71\u001b[1;33m \u001b[1;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[1;32min\u001b[0m \u001b[0miter\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mlambda\u001b[0m\u001b[1;33m:\u001b[0m 
\u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 72\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mchunk\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 73\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 120 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\http\\client.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, amt)\u001b[0m\n\u001b[0;32m 457\u001b[0m \u001b[1;31m# Amount is given, implement using readinto\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[0mb\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mbytearray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 459\u001b[1;33m \u001b[0mn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 460\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mmemoryview\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mn\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtobytes\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 461\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 121 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\http\\client.py\u001b[0m in \u001b[0;36mreadinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 501\u001b[0m \u001b[1;31m# connection, and the user is reading more bytes than will be provided\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 502\u001b[0m \u001b[1;31m# (for example, reading in 1k chunks)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 503\u001b[1;33m \u001b[0mn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 504\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mn\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mb\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 505\u001b[0m \u001b[1;31m# Ideally, we would raise IncompleteRead if the content-length\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 122 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 667\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 668\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 669\u001b[1;33m \u001b[1;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 670\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 671\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 123 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\ssl.py\u001b[0m in \u001b[0;36mrecv_into\u001b[1;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[0;32m 1239\u001b[0m \u001b[1;34m\"non-zero flags not allowed in calls to recv_into() on %s\"\u001b[0m \u001b[1;33m%\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1240\u001b[0m self.__class__)\n\u001b[1;32m-> 1241\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnbytes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1242\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1243\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 124 | "\u001b[1;32m~\\Anaconda3\\envs\\featstore\\lib\\ssl.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, len, buffer)\u001b[0m\n\u001b[0;32m 1097\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1098\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mbuffer\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1099\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1100\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1101\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 125 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: " 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "mirrors = [\n", 131 | " \"https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/\",\n", 132 | "]\n", 133 | "\n", 134 | "resources = [\n", 135 | " (\"dftrain.h5?sequence=1&isAllowed=y\",),\n", 136 | "]\n", 137 | "\n", 138 | "download_url(\"https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/415151/dftrain.h5\", root=path_cwd,)" 139 | ] 140 | }, 141 | { 142 | 
"cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3.8.12 ('featstore')", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.8.12" 166 | }, 167 | "orig_nbformat": 4, 168 | "vscode": { 169 | "interpreter": { 170 | "hash": "daff1afd4d675d5e247c0a95a5de0c03bd87d8f7edee7cb37c539016070f1c16" 171 | } 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 2 176 | } 177 | -------------------------------------------------------------------------------- /references/sources.bib: -------------------------------------------------------------------------------- 1 | @incollection{buckheit1995wavelab, 2 | title={Wavelab and reproducible research}, 3 | author={Buckheit, Jonathan B and Donoho, David L}, 4 | booktitle={Wavelets and statistics}, 5 | pages={55--81}, 6 | year={1995}, 7 | publisher={Springer} 8 | } 9 | 10 | @Article{ harris2020array, 11 | title = {Array programming with {NumPy}}, 12 | author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. 13 | van der Walt and Ralf Gommers and Pauli Virtanen and David 14 | Cournapeau and Eric Wieser and Julian Taylor and Sebastian 15 | Berg and Nathaniel J. Smith and Robert Kern and Matti Picus 16 | and Stephan Hoyer and Marten H. van Kerkwijk and Matthew 17 | Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del 18 | R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre 19 | G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and 20 | Warren Weckesser and Hameer Abbasi and Christoph Gohlke and 21 | Travis E. Oliphant}, 22 | year = {2020}, 23 | month = sep, 24 | journal = {Nature}, 25 | volume = {585}, 26 | number = {7825}, 27 | pages = {357--362}, 28 | doi = {10.1038/s41586-020-2649-2}, 29 | publisher = {Springer Science and Business Media {LLC}}, 30 | url = {https://doi.org/10.1038/s41586-020-2649-2} 31 | } 32 | 33 | @ARTICLE{2020SciPy-NMeth, 34 | author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and 35 | Haberland, Matt and Reddy, Tyler and Cournapeau, David and 36 | Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and 37 | Bright, Jonathan and {van der Walt}, St{\'e}fan J. and 38 | Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and 39 | Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and 40 | Kern, Robert and Larson, Eric and Carey, C J and 41 | Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and 42 | {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and 43 | Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and 44 | Harris, Charles R. and Archibald, Anne M. and 45 | Ribeiro, Ant{\^o}nio H. 
and Pedregosa, Fabian and 46 | {van Mulbregt}, Paul and {SciPy 1.0 Contributors}}, 47 | title = {{{SciPy} 1.0: Fundamental Algorithms for Scientific 48 | Computing in Python}}, 49 | journal = {Nature Methods}, 50 | year = {2020}, 51 | volume = {17}, 52 | pages = {261--272}, 53 | adsurl = {https://rdcu.be/b08Wh}, 54 | doi = {10.1038/s41592-019-0686-2}, 55 | } 56 | 57 | @InProceedings{ mckinney-proc-scipy-2010, 58 | author = { {W}es {M}c{K}inney }, 59 | title = { {D}ata {S}tructures for {S}tatistical {C}omputing in {P}ython }, 60 | booktitle = { {P}roceedings of the 9th {P}ython in {S}cience {C}onference }, 61 | pages = { 56 - 61 }, 62 | year = { 2010 }, 63 | editor = { {S}t\'efan van der {W}alt and {J}arrod {M}illman }, 64 | doi = { 10.25080/Majora-92bf1922-00a } 65 | } 66 | 67 | @article{donoho2008reproducible, 68 | title={Reproducible research in computational harmonic analysis}, 69 | author={Donoho, David L and Maleki, Arian and Rahman, Inam Ur and Shahram, Morteza and Stodden, Victoria}, 70 | journal={Computing in Science \& Engineering}, 71 | volume={11}, 72 | number={1}, 73 | pages={8--18}, 74 | year={2008}, 75 | publisher={IEEE} 76 | } 77 | 78 | @article{ince2012case, 79 | title={The case for open computer programs}, 80 | author={Ince, Darrel C and Hatton, Leslie and Graham-Cumming, John}, 81 | journal={Nature}, 82 | volume={482}, 83 | number={7386}, 84 | pages={485--488}, 85 | year={2012}, 86 | publisher={Nature Publishing Group} 87 | } 88 | 89 | 90 | @article{trouble_lab_2013, ISSN={0013-0613}, 91 | title={Trouble at the lab}, 92 | url={https://www.economist.com/briefing/2013/10/18/trouble-at-the-lab}, 93 | abstractNote={Scientists like to think of science as self-correcting. To an alarming degree, it is not}, 94 | journal={The Economist}, 95 | year={2013}, 96 | month={Oct}} 97 | 98 | @article{hu2022prognostics, 99 | title={Prognostics and health management: A review from the perspectives of design, development and decision}, 100 | author={Hu, Yang and Miao, Xuewen and Si, Yong and Pan, Ershun and Zio, Enrico}, 101 | journal={Reliability Engineering \& System Safety}, 102 | volume={217}, 103 | pages={108063}, 104 | year={2022}, 105 | publisher={Elsevier} 106 | } 107 | 108 | @article{national2019reproducibility, 109 | title={Reproducibility and replicability in science}, 110 | author={National Academies of Sciences, Engineering, and Medicine and others}, 111 | year={2019}, 112 | publisher={National Academies Press} 113 | } 114 | 115 | @inproceedings{stodden2018enabling, 116 | title={Enabling the verification of computational results: An empirical evaluation of computational reproducibility}, 117 | author={Stodden, Victoria and Krafczyk, Matthew S and Bhaskar, Adhithya}, 118 | booktitle={Proceedings of the First International Workshop on Practical Reproducible Evaluation of Computer Systems}, 119 | pages={1--5}, 120 | year={2018} 121 | } 122 | 123 | @article{gundersen2018reproducible, 124 | title={On reproducible AI: Towards reproducible research, open science, and digital scholarship in AI publications}, 125 | author={Gundersen, Odd Erik and Gil, Yolanda and Aha, David W}, 126 | journal={AI magazine}, 127 | volume={39}, 128 | number={3}, 129 | pages={56--68}, 130 | year={2018} 131 | } 132 | 133 | @book{chollet2021deep, 134 | title={Deep learning with Python}, 135 | author={Chollet, Francois}, 136 | year={2021}, 137 | publisher={Simon and Schuster} 138 | } 139 | 140 | @inproceedings{astfalck2016modelling, 141 | title={A modelling ecosystem for prognostics}, 142 | author={Astfalck, 
Lachlan and Hodkiewicz, Melinda and Keating, Adrian and Cripps, Edward and Pecht, Michael}, 143 | booktitle={Annual Conference of the PHM Society}, 144 | volume={8}, 145 | number={1}, 146 | year={2016} 147 | } 148 | 149 | @article{frachtenberg2022research, 150 | title={Research artifacts and citations in computer systems papers}, 151 | author={Frachtenberg, Eitan}, 152 | journal={PeerJ Computer Science}, 153 | volume={8}, 154 | pages={e887}, 155 | year={2022}, 156 | publisher={PeerJ Inc.} 157 | } 158 | 159 | @article{dorch2015data, 160 | title={The data sharing advantage in astrophysics}, 161 | author={Dorch, Bertil F and Drachen, Thea M and Ellegaard, Ole}, 162 | journal={Proceedings of the International Astronomical Union}, 163 | volume={11}, 164 | number={A29A}, 165 | pages={172--175}, 166 | year={2015}, 167 | publisher={Cambridge University Press} 168 | } 169 | 170 | @article{henneken2011linking, 171 | title={Linking to data-effect on citation rates in astronomy}, 172 | author={Henneken, Edwin A and Accomazzi, Alberto}, 173 | journal={arXiv preprint arXiv:1111.3618}, 174 | year={2011} 175 | } 176 | 177 | @article{piwowar2013data, 178 | title={Data reuse and the open data citation advantage}, 179 | author={Piwowar, Heather A and Vision, Todd J}, 180 | journal={PeerJ}, 181 | volume={1}, 182 | pages={e175}, 183 | year={2013}, 184 | publisher={PeerJ Inc.} 185 | } 186 | 187 | @article{piwowar2007sharing, 188 | title={Sharing detailed research data is associated with increased citation rate}, 189 | author={Piwowar, Heather A and Day, Roger S and Fridsma, Douglas B}, 190 | journal={PloS one}, 191 | volume={2}, 192 | number={3}, 193 | pages={e308}, 194 | year={2007}, 195 | publisher={Public Library of Science San Francisco, USA} 196 | } 197 | 198 | @article{colavizza2020citation, 199 | title={The citation advantage of linking publications to research data}, 200 | author={Colavizza, Giovanni and Hrynaszkiewicz, Iain and Staden, Isla and Whitaker, Kirstie and McGillivray, Barbara}, 201 | journal={PloS one}, 202 | volume={15}, 203 | number={4}, 204 | pages={e0230416}, 205 | year={2020}, 206 | publisher={Public Library of Science San Francisco, CA USA} 207 | } 208 | 209 | @article{fu2019meta, 210 | title={Meta-Research: Releasing a preprint is associated with more attention and citations for the peer-reviewed article}, 211 | author={Fu, Darwin Y and Hughey, Jacob J}, 212 | journal={Elife}, 213 | volume={8}, 214 | pages={e52646}, 215 | year={2019}, 216 | publisher={eLife Sciences Publications Limited} 217 | } 218 | 219 | @article{christensen2019study, 220 | title={A study of the impact of data sharing on article citations using journal policies as a natural experiment}, 221 | author={Christensen, Garret and Dafoe, Allan and Miguel, Edward and Moore, Don A and Rose, Andrew K}, 222 | journal={PLoS One}, 223 | volume={14}, 224 | number={12}, 225 | pages={e0225883}, 226 | year={2019}, 227 | publisher={Public Library of Science San Francisco, CA USA} 228 | } 229 | 230 | @article{wahlquist2018dissemination, 231 | title={Dissemination of novel biostatistics methods: Impact of programming code availability and other characteristics on article citations}, 232 | author={Wahlquist, Amy E and Muhammad, Lutfiyya N and Herbert, Teri Lynn and Ramakrishnan, Viswanathan and Nietert, Paul J}, 233 | journal={PloS one}, 234 | volume={13}, 235 | number={8}, 236 | pages={e0201590}, 237 | year={2018}, 238 | publisher={Public Library of Science San Francisco, CA USA} 239 | } 240 | 241 | @article{zilberman2021computer, 242 
| title={Why computer occupations are behind strong STEM employment growth in the 2019--29 decade}, 243 | author={Zilberman, Alan and Ice, Lindsey}, 244 | journal={Computer}, 245 | volume={4}, 246 | number={5,164.6}, 247 | pages={11--5}, 248 | year={2021} 249 | } 250 | 251 | @article{rainie2017future, 252 | title={The Future of Jobs and Jobs Training.}, 253 | author={Rainie, Lee and Anderson, Janna}, 254 | journal={Pew Research Center}, 255 | year={2017}, 256 | publisher={ERIC} 257 | } 258 | 259 | @inproceedings{hars34working, 260 | title={Working for Free?--Motivations of Participating in Open Source Projects; 2001}, 261 | author={Hars, A and Ou, S}, 262 | booktitle={34th Annual Hawaii International Conference on System Sciences (HICSS-34), Hava{\'\i}}, 263 | pages={25--39} 264 | } 265 | 266 | @article{bitzer2007intrinsic, 267 | title={Intrinsic motivation in open source software development}, 268 | author={Bitzer, J{\"u}rgen and Schrettl, Wolfram and Schr{\"o}der, Philipp JH}, 269 | journal={Journal of comparative economics}, 270 | volume={35}, 271 | number={1}, 272 | pages={160--169}, 273 | year={2007}, 274 | publisher={Elsevier} 275 | } 276 | 277 | @misc{neurodatascience, 278 | url={https://neurodatascience.github.io/QLS612-Overview/}, 279 | title={An introduction to the foundations of neuro data science}, 280 | publisher={McGill University}, } 281 | 282 | @misc{ucberkeleyreproducible, 283 | title={Reproducible and Collaborative Data Science}, 284 | url={https://berkeley-stat159-f17.github.io/stat159-f17/}, 285 | abstractNote={A project-based introduction to statistical data science. 286 | Through lectures, computational laboratories, readings, homeworks, and a 287 | group project, you will learn practical techniques and tools for producing statistically sound and appropriate, reproducible, and verifiable computational answers to scientific 288 | questions. The course emphasizes version control, testing, process 289 | automation, code review, and collaborative programming. 
Software tools 290 | include Bash, Git, Python, Jupyter and LATEX}, 291 | publisher={University of California, Berkeley} } 292 | 293 | @misc{harvard2017reproducible, url={https://pll.harvard.edu/course/principles-statistical-and-computational-tools-reproducible-data-science}, 294 | title={Principles, Statistical and Computational Tools for Reproducible Data Science}, 295 | abstractNote={Learn skills and tools that support data science and reproducible research, to ensure you can trust your own research 296 | results, reproduce them yourself, and communicate them to others.}, 297 | publisher={Harvard University}, 298 | year={2017}, 299 | month={Oct} } 300 | 301 | @article{stodden2013toward, 302 | title={Toward reproducible computational research: an empirical analysis of data and code policy adoption by journals}, 303 | author={Stodden, Victoria and Guo, Peixuan and Ma, Zhaokun}, 304 | journal={PloS one}, 305 | volume={8}, 306 | number={6}, 307 | pages={e67111}, 308 | year={2013}, 309 | publisher={Public Library of Science San Francisco, USA} 310 | } 311 | 312 | @article{zhao2019deep, 313 | title={Deep learning and its applications to machine health monitoring}, 314 | author={Zhao, Rui and Yan, Ruqiang and Chen, Zhenghua and Mao, Kezhi and Wang, Peng and Gao, Robert X}, 315 | journal={Mechanical Systems and Signal Processing}, 316 | volume={115}, 317 | pages={213--237}, 318 | year={2019}, 319 | publisher={Elsevier} 320 | } 321 | 322 | @article{wang2021recent, 323 | title={Recent Advancement of Deep Learning Applications to Machine Condition Monitoring Part 1: A Critical Review}, 324 | author={Wang, Wenyi and Taylor, John and Rees, Robert J}, 325 | journal={Acoustics Australia}, 326 | pages={1--13}, 327 | year={2021}, 328 | publisher={Springer} 329 | } 330 | 331 | @article{lee2007bearing, 332 | title={Bearing data set}, 333 | author={Lee, J and Qiu, H and Yu, G and Lin, Ja and others}, 334 | journal={IMS, University of Cincinnati, NASA Ames Prognostics Data Repository, Rexnord Technical Services}, 335 | year={2007} 336 | } 337 | 338 | @article{agogino2007milling, 339 | title={Milling data set. NASA Ames Prognostics Data Repository}, 340 | author={Agogino, A and Goebel, K}, 341 | journal={Moffett Field, CA}, 342 | year={2007}, 343 | url={https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/} 344 | } 345 | 346 | @article{garcia2021temporal, 347 | title={Temporal signals to images: Monitoring the condition of industrial assets with deep learning image processing algorithms}, 348 | author={Garcia, Gabriel Rodriguez and Michau, Gabriel and Ducoffe, M{\'e}lanie and Gupta, Jayant Sen and Fink, Olga}, 349 | journal={Proceedings of the Institution of Mechanical Engineers, Part O: Journal of Risk and Reliability}, 350 | pages={1748006X21994446}, 351 | year={2021}, 352 | publisher={SAGE Publications Sage UK: London, England} 353 | } 354 | 355 | @article{esteban2019fmriprep, 356 | title={fMRIPrep: a robust preprocessing pipeline for functional MRI}, 357 | author={Esteban, Oscar and Markiewicz, Christopher J and Blair, Ross W and Moodie, Craig A and Isik, A Ilkay and Erramuzpe, Asier and Kent, James D and Goncalves, Mathias and DuPre, Elizabeth and Snyder, Madeleine and others}, 358 | journal={Nature methods}, 359 | volume={16}, 360 | number={1}, 361 | pages={111--116}, 362 | year={2019}, 363 | publisher={Nature Publishing Group} 364 | } 365 | 366 | @software{christian_s_perone_2018_1495335, 367 | author = {Christian S. 
Perone and 368 | cclauss and 369 | Elvis Saravia and 370 | Pedro Lemos Ballester and 371 | MohitTare}, 372 | title = {perone/medicaltorch: Release v0.2}, 373 | month = nov, 374 | year = 2018, 375 | publisher = {Zenodo}, 376 | version = {v0.2}, 377 | doi = {10.5281/zenodo.1495335}, 378 | url = {https://doi.org/10.5281/zenodo.1495335} 379 | } 380 | 381 | @INPROCEEDINGS{astroML, 382 | author={{Vanderplas}, J.T. and {Connolly}, A.J. 383 | and {Ivezi{\'c}}, {\v Z}. and {Gray}, A.}, 384 | booktitle={Conference on Intelligent Data Understanding (CIDU)}, 385 | title={Introduction to astroML: Machine learning for astrophysics}, 386 | month={oct.}, 387 | pages={47 -54}, 388 | doi={10.1109/CIDU.2012.6382200}, 389 | year={2012}} 390 | 391 | @incollection{NEURIPS2019_9015, 392 | title = {PyTorch: An Imperative Style, High-Performance Deep Learning Library}, 393 | author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith}, 394 | booktitle = {Advances in Neural Information Processing Systems 32}, 395 | editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett}, 396 | pages = {8024--8035}, 397 | year = {2019}, 398 | publisher = {Curran Associates, Inc.}, 399 | url = {http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf} 400 | } 401 | 402 | @book{Bird_Natural_Language_Processing_2009, 403 | author = {Bird, Steven and Klein, Ewan and Loper, Edward}, 404 | publisher = {O'Reilly Media, Inc.}, 405 | title = {{Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit}}, 406 | year = {2009}} 407 | 408 | @article{wilson2014software, 409 | title={Software Carpentry: lessons learned}, 410 | author={Wilson, Greg}, 411 | journal={F1000Research}, 412 | volume={3}, 413 | year={2014}, 414 | publisher={Faculty of 1000 Ltd} 415 | } 416 | 417 | -------------------------------------------------------------------------------- /src/pyphm/datasets/milling.py: -------------------------------------------------------------------------------- 1 | import scipy.io as sio 2 | import numpy as np 3 | import pandas as pd 4 | from pathlib import Path 5 | from .pyphm import PHMDataset 6 | from typing import Any, Callable, List, Optional, Tuple 7 | import pkg_resources 8 | from .utils import ( 9 | download_and_extract_archive, 10 | extract_archive, 11 | check_integrity, 12 | ) 13 | import os 14 | from urllib.error import URLError 15 | 16 | """ 17 | Contains the data prep class for the UC-Berkely milling data set. 18 | 19 | Also contains helper functions associated with the milling data set. 20 | """ 21 | 22 | 23 | ############################################################################### 24 | # Data Prep Classes 25 | ############################################################################### 26 | class MillingDataLoad(PHMDataset): 27 | """ 28 | Load the UC Berkely milling data set from .mat file, and download if necessary. 29 | 30 | Args: 31 | root (string): Root directory to place all the data sets. 32 | 33 | dataset_folder_name (string): Name of folder containing raw data. 34 | This folder will be created in the root directory if not present. 
35 | 36 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository. 37 | 38 | """ 39 | 40 | resources = [ 41 | { 42 | "name": "aws", 43 | "url": "https://phm-datasets.s3.amazonaws.com/NASA/", 44 | "files": [ 45 | { 46 | "filename": "3.+Milling.zip", 47 | "md5": "4da3afb0aa50cb3dcdd8e20ed1ed1c7c", 48 | } 49 | ], 50 | }, 51 | { 52 | "name": "github", 53 | "url": "https://github.com/tvhahn/Manufacturing-Data-Science-with-Python/raw/master/Data%20Sets/milling_uc_berkeley/raw/", 54 | "files": [ 55 | { 56 | "filename": "mill.zip", 57 | "md5": "81d821fdef812183a7d38b6f83f7cefa", 58 | } 59 | ], 60 | }, 61 | ] 62 | 63 | def __init__( 64 | self, 65 | root: Path, 66 | dataset_folder_name: str = "milling", 67 | data_file_name: str = "mill.mat", 68 | download: bool = False, 69 | data: np.ndarray = None, 70 | ) -> None: 71 | super().__init__(root, dataset_folder_name) 72 | 73 | self.dataset_folder_path = self.root / self.dataset_folder_name 74 | self.data_file_name = data_file_name 75 | 76 | if download: 77 | self.download() 78 | 79 | data_file_path = self.dataset_folder_path / self.data_file_name 80 | # assert that data_file_path exists 81 | assert data_file_path.exists(), f"{data_file_path} does not exist." 82 | 83 | self.data = self.load_mat() 84 | 85 | def _check_exists(self) -> bool: 86 | for source in self.resources: 87 | for file in source["files"]: 88 | file_name = file["filename"] 89 | file_path = self.dataset_folder_path / file_name 90 | if not check_integrity(file_path, file["md5"]): 91 | return False 92 | return True 93 | 94 | 95 | def download(self) -> None: 96 | """Download the data files from their sources if they don't exist already.""" 97 | 98 | if self._check_exists(): 99 | print("Files already downloaded and verified.") 100 | return 101 | 102 | # Ensure the dataset folder exists 103 | self.dataset_folder_path.mkdir(parents=True, exist_ok=True) 104 | 105 | successful_download = False 106 | 107 | for source in self.resources: 108 | all_files_downloaded = True # Assume success, prove otherwise 109 | 110 | for file in source["files"]: 111 | file_name = file["filename"] 112 | md5 = file["md5"] 113 | file_path = self.dataset_folder_path / file_name 114 | 115 | # Check if the file already exists and is verified 116 | if check_integrity(file_path, md5): 117 | print(f"{file_name} already exists and is verified.") 118 | continue # Skip to the next file as this one is already handled 119 | 120 | # Construct the URL for downloading 121 | url = f"{source['url']}{file_name}" 122 | 123 | try: 124 | print(f"Attempting to download {url}") 125 | download_and_extract_archive( 126 | url, 127 | download_root=str(self.dataset_folder_path), 128 | filename=file_name, 129 | md5=md5, 130 | remove_finished=True, 131 | ) 132 | # After successful download and extraction, check for and extract any nested archive 133 | self.check_and_extract_nested(file_path.parent) 134 | 135 | except URLError as error: 136 | print(f"Failed to download {file_name} from {source['name']}:\n{error}") 137 | all_files_downloaded = False # Mark as failed to trigger another source attempt 138 | break # Exit the file loop to try the next source 139 | 140 | if all_files_downloaded: 141 | successful_download = True 142 | print(f"Successfully downloaded all files from {source['name']}") 143 | break # Exit the source loop since we've successfully downloaded from this source 144 | 145 | if not successful_download: 146 | raise RuntimeError("Failed to download files from all sources.") 147 | 148 | def 
check_and_extract_nested(self, directory: Path) -> None: 149 | """Check for and extract any nested archives in the given directory.""" 150 | for item in directory.iterdir(): 151 | if item.is_dir(): 152 | # Check each directory for nested archives 153 | for nested_item in item.iterdir(): 154 | if nested_item.suffix in ['.zip', '.tar', '.gz']: 155 | print(f"Found nested archive: {nested_item}") 156 | extract_archive(str(nested_item), str(directory), remove_finished=True) 157 | 158 | 159 | 160 | def load_mat(self) -> np.ndarray: 161 | """Load the mat file and return the data as a numpy array.""" 162 | data = sio.loadmat(self.dataset_folder_path / self.data_file_name, struct_as_record=True) 163 | return data["mill"] 164 | 165 | 166 | class MillingPrepMethodA(MillingDataLoad): 167 | """ 168 | Class used to prepare the UC Berkeley milling dataset before feature engineering or machine learning. 169 | Method is described in the paper: 170 | 171 | `Self-supervised learning for tool wear monitoring with a disentangled-variational-autoencoder` 172 | by von Hahn and Mechefske, 2021 173 | 174 | Args: 175 | root (string): Root directory to place all the data sets. (likely the raw data folder) 176 | 177 | dataset_folder_name (string): Name of folder (within root) containing raw data. 178 | This folder will be created in the root directory if not present. 179 | 180 | download (bool): If True, the data will be downloaded from the NASA Prognostics Repository. 181 | 182 | path_csv_labels (Path, optional): Path to the csv of the label dataframe. 183 | If not provided, the 'milling_labels_with_tool_class.csv' will be used, provided in the 184 | PyPHM package. 185 | 186 | window_len (int): Length of the window to be used for the sliding window. 187 | 188 | stride (int): Amount to move (stride) between individual windows of data. 189 | 190 | cut_drop_list (list, optional): List of cut numbers to drop. cut_no 17 and 94 are erroneous and 191 | will be dropped by default.
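    Example:
        A minimal usage sketch (the root path below is hypothetical; the remaining
        arguments keep the defaults documented above)::

            from pathlib import Path
            from pyphm.datasets.milling import MillingPrepMethodA

            # download mill.mat if needed, then window each cut into fixed-length samples
            mill = MillingPrepMethodA(root=Path("data/raw"), download=True, window_len=64, stride=64)

            x, y = mill.create_xy_arrays()    # x shape: [no. samples, window_len, 6 signals]
            df = mill.create_xy_dataframe()   # flat dataframe, e.g. for TSFresh feature engineering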
192 | """ 193 | 194 | def __init__( 195 | self, 196 | root: Path, 197 | dataset_folder_name: str = "milling", 198 | dataset_folder_path: Path = None, 199 | data_file_name: str = "mill.mat", 200 | download: bool = False, 201 | data: np.ndarray = None, 202 | path_csv_labels: Path = None, 203 | window_len: int = 64, 204 | stride: int = 64, 205 | cut_drop_list: List[int] = [17, 94], 206 | ) -> None: 207 | super().__init__(root, dataset_folder_name, data_file_name, download, data) 208 | 209 | self.window_len = window_len # size of the window 210 | self.stride = stride # stride between windows 211 | self.cut_drop_list = cut_drop_list # list of cut numbers to be dropped 212 | 213 | if path_csv_labels is not None: 214 | self.path_csv_labels = path_csv_labels 215 | else: 216 | # path of pyphm source directory using pathlib 217 | self.path_csv_labels = Path( 218 | pkg_resources.resource_filename( 219 | "pyphm", "datasets/auxilary_metadata/milling_labels_with_tool_class.csv" 220 | ) 221 | ) 222 | 223 | # load the labels dataframe 224 | self.df_labels = pd.read_csv(self.path_csv_labels) 225 | 226 | if self.cut_drop_list is not None: 227 | self.df_labels.drop(self.cut_drop_list, inplace=True) # drop the cuts that are bad 228 | 229 | self.df_labels.reset_index(drop=True, inplace=True) # reset the index 230 | 231 | self.field_names = self.data.dtype.names 232 | 233 | self.signal_names = self.field_names[7:][::-1] 234 | 235 | def create_labels(self): 236 | """Function that will create the label dataframe from the mill data set 237 | 238 | Only needed if the dataframe with the labels is not provided. 239 | """ 240 | 241 | # create empty dataframe for the labels 242 | df_labels = pd.DataFrame() 243 | 244 | # get the labels from the original .mat file and put in dataframe 245 | for i in range(7): 246 | # list for storing the label data for each field 247 | x = [] 248 | 249 | # iterate through each of the unique cuts 250 | for j in range(167): 251 | x.append(self.data[0, j][i][0][0]) 252 | x = np.array(x) 253 | df_labels[str(i)] = x 254 | 255 | # add column names to the dataframe 256 | df_labels.columns = self.field_names[0:7] 257 | 258 | # create a column with the unique cut number 259 | df_labels["cut_no"] = [i for i in range(167)] 260 | 261 | def tool_state(cols): 262 | """Add the label to the cut. 263 | 264 | Categories are: 265 | Healthy State (label=0): 0~0.2mm flank wear 266 | Degradation State (label=1): 0.2~0.7mm flank wear 267 | Failure State (label=2): >0.7mm flank wear 268 | """ 269 | # pass in the tool wear, VB, column 270 | vb = cols 271 | 272 | if vb < 0.2: 273 | return 0 274 | elif vb >= 0.2 and vb < 0.7: 275 | return 1 276 | elif pd.isnull(vb): 277 | pass 278 | else: 279 | return 2 280 | 281 | # apply the label to the dataframe 282 | df_labels["tool_class"] = df_labels["VB"].apply(tool_state) 283 | 284 | return df_labels 285 | 286 | def create_data_array(self, cut_no): 287 | """Create an array from an individual cut sample. 288 | 289 | Parameters 290 | =========== 291 | cut_no : int 292 | Index of the cut to be used. 293 | 294 | Returns 295 | =========== 296 | sub_cut_array : np.array 297 | Array of the cut samples. Shape of [no. samples, sample len, features/sample] 298 | 299 | sub_cut_labels : np.array 300 | Array of the labels for the cut samples. Shape of [# samples, # features/sample] 301 | 302 | """ 303 | 304 | assert cut_no in self.df_labels["cut_no"].values, "Cut number must be in the dataframe" 305 | 306 | # create a numpy array of the cut 307 | # with a final array shape like [no.
cuts, len cuts, no. signals] 308 | cut = self.data[0, cut_no] 309 | for i, signal_name in enumerate(self.signal_names): 310 | if i == 0: 311 | cut_array = cut[signal_name].reshape((9000, 1)) 312 | else: 313 | cut_array = np.concatenate((cut_array, cut[signal_name].reshape((9000, 1))), axis=1) 314 | 315 | # select the start and end of the cut 316 | start = self.df_labels[self.df_labels["cut_no"] == cut_no]["window_start"].values[0] 317 | end = self.df_labels[self.df_labels["cut_no"] == cut_no]["window_end"].values[0] 318 | cut_array = cut_array[start:end, :] 319 | 320 | # instantiate the "temporary" lists to store the sub-cuts and metadata 321 | sub_cut_list = [] 322 | sub_cut_id_list = [] 323 | sub_cut_label_list = [] 324 | 325 | # get the labels for the cut 326 | label = self.df_labels[self.df_labels["cut_no"] == cut_no]["tool_class"].values[0] 327 | 328 | # fit the strided windows into the dummy_array until the length 329 | # of the window does not equal the proper length (better way to do this???) 330 | for i in range(cut_array.shape[0]): 331 | windowed_signal = cut_array[i * self.stride : i * self.stride + self.window_len] 332 | 333 | # if the windowed signal is the proper length, add it to the list 334 | if windowed_signal.shape == (self.window_len, 6): 335 | sub_cut_list.append(windowed_signal) 336 | 337 | # create sub_cut_id fstring to keep track of the cut_id and the window_id 338 | sub_cut_id_list.append(f"{cut_no}_{i}") 339 | 340 | # create the sub_cut_label and append it to the list 341 | sub_cut_label_list.append(int(label)) 342 | 343 | else: 344 | break 345 | 346 | sub_cut_array = np.array(sub_cut_list) 347 | 348 | sub_cut_ids = np.expand_dims(np.array(sub_cut_id_list, dtype=str), axis=1) 349 | sub_cut_ids = np.repeat(sub_cut_ids, sub_cut_array.shape[1], axis=1) 350 | 351 | sub_cut_labels = np.expand_dims(np.array(sub_cut_label_list, dtype=int), axis=1) 352 | sub_cut_labels = np.repeat(sub_cut_labels, sub_cut_array.shape[1], axis=1) 353 | 354 | # take the length of the signals in the sub_cut_array 355 | # and divide it by the frequency (250 Hz) to get the time (seconds) of each sub-cut 356 | sub_cut_times = np.expand_dims(np.arange(0, sub_cut_array.shape[1]) / 250.0, axis=0) 357 | sub_cut_times = np.repeat( 358 | sub_cut_times, 359 | sub_cut_array.shape[0], 360 | axis=0, 361 | ) 362 | 363 | sub_cut_labels_ids_times = np.stack((sub_cut_labels, sub_cut_ids, sub_cut_times), axis=2) 364 | 365 | return ( 366 | sub_cut_array, 367 | sub_cut_labels, 368 | sub_cut_ids, 369 | sub_cut_times, 370 | sub_cut_labels_ids_times, 371 | ) 372 | 373 | def create_xy_arrays(self): 374 | """Create the x and y arrays used in deep learning. 375 | 376 | Returns 377 | =========== 378 | x_array : np.array 379 | Array of the cut samples. Shape of [no. samples, sample len, features/sample] 380 | 381 | y_array : np.array 382 | Array of the labels for the cut samples. Shape of [no. samples, sample len, label/ids/times] 383 | Use y[:,0,:], for example, to get the y in a shape of [no. samples, label/ids/times] 384 | ( e.g. will be shape (no. 
samples, 3) ) 385 | 386 | """ 387 | 388 | # create a list to store the x and y arrays 389 | x = [] # instantiate X's 390 | y_labels_ids_times = [] # instantiate y's 391 | 392 | # iterate throught the df_labels 393 | for i in self.df_labels.itertuples(): 394 | ( 395 | sub_cut_array, 396 | sub_cut_labels, 397 | sub_cut_ids, 398 | sub_cut_times, 399 | sub_cut_labels_ids_times, 400 | ) = self.create_data_array(i.cut_no) 401 | 402 | x.append(sub_cut_array) 403 | y_labels_ids_times.append(sub_cut_labels_ids_times) 404 | 405 | return np.vstack(x), np.vstack(y_labels_ids_times) 406 | 407 | def create_xy_dataframe(self): 408 | """ 409 | Create a flat dataframe (2D array) of the x and y arrays. 410 | 411 | Amenable for use with TSFresh for feature engineering. 412 | 413 | Returns 414 | =========== 415 | df : pd.DataFrame 416 | Single flat dataframe containing each sample and its labels. 417 | 418 | """ 419 | 420 | x, y_labels_ids_times = self.create_xy_arrays() # create the x and y arrays 421 | 422 | # concatenate the x and y arrays and reshape them to be a flat array (2D) 423 | x_labels = np.reshape(np.concatenate((x, y_labels_ids_times), axis=2), (-1, 9)) 424 | 425 | # define the column names and the data types 426 | col_names = [s.lower() for s in list(self.signal_names)] + [ 427 | "tool_class", 428 | "cut_id", 429 | "time", 430 | ] 431 | 432 | col_names_ordered = [ 433 | "cut_id", 434 | "cut_no", 435 | "case", 436 | "time", 437 | "ae_spindle", 438 | "ae_table", 439 | "vib_spindle", 440 | "vib_table", 441 | "smcdc", 442 | "smcac", 443 | "tool_class", 444 | ] 445 | 446 | col_dtype = [ 447 | str, 448 | int, 449 | int, 450 | np.float32, 451 | np.float32, 452 | np.float32, 453 | np.float32, 454 | np.float32, 455 | np.float32, 456 | np.float32, 457 | int, 458 | ] 459 | 460 | col_dtype_dict = dict(zip(col_names_ordered, col_dtype)) 461 | 462 | # create a dataframe from the x and y arrays 463 | df = pd.DataFrame(x_labels, columns=col_names, dtype=str) 464 | 465 | # split the cut_id by "_" and take the first element (cut_no) 466 | df["cut_no"] = df["cut_id"].str.split("_").str[0] 467 | 468 | # get the case from each cut_no using the df_labels 469 | df = df.merge( 470 | self.df_labels[["cut_no", "case"]].astype(dtype=str), 471 | on="cut_no", 472 | how="left", 473 | ) 474 | 475 | df = df[col_names_ordered].astype(col_dtype_dict) # reorder the columns 476 | 477 | return df 478 | -------------------------------------------------------------------------------- /src/pyphm/datasets/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | 4 | The utils.py is Copyright (c) Soumith Chintala 2016, (from pytorch/vision) 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from 19 | this software without specific prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 25 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 29 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | """ 32 | 33 | import bz2 34 | import gzip 35 | import hashlib 36 | import itertools 37 | import lzma 38 | import os 39 | import os.path 40 | import pathlib 41 | from pathlib import Path 42 | import re 43 | import tarfile 44 | import rarfile # needed for IMS dataset 45 | import py7zr # needed for IMS dataset 46 | import urllib 47 | import urllib.error 48 | import urllib.request 49 | import zipfile 50 | from typing import Any, Callable, List, Iterable, Optional, TypeVar, Dict, IO, Tuple, Iterator 51 | from urllib.parse import urlparse 52 | import gdown 53 | 54 | import requests 55 | from tqdm.auto import tqdm 56 | 57 | 58 | def _download_file_from_remote_location(fpath: str, url: str) -> None: 59 | pass 60 | 61 | 62 | def _is_remote_location_available() -> bool: 63 | return False 64 | 65 | USER_AGENT = "PyPHM" 66 | 67 | 68 | def _urlretrieve(url: str, filename: str, chunk_size: int = 1024) -> None: 69 | with open(filename, "wb") as fh: 70 | with urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": USER_AGENT})) as response: 71 | with tqdm(total=response.length) as pbar: 72 | for chunk in iter(lambda: response.read(chunk_size), ""): 73 | if not chunk: 74 | break 75 | pbar.update(chunk_size) 76 | fh.write(chunk) 77 | 78 | 79 | def gen_bar_updater() -> Callable[[int, int, int], None]: 80 | pbar = tqdm(total=None) 81 | 82 | def bar_update(count, block_size, total_size): 83 | if pbar.total is None and total_size: 84 | pbar.total = total_size 85 | progress_bytes = count * block_size 86 | pbar.update(progress_bytes - pbar.n) 87 | 88 | return bar_update 89 | 90 | 91 | def calculate_md5(fpath: Path, chunk_size: int = 1024 * 1024) -> str: 92 | md5 = hashlib.md5() 93 | with open(fpath, "rb") as f: 94 | for chunk in iter(lambda: f.read(chunk_size), b""): 95 | md5.update(chunk) 96 | return md5.hexdigest() 97 | 98 | 99 | def check_md5(fpath: Path, md5: str, **kwargs: Any) -> bool: 100 | return md5 == calculate_md5(fpath, **kwargs) 101 | 102 | 103 | def check_integrity(fpath: Path, md5: Optional[str] = None) -> bool: 104 | fpath = Path(fpath) 105 | if not fpath.exists() and not fpath.is_file(): 106 | return False 107 | if md5 is None: 108 | return True 109 | return check_md5(fpath, md5) 110 | 111 | 112 | def _get_redirect_url(url: str, max_hops: int = 3) -> str: 113 | initial_url = url 114 | headers = {"Method": "HEAD", "User-Agent": USER_AGENT} 115 | 116 | for _ in range(max_hops + 1): 117 | with urllib.request.urlopen(urllib.request.Request(url, headers=headers)) as response: 118 | if response.url == url or response.url is None: 119 | return url 120 | 121 | url = response.url 122 | else: 123 | raise RecursionError( 124 | f"Request to {initial_url} exceeded {max_hops} redirects. 
The last redirect points to {url}." 125 | ) 126 | 127 | 128 | def _get_google_drive_file_id(url: str) -> Optional[str]: 129 | parts = urlparse(url) 130 | 131 | if re.match(r"(drive|docs)[.]google[.]com", parts.netloc) is None: 132 | return None 133 | 134 | match = re.match(r"/file/d/(?P[^/]*)", parts.path) 135 | if match is None: 136 | return None 137 | 138 | return match.group("id") 139 | 140 | 141 | def download_url( 142 | url: str, root: str, filename: Optional[str] = None, md5: Optional[str] = None, max_redirect_hops: int = 3 143 | ) -> None: 144 | """Download a file from a url and place it in root. 145 | 146 | Args: 147 | url (str): URL to download file from 148 | root (str): Directory to place downloaded file in 149 | filename (str, optional): Name to save the file under. If None, use the basename of the URL 150 | md5 (str, optional): MD5 checksum of the download. If None, do not check 151 | max_redirect_hops (int, optional): Maximum number of redirect hops allowed 152 | """ 153 | root = os.path.expanduser(root) 154 | if not filename: 155 | filename = os.path.basename(url) 156 | fpath = os.path.join(root, filename) 157 | 158 | os.makedirs(root, exist_ok=True) 159 | 160 | # check if file is already present locally 161 | if check_integrity(fpath, md5): 162 | print("Using downloaded and verified file: " + fpath) 163 | return 164 | 165 | if _is_remote_location_available(): 166 | _download_file_from_remote_location(fpath, url) 167 | else: 168 | # expand redirect chain if needed 169 | url = _get_redirect_url(url, max_hops=max_redirect_hops) 170 | 171 | # check if file is located on Google Drive 172 | file_id = _get_google_drive_file_id(url) 173 | if file_id is not None: 174 | print("Goolgle drive file id:", file_id) 175 | return gdown.download(id=file_id, output=str(Path(root) / filename), quiet=False) 176 | # return download_file_from_google_drive(file_id, root, filename, md5) 177 | 178 | # download the file 179 | try: 180 | print("Downloading " + url + " to " + fpath) 181 | _urlretrieve(url, fpath) 182 | except (urllib.error.URLError, OSError) as e: # type: ignore[attr-defined] 183 | if url[:5] == "https": 184 | url = url.replace("https:", "http:") 185 | print("Failed download. Trying https -> http instead. Downloading " + url + " to " + fpath) 186 | _urlretrieve(url, fpath) 187 | else: 188 | raise e 189 | 190 | # check integrity of downloaded file 191 | if not check_integrity(fpath, md5): 192 | raise RuntimeError("File not found or corrupted.") 193 | 194 | 195 | def list_dir(root: str, prefix: bool = False) -> List[str]: 196 | """List all directories at a given root 197 | 198 | Args: 199 | root (str): Path to directory whose folders need to be listed 200 | prefix (bool, optional): If true, prepends the path to each result, otherwise 201 | only returns the name of the directories found 202 | """ 203 | root = os.path.expanduser(root) 204 | directories = [p for p in os.listdir(root) if os.path.isdir(os.path.join(root, p))] 205 | if prefix is True: 206 | directories = [os.path.join(root, d) for d in directories] 207 | return directories 208 | 209 | 210 | def list_files(root: str, suffix: str, prefix: bool = False) -> List[str]: 211 | """List all files ending with a suffix at a given root 212 | 213 | Args: 214 | root (str): Path to directory whose folders need to be listed 215 | suffix (str or tuple): Suffix of the files to match, e.g. '.png' or ('.jpg', '.png'). 
216 | It uses the Python "str.endswith" method and is passed directly 217 | prefix (bool, optional): If true, prepends the path to each result, otherwise 218 | only returns the name of the files found 219 | """ 220 | root = os.path.expanduser(root) 221 | files = [p for p in os.listdir(root) if os.path.isfile(os.path.join(root, p)) and p.endswith(suffix)] 222 | if prefix is True: 223 | files = [os.path.join(root, d) for d in files] 224 | return files 225 | 226 | 227 | def _quota_exceeded(first_chunk: bytes) -> bool: 228 | try: 229 | return "Google Drive - Quota exceeded" in first_chunk.decode() 230 | except UnicodeDecodeError: 231 | return False 232 | 233 | 234 | def download_file_from_google_drive(file_id: str, root: str, filename: Optional[str] = None, md5: Optional[str] = None): 235 | """Download a Google Drive file from and place it in root. 236 | 237 | Args: 238 | file_id (str): id of file to be downloaded 239 | root (str): Directory to place downloaded file in 240 | filename (str, optional): Name to save the file under. If None, use the id of the file. 241 | md5 (str, optional): MD5 checksum of the download. If None, do not check 242 | """ 243 | # Based on https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url 244 | 245 | url = "https://docs.google.com/uc?export=download" 246 | 247 | root = os.path.expanduser(root) 248 | if not filename: 249 | filename = file_id 250 | fpath = os.path.join(root, filename) 251 | 252 | os.makedirs(root, exist_ok=True) 253 | 254 | if os.path.isfile(fpath) and check_integrity(fpath, md5): 255 | print("Using downloaded and verified file: " + fpath) 256 | else: 257 | session = requests.Session() 258 | 259 | response = session.get(url, params={"id": file_id}, stream=True) 260 | token = _get_confirm_token(response) 261 | 262 | if token: 263 | params = {"id": file_id, "confirm": token} 264 | response = session.get(url, params=params, stream=True) 265 | 266 | # Ideally, one would use response.status_code to check for quota limits, but google drive is not consistent 267 | # with their own API, refer https://github.com/pytorch/vision/issues/2992#issuecomment-730614517. 268 | # Should this be fixed at some place in future, one could refactor the following to no longer rely on decoding 269 | # the first_chunk of the payload 270 | response_content_generator = response.iter_content(32768) 271 | first_chunk = None 272 | while not first_chunk: # filter out keep-alive new chunks 273 | first_chunk = next(response_content_generator) 274 | 275 | if _quota_exceeded(first_chunk): 276 | msg = ( 277 | f"The daily quota of the file {filename} is exceeded and it " 278 | f"can't be downloaded. This is a limitation of Google Drive " 279 | f"and can only be overcome by trying again later." 
280 | ) 281 | raise RuntimeError(msg) 282 | 283 | _save_response_content(itertools.chain((first_chunk,), response_content_generator), fpath) 284 | response.close() 285 | 286 | 287 | def _get_confirm_token(response: requests.models.Response) -> Optional[str]: 288 | for key, value in response.cookies.items(): 289 | if key.startswith("download_warning"): 290 | return value 291 | 292 | return None 293 | 294 | 295 | def _save_response_content( 296 | response_gen: Iterator[bytes], 297 | destination: str, 298 | ) -> None: 299 | with open(destination, "wb") as f: 300 | pbar = tqdm(total=None) 301 | progress = 0 302 | 303 | for chunk in response_gen: 304 | if chunk: # filter out keep-alive new chunks 305 | f.write(chunk) 306 | progress += len(chunk) 307 | pbar.update(progress - pbar.n) 308 | pbar.close() 309 | 310 | 311 | def _extract_tar(from_path: str, to_path: str, compression: Optional[str]) -> None: 312 | with tarfile.open(from_path, f"r:{compression[1:]}" if compression else "r") as tar: 313 | tar.extractall(to_path) 314 | 315 | 316 | def _extract_rar(from_path: str, to_path: str, compression: Optional[str]) -> None: 317 | with rarfile.RarFile(from_path, f"r:{compression[1:]}" if compression else "r") as rar: 318 | rar.extractall(to_path) 319 | 320 | 321 | def _extract_7z(from_path: str, to_path: str, compression: Optional[str]) -> None: 322 | with py7zr.SevenZipFile(from_path, f"r:{compression[1:]}" if compression else "r") as z: 323 | z.extractall(to_path) 324 | 325 | 326 | _ZIP_COMPRESSION_MAP: Dict[str, int] = { 327 | ".bz2": zipfile.ZIP_BZIP2, 328 | ".xz": zipfile.ZIP_LZMA, 329 | } 330 | 331 | 332 | def _extract_zip(from_path: str, to_path: str, compression: Optional[str]) -> None: 333 | with zipfile.ZipFile( 334 | from_path, "r", compression=_ZIP_COMPRESSION_MAP[compression] if compression else zipfile.ZIP_STORED 335 | ) as zip: 336 | zip.extractall(to_path) 337 | 338 | 339 | _ARCHIVE_EXTRACTORS: Dict[str, Callable[[str, str, Optional[str]], None]] = { 340 | ".tar": _extract_tar, 341 | ".zip": _extract_zip, 342 | ".rar": _extract_rar, 343 | ".7z": _extract_7z, 344 | } 345 | _COMPRESSED_FILE_OPENERS: Dict[str, Callable[..., IO]] = { 346 | ".bz2": bz2.open, 347 | ".gz": gzip.open, 348 | ".xz": lzma.open, 349 | } 350 | _FILE_TYPE_ALIASES: Dict[str, Tuple[Optional[str], Optional[str]]] = { 351 | ".tbz": (".tar", ".bz2"), 352 | ".tbz2": (".tar", ".bz2"), 353 | ".tgz": (".tar", ".gz"), 354 | } 355 | 356 | 357 | def _detect_file_type(file: str) -> Tuple[str, Optional[str], Optional[str]]: 358 | """Detect the archive type and/or compression of a file. 359 | 360 | Args: 361 | file (str): the filename 362 | 363 | Returns: 364 | (tuple): tuple of suffix, archive type, and compression 365 | 366 | Raises: 367 | RuntimeError: if file has no suffix or suffix is not supported 368 | """ 369 | suffixes = pathlib.Path(file).suffixes 370 | if not suffixes: 371 | raise RuntimeError( 372 | f"File '{file}' has no suffixes that could be used to detect the archive type and compression." 
373 | ) 374 | suffix = suffixes[-1] 375 | 376 | # check if the suffix is a known alias 377 | if suffix in _FILE_TYPE_ALIASES: 378 | return (suffix, *_FILE_TYPE_ALIASES[suffix]) 379 | 380 | # check if the suffix is an archive type 381 | if suffix in _ARCHIVE_EXTRACTORS: 382 | return suffix, suffix, None 383 | 384 | # check if the suffix is a compression 385 | if suffix in _COMPRESSED_FILE_OPENERS: 386 | # check for suffix hierarchy 387 | if len(suffixes) > 1: 388 | suffix2 = suffixes[-2] 389 | 390 | # check if the suffix2 is an archive type 391 | if suffix2 in _ARCHIVE_EXTRACTORS: 392 | return suffix2 + suffix, suffix2, suffix 393 | 394 | return suffix, None, suffix 395 | 396 | valid_suffixes = sorted(set(_FILE_TYPE_ALIASES) | set(_ARCHIVE_EXTRACTORS) | set(_COMPRESSED_FILE_OPENERS)) 397 | raise RuntimeError(f"Unknown compression or archive type: '{suffix}'.\nKnown suffixes are: '{valid_suffixes}'.") 398 | 399 | 400 | def _decompress(from_path: str, to_path: Optional[str] = None, remove_finished: bool = False) -> str: 401 | r"""Decompress a file. 402 | 403 | The compression is automatically detected from the file name. 404 | 405 | Args: 406 | from_path (str): Path to the file to be decompressed. 407 | to_path (str): Path to the decompressed file. If omitted, ``from_path`` without compression extension is used. 408 | remove_finished (bool): If ``True``, remove the file after the extraction. 409 | 410 | Returns: 411 | (str): Path to the decompressed file. 412 | """ 413 | suffix, archive_type, compression = _detect_file_type(from_path) 414 | if not compression: 415 | raise RuntimeError(f"Couldn't detect a compression from suffix {suffix}.") 416 | 417 | if to_path is None: 418 | to_path = from_path.replace(suffix, archive_type if archive_type is not None else "") 419 | 420 | # We don't need to check for a missing key here, since this was already done in _detect_file_type() 421 | compressed_file_opener = _COMPRESSED_FILE_OPENERS[compression] 422 | 423 | with compressed_file_opener(from_path, "rb") as rfh, open(to_path, "wb") as wfh: 424 | wfh.write(rfh.read()) 425 | 426 | if remove_finished: 427 | os.remove(from_path) 428 | 429 | return to_path 430 | 431 | 432 | def extract_archive(from_path: str, to_path: Optional[str] = None, remove_finished: bool = False) -> str: 433 | """Extract an archive. 434 | 435 | The archive type and a possible compression is automatically detected from the file name. If the file is compressed 436 | but not an archive the call is dispatched to :func:`decompress`. 437 | 438 | Args: 439 | from_path (str): Path to the file to be extracted. 440 | to_path (str): Path to the directory the file will be extracted to. If omitted, the directory of the file is 441 | used. 442 | remove_finished (bool): If ``True``, remove the file after the extraction. 443 | 444 | Returns: 445 | (str): Path to the directory the file was extracted to. 
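    Example:
        A minimal sketch (the archive path below is hypothetical)::

            # extract mill.zip into the directory that contains it, keeping the original archive
            extracted_dir = extract_archive("data/raw/milling/mill.zip", remove_finished=False)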
446 | """ 447 | if to_path is None: 448 | to_path = os.path.dirname(from_path) 449 | 450 | suffix, archive_type, compression = _detect_file_type(from_path) 451 | if not archive_type: 452 | return _decompress( 453 | from_path, 454 | os.path.join(to_path, os.path.basename(from_path).replace(suffix, "")), 455 | remove_finished=remove_finished, 456 | ) 457 | 458 | # We don't need to check for a missing key here, since this was already done in _detect_file_type() 459 | extractor = _ARCHIVE_EXTRACTORS[archive_type] 460 | 461 | extractor(from_path, to_path, compression) 462 | if remove_finished: 463 | os.remove(from_path) 464 | 465 | return to_path 466 | 467 | 468 | def download_and_extract_archive( 469 | url: str, 470 | download_root: str, 471 | extract_root: Optional[str] = None, 472 | filename: Optional[str] = None, 473 | md5: Optional[str] = None, 474 | remove_finished: bool = False, 475 | ) -> None: 476 | download_root = os.path.expanduser(download_root) 477 | if extract_root is None: 478 | extract_root = download_root 479 | if not filename: 480 | filename = os.path.basename(url) 481 | 482 | download_url(url, download_root, filename, md5) 483 | 484 | archive = os.path.join(download_root, filename) 485 | print(f"Extracting {archive} to {extract_root}") 486 | extract_archive(archive, extract_root, remove_finished) 487 | 488 | 489 | def iterable_to_str(iterable: Iterable) -> str: 490 | return "'" + "', '".join([str(item) for item in iterable]) + "'" 491 | 492 | 493 | T = TypeVar("T", str, bytes) 494 | 495 | 496 | # def verify_str_arg( 497 | # value: T, 498 | # arg: Optional[str] = None, 499 | # valid_values: Iterable[T] = None, 500 | # custom_msg: Optional[str] = None, 501 | # ) -> T: 502 | # if not isinstance(value, torch._six.string_classes): 503 | # if arg is None: 504 | # msg = "Expected type str, but got type {type}." 505 | # else: 506 | # msg = "Expected type str for argument {arg}, but got type {type}." 507 | # msg = msg.format(type=type(value), arg=arg) 508 | # raise ValueError(msg) 509 | 510 | # if valid_values is None: 511 | # return value 512 | 513 | # if value not in valid_values: 514 | # if custom_msg is not None: 515 | # msg = custom_msg 516 | # else: 517 | # msg = "Unknown value '{value}' for argument {arg}. Valid values are {{{valid_values}}}." 518 | # msg = msg.format(value=value, arg=arg, valid_values=iterable_to_str(valid_values)) 519 | # raise ValueError(msg) 520 | 521 | # return value 522 | -------------------------------------------------------------------------------- /notebooks/scratch/ims_download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\n", 10 | "from pyphm.datasets.ims import ImsPrepMethodA\n", 11 | "from pathlib import Path\n", 12 | "import pandas as pd\n", 13 | "import os\n", 14 | "import numpy as np\n", 15 | "import time\n", 16 | "import datetime\n", 17 | "import csv\n", 18 | "\n", 19 | "\n", 20 | "%load_ext autoreload\n", 21 | "%autoreload 2" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "/home/tim/Documents/PyPHM/data/raw\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "# define the location of where the raw data folders will be kept.\n", 39 | "# e.g. 
the ims data will be in path_data_raw_folder/ims/ \n", 40 | "path_data_raw_folder = Path(Path.cwd().parent.parent / 'data/raw/' )\n", 41 | "print(path_data_raw_folder)\n", 42 | "\n", 43 | "# create the path_data_raw_folder if it does not exist\n", 44 | "path_data_raw_folder.mkdir(parents=True, exist_ok=True)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "Downloading https://drive.google.com/file/d/1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx/view?usp=sharingIMS.7z\n", 57 | "Goolgle drive file id: 1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx\n" 58 | ] 59 | }, 60 | { 61 | "name": "stderr", 62 | "output_type": "stream", 63 | "text": [ 64 | "Downloading...\n", 65 | "From: https://drive.google.com/uc?id=1iJqTYQpHst_uYSyU5d2THsZkA8Vk6Inx\n", 66 | "To: /home/tim/Documents/PyPHM/data/raw/ims/IMS.7z\n", 67 | " 49%|████▉ | 532M/1.08G [00:12<00:19, 28.4MB/s] " 68 | ] 69 | }, 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | "\n" 75 | ] 76 | }, 77 | { 78 | "ename": "KeyboardInterrupt", 79 | "evalue": "", 80 | "output_type": "error", 81 | "traceback": [ 82 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 83 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 84 | "\u001b[0;32m/tmp/ipykernel_93187/765225230.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# instantiate the ImsPrepMethodA class and download data if it does not exist\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mims\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mImsPrepMethodA\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpath_data_raw_folder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 85 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/ims.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, root, dataset_folder_name, download)\u001b[0m\n\u001b[1;32m 326\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m ) -> None:\n\u001b[0;32m--> 328\u001b[0;31m super().__init__(\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[0mdataset_folder_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 86 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/ims.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, root, dataset_folder_name, download, dataset_path, data, sample_freq)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_exists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 87 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/ims.py\u001b[0m in \u001b[0;36mdownload\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Downloading {url}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 106\u001b[0;31m download_and_extract_archive(\n\u001b[0m\u001b[1;32m 107\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_root\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdataset_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmd5\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmd5\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 108\u001b[0m )\n", 88 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/utils.py\u001b[0m in \u001b[0;36mdownload_and_extract_archive\u001b[0;34m(url, download_root, extract_root, filename, md5, remove_finished)\u001b[0m\n\u001b[1;32m 480\u001b[0m \u001b[0mfilename\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbasename\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 481\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 482\u001b[0;31m \u001b[0mdownload_url\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_root\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmd5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 483\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 484\u001b[0m \u001b[0marchive\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdownload_root\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 89 | "\u001b[0;32m~/Documents/PyPHM/src/pyphm/datasets/utils.py\u001b[0m in \u001b[0;36mdownload_url\u001b[0;34m(url, root, filename, md5, max_redirect_hops)\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfile_id\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 174\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Goolgle drive file id:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_id\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 175\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mgdown\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfile_id\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0moutput\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquiet\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 176\u001b[0m \u001b[0;31m# return download_file_from_google_drive(file_id, root, filename, md5)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 90 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/gdown/download.py\u001b[0m in \u001b[0;36mdownload\u001b[0;34m(url, output, quiet, proxy, speed, use_cookies, verify, id, fuzzy, resume)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[0mpbar\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtotal\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0munit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"B\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0munit_scale\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0mt_start\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 257\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miter_content\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mCHUNK_SIZE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 258\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 259\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mquiet\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 91 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/requests/models.py\u001b[0m in \u001b[0;36mgenerate\u001b[0;34m()\u001b[0m\n\u001b[1;32m 756\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'stream'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 757\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 758\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 759\u001b[0m \u001b[0;32myield\u001b[0m 
\u001b[0mchunk\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 760\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mProtocolError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 92 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/urllib3/response.py\u001b[0m in \u001b[0;36mstream\u001b[0;34m(self, amt, decode_content)\u001b[0m\n\u001b[1;32m 574\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 575\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_fp_closed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 576\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdecode_content\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdecode_content\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 577\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 93 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/site-packages/urllib3/response.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, amt, decode_content, cache_content)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 518\u001b[0m \u001b[0mcache_content\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 519\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mfp_closed\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34mb\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 520\u001b[0m if (\n\u001b[1;32m 521\u001b[0m \u001b[0mamt\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 94 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 457\u001b[0m \u001b[0;31m# Amount is given, implement using readinto\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 458\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbytearray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 459\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 460\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mmemoryview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtobytes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 461\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 95 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 501\u001b[0m \u001b[0;31m# connection, and the user is reading more bytes than will be provided\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 502\u001b[0m \u001b[0;31m# (for example, reading in 1k chunks)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 503\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 504\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mn\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[0;31m# Ideally, we would raise IncompleteRead if the content-length\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 96 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 667\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 669\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 670\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 671\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 97 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/ssl.py\u001b[0m in \u001b[0;36mrecv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1239\u001b[0m \u001b[0;34m\"non-zero flags not allowed in calls to recv_into() on %s\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1240\u001b[0m self.__class__)\n\u001b[0;32m-> 1241\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1242\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
1243\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 98 | "\u001b[0;32m~/miniconda3/envs/featstore/lib/python3.8/ssl.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1097\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1098\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbuffer\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1099\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1100\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1101\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 99 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 100 | ] 101 | }, 102 | { 103 | "name": "stderr", 104 | "output_type": "stream", 105 | "text": [ 106 | " 49%|████▉ | 532M/1.08G [00:29<00:19, 28.4MB/s]" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "# instantiate the ImsPrepMethodA class and download data if it does not exist\n", 112 | "ims = ImsPrepMethodA(root=path_data_raw_folder, download=True)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 3.8.12 ('featstore')", 126 | "language": "python", 127 | "name": "python3" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 3 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython3", 139 | "version": "3.8.12" 140 | }, 141 | "orig_nbformat": 4, 142 | "vscode": { 143 | "interpreter": { 144 | "hash": "daff1afd4d675d5e247c0a95a5de0c03bd87d8f7edee7cb37c539016070f1c16" 145 | } 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 2 150 | } 151 | --------------------------------------------------------------------------------
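A short sketch of how the helpers in src/pyphm/datasets/utils.py compose, following the pattern used by the dataset classes above. The URL, folder, filename, and MD5 checksum below are placeholders, not real values:

    from pathlib import Path
    from pyphm.datasets.utils import download_and_extract_archive, check_integrity

    url = "https://example.com/archives/sample_data.zip"  # placeholder URL
    root = Path("data/raw/sample")                        # placeholder download folder
    md5 = "0123456789abcdef0123456789abcdef"              # placeholder checksum

    # download the archive (skipped if a verified copy already exists), check its MD5,
    # and extract it into `root`; the archive is kept since remove_finished defaults to False
    download_and_extract_archive(url, download_root=str(root), filename="sample_data.zip", md5=md5)

    # the retained archive can be re-verified against the same checksum
    assert check_integrity(root / "sample_data.zip", md5)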